diff options
Diffstat (limited to 'fs')
697 files changed, 16426 insertions, 6946 deletions
diff --git a/fs/9p/Makefile b/fs/9p/Makefile index 9619ccadd2fc..e7800a5c7395 100644 --- a/fs/9p/Makefile +++ b/fs/9p/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_9P_FS) := 9p.o 9p-objs := \ diff --git a/fs/Makefile b/fs/Makefile index 7bbaca9c67b1..ef772f1eaff8 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the Linux filesystems. # diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index fadf408bdd46..c76db75f02aa 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/fs.h> #include <linux/adfs_fs.h> diff --git a/fs/adfs/file.c b/fs/adfs/file.c index 46c0d5671cd5..754afb14a6ff 100644 --- a/fs/adfs/file.c +++ b/fs/adfs/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/adfs/file.c * diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 773749be8290..a92eb6ae2ae2 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifdef pr_fmt #undef pr_fmt #endif diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 8cf941c3b511..185d5ab7e986 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/affs/amigaffs.c * diff --git a/fs/affs/amigaffs.h b/fs/affs/amigaffs.h index 43b41c06aa37..f9bef9056659 100644 --- a/fs/affs/amigaffs.h +++ b/fs/affs/amigaffs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef AMIGAFFS_H #define AMIGAFFS_H diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c index 2b2112475ec2..2b1399611d9e 100644 --- a/fs/affs/bitmap.c +++ b/fs/affs/bitmap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/affs/bitmap.c * diff --git a/fs/affs/dir.c b/fs/affs/dir.c index 591ecd7f3063..a105e77df2c1 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/affs/dir.c * diff --git a/fs/affs/file.c b/fs/affs/file.c index 00331810f690..a85817f54483 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/affs/file.c * diff --git a/fs/affs/inode.c b/fs/affs/inode.c index fd4ef3c40e40..73598bff8506 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/affs/inode.c * diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 46d3ace6761d..d8aa0ae3d037 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/affs/namei.c * diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index ae622cdce142..a7531b26e8f0 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/affs/symlink.c * diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 095c54165dfd..641148208e90 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for Red Hat Linux AFS client. # diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c index 40b2bab3e401..50bd5bb1c4fb 100644 --- a/fs/afs/netdevices.c +++ b/fs/afs/netdevices.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* AFS network device helpers * * Copyright (c) 2007 Patrick McHardy <kaber@trash.net> @@ -576,7 +576,7 @@ static int kiocb_cancel(struct aio_kiocb *kiocb) * actually has a cancel function, hence the cmpxchg() */ - cancel = ACCESS_ONCE(kiocb->ki_cancel); + cancel = READ_ONCE(kiocb->ki_cancel); do { if (!cancel || cancel == KIOCB_CANCELLED) return -EINVAL; diff --git a/fs/attr.c b/fs/attr.c index 135304146120..12ffdb6fb63c 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/attr.c * diff --git a/fs/bad_inode.c b/fs/bad_inode.c index bb53728c7a31..213b51dbbb60 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/bad_inode.c * diff --git a/fs/befs/befs.h b/fs/befs/befs.h index b914cfb03820..7cd47245694d 100644 --- a/fs/befs/befs.h +++ b/fs/befs/befs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * befs.h * diff --git a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h index 69c9d8cde955..8019fde814b7 100644 --- a/fs/befs/befs_fs_types.h +++ b/fs/befs/befs_fs_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/befs/befs_fs_types.h * diff --git a/fs/befs/btree.h b/fs/befs/btree.h index 60c6c728e64e..a253a6276d8e 100644 --- a/fs/befs/btree.h +++ b/fs/befs/btree.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * btree.h * diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c index 720b3bc5c16a..97719a7c7e40 100644 --- a/fs/befs/datastream.c +++ b/fs/befs/datastream.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/befs/datastream.c * diff --git a/fs/befs/datastream.h b/fs/befs/datastream.h index 7ff9ff09ec6e..39b1d4766ccf 100644 --- a/fs/befs/datastream.h +++ b/fs/befs/datastream.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * datastream.h * diff --git a/fs/befs/debug.c b/fs/befs/debug.c index 36656c86f50e..eb7bd6c692c7 100644 --- a/fs/befs/debug.c +++ b/fs/befs/debug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/befs/debug.c * diff --git a/fs/befs/endian.h b/fs/befs/endian.h index 27223878ba9f..bb55a54c24c0 100644 --- a/fs/befs/endian.h +++ b/fs/befs/endian.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/befs/endian.h * diff --git a/fs/befs/inode.c b/fs/befs/inode.c index 5367a6470a69..791b46a6f2f9 100644 --- a/fs/befs/inode.c +++ b/fs/befs/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * inode.c * diff --git a/fs/befs/io.c b/fs/befs/io.c index 227cb86e07fe..2caf50a4abbe 100644 --- a/fs/befs/io.c +++ b/fs/befs/io.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/befs/io.c * diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h index f40006db36df..67aef3bb89e4 100644 --- a/fs/bfs/bfs.h +++ b/fs/bfs/bfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/bfs/bfs.h * Copyright (C) 1999 Tigran Aivazian <tigran@veritas.com> diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 3e5ac30e8b6f..ee832ca5f734 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/bfs/dir.c * BFS directory operations. diff --git a/fs/bfs/file.c b/fs/bfs/file.c index 97f1b5160155..1476cdd90cfb 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/bfs/file.c * BFS file operations. diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 475d083f8088..5d6b94475f27 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /****************************************************************************/ /* * linux/fs/binfmt_flat.c diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 2a46762def31..a7c5a9861bef 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -596,7 +596,7 @@ static void bm_evict_inode(struct inode *inode) { Node *e = inode->i_private; - if (e->flags & MISC_FMT_OPEN_FILE) + if (e && e->flags & MISC_FMT_OPEN_FILE) filp_close(e->interp_file, NULL); clear_inode(inode); diff --git a/fs/block_dev.c b/fs/block_dev.c index 93d088ffc05c..789f55e851ae 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -716,10 +716,12 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, set_page_writeback(page); result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true); - if (result) + if (result) { end_page_writeback(page); - else + } else { + clean_page_buffers(page); unlock_page(page); + } blk_queue_exit(bdev->bd_queue); return result; } diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index a26c63b4ad68..2e558227931a 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -91,3 +91,14 @@ config BTRFS_ASSERT any of the assertions trip. This is meant for btrfs developers only. If unsure, say N. + +config BTRFS_FS_REF_VERIFY + bool "Btrfs with the ref verify tool compiled in" + depends on BTRFS_FS + default n + help + Enable run-time extent reference verification instrumentation. This + is meant to be used by btrfs developers for tracking down extent + reference problems or verifying they didn't break something. + + If unsure, say N. diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 962a95aefb81..6fe881d5cb38 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_BTRFS_FS) := btrfs.o @@ -9,10 +10,11 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o hash.o free-space-tree.o + uuid-tree.o props.o hash.o free-space-tree.o tree-checker.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o +btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index e00c8a9fd5bb..d5540749f0e5 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -67,7 +67,7 @@ struct btrfs_workqueue { static void normal_work_helper(struct btrfs_work *work); #define BTRFS_WORK_HELPER(name) \ -void btrfs_##name(struct work_struct *arg) \ +noinline_for_stack void btrfs_##name(struct work_struct *arg) \ { \ struct btrfs_work *work = container_of(arg, struct btrfs_work, \ normal_work); \ diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b517ef1477ea..7d0dc100a09a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -40,12 +40,14 @@ static int check_extent_in_eb(const struct btrfs_key *key, const struct extent_buffer *eb, const struct btrfs_file_extent_item *fi, u64 extent_item_pos, - struct extent_inode_elem **eie) + struct extent_inode_elem **eie, + bool ignore_offset) { u64 offset = 0; struct extent_inode_elem *e; - if (!btrfs_file_extent_compression(eb, fi) && + if (!ignore_offset && + !btrfs_file_extent_compression(eb, fi) && !btrfs_file_extent_encryption(eb, fi) && !btrfs_file_extent_other_encoding(eb, fi)) { u64 data_offset; @@ -84,7 +86,8 @@ static void free_inode_elem_list(struct extent_inode_elem *eie) static int find_extent_in_eb(const struct extent_buffer *eb, u64 wanted_disk_byte, u64 extent_item_pos, - struct extent_inode_elem **eie) + struct extent_inode_elem **eie, + bool ignore_offset) { u64 disk_byte; struct btrfs_key key; @@ -113,7 +116,7 @@ static int find_extent_in_eb(const struct extent_buffer *eb, if (disk_byte != wanted_disk_byte) continue; - ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); + ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie, ignore_offset); if (ret < 0) return ret; } @@ -419,7 +422,7 @@ static int add_indirect_ref(const struct btrfs_fs_info *fs_info, static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, struct prelim_ref *ref, int level, u64 time_seq, const u64 *extent_item_pos, - u64 total_refs) + u64 total_refs, bool ignore_offset) { int ret = 0; int slot; @@ -472,7 +475,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, *extent_item_pos, - &eie); + &eie, ignore_offset); if (ret < 0) break; } @@ -510,7 +513,8 @@ next: static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct prelim_ref *ref, struct ulist *parents, - const u64 *extent_item_pos, u64 total_refs) + const u64 *extent_item_pos, u64 total_refs, + bool ignore_offset) { struct btrfs_root *root; struct btrfs_key root_key; @@ -581,7 +585,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, } ret = add_all_parents(root, path, parents, ref, level, time_seq, - extent_item_pos, total_refs); + extent_item_pos, total_refs, ignore_offset); out: path->lowest_level = 0; btrfs_release_path(path); @@ -616,7 +620,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct preftrees *preftrees, const u64 *extent_item_pos, u64 total_refs, - struct share_check *sc) + struct share_check *sc, bool ignore_offset) { int err; int ret = 0; @@ -661,7 +665,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, } err = resolve_indirect_ref(fs_info, path, time_seq, ref, parents, extent_item_pos, - total_refs); + total_refs, ignore_offset); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. @@ -769,6 +773,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, struct btrfs_key key; struct btrfs_key tmp_op_key; struct btrfs_key *op_key = NULL; + struct rb_node *n; int count; int ret = 0; @@ -778,7 +783,9 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, } spin_lock(&head->lock); - list_for_each_entry(node, &head->ref_list, list) { + for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) { + node = rb_entry(n, struct btrfs_delayed_ref_node, + ref_node); if (node->seq > seq) continue; @@ -1107,13 +1114,17 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info, * * Otherwise this returns 0 for success and <0 for an error. * + * If ignore_offset is set to false, only extent refs whose offsets match + * extent_item_pos are returned. If true, every extent ref is returned + * and extent_item_pos is ignored. + * * FIXME some caching might speed things up */ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist *refs, struct ulist *roots, const u64 *extent_item_pos, - struct share_check *sc) + struct share_check *sc, bool ignore_offset) { struct btrfs_key key; struct btrfs_path *path; @@ -1178,7 +1189,7 @@ again: head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -1189,7 +1200,7 @@ again: */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); goto again; } spin_unlock(&delayed_refs->lock); @@ -1235,7 +1246,7 @@ again: WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root)); ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, - extent_item_pos, total_refs, sc); + extent_item_pos, total_refs, sc, ignore_offset); if (ret) goto out; @@ -1282,7 +1293,7 @@ again: btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); ret = find_extent_in_eb(eb, bytenr, - *extent_item_pos, &eie); + *extent_item_pos, &eie, ignore_offset); btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); if (ret < 0) @@ -1350,7 +1361,7 @@ static void free_leaf_list(struct ulist *blocks) static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **leafs, - const u64 *extent_item_pos) + const u64 *extent_item_pos, bool ignore_offset) { int ret; @@ -1359,7 +1370,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, return -ENOMEM; ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - *leafs, NULL, extent_item_pos, NULL); + *leafs, NULL, extent_item_pos, NULL, ignore_offset); if (ret < 0 && ret != -ENOENT) { free_leaf_list(*leafs); return ret; @@ -1383,7 +1394,8 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, */ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots) + u64 time_seq, struct ulist **roots, + bool ignore_offset) { struct ulist *tmp; struct ulist_node *node = NULL; @@ -1402,7 +1414,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, ULIST_ITER_INIT(&uiter); while (1) { ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - tmp, *roots, NULL, NULL); + tmp, *roots, NULL, NULL, ignore_offset); if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); ulist_free(*roots); @@ -1421,14 +1433,15 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots) + u64 time_seq, struct ulist **roots, + bool ignore_offset) { int ret; if (!trans) down_read(&fs_info->commit_root_sem); ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, - time_seq, roots); + time_seq, roots, ignore_offset); if (!trans) up_read(&fs_info->commit_root_sem); return ret; @@ -1483,7 +1496,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) ULIST_ITER_INIT(&uiter); while (1) { ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, - roots, NULL, &shared); + roots, NULL, &shared, false); if (ret == BACKREF_FOUND_SHARED) { /* this is the only condition under which we return 1 */ ret = 1; @@ -1877,7 +1890,8 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, int iterate_extent_inodes(struct btrfs_fs_info *fs_info, u64 extent_item_objectid, u64 extent_item_pos, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx) + iterate_extent_inodes_t *iterate, void *ctx, + bool ignore_offset) { int ret; struct btrfs_trans_handle *trans = NULL; @@ -1903,14 +1917,15 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, tree_mod_seq_elem.seq, &refs, - &extent_item_pos); + &extent_item_pos, ignore_offset); if (ret) goto out; ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val, - tree_mod_seq_elem.seq, &roots); + tree_mod_seq_elem.seq, &roots, + ignore_offset); if (ret) break; ULIST_ITER_INIT(&root_uiter); @@ -1943,7 +1958,8 @@ out: int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, - iterate_extent_inodes_t *iterate, void *ctx) + iterate_extent_inodes_t *iterate, void *ctx, + bool ignore_offset) { int ret; u64 extent_item_pos; @@ -1961,7 +1977,7 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, search_commit_root, - iterate, ctx); + iterate, ctx, ignore_offset); return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index e410335841aa..0c2fab8514ff 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -43,17 +43,19 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, int iterate_extent_inodes(struct btrfs_fs_info *fs_info, u64 extent_item_objectid, u64 extent_offset, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx); + iterate_extent_inodes_t *iterate, void *ctx, + bool ignore_offset); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, - iterate_extent_inodes_t *iterate, void *ctx); + iterate_extent_inodes_t *iterate, void *ctx, + bool ignore_offset); int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots); + u64 time_seq, struct ulist **roots, bool ignore_offset); char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, struct extent_buffer *eb_in, u64 parent, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index eccadb5f62a5..63f0ccc92a71 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -36,14 +36,13 @@ #define BTRFS_INODE_ORPHAN_META_RESERVED 1 #define BTRFS_INODE_DUMMY 2 #define BTRFS_INODE_IN_DEFRAG 3 -#define BTRFS_INODE_DELALLOC_META_RESERVED 4 -#define BTRFS_INODE_HAS_ORPHAN_ITEM 5 -#define BTRFS_INODE_HAS_ASYNC_EXTENT 6 -#define BTRFS_INODE_NEEDS_FULL_SYNC 7 -#define BTRFS_INODE_COPY_EVERYTHING 8 -#define BTRFS_INODE_IN_DELALLOC_LIST 9 -#define BTRFS_INODE_READDIO_NEED_LOCK 10 -#define BTRFS_INODE_HAS_PROPS 11 +#define BTRFS_INODE_HAS_ORPHAN_ITEM 4 +#define BTRFS_INODE_HAS_ASYNC_EXTENT 5 +#define BTRFS_INODE_NEEDS_FULL_SYNC 6 +#define BTRFS_INODE_COPY_EVERYTHING 7 +#define BTRFS_INODE_IN_DELALLOC_LIST 8 +#define BTRFS_INODE_READDIO_NEED_LOCK 9 +#define BTRFS_INODE_HAS_PROPS 10 /* in memory btrfs inode */ struct btrfs_inode { @@ -176,7 +175,8 @@ struct btrfs_inode { * of extent items we've reserved metadata for. */ unsigned outstanding_extents; - unsigned reserved_extents; + + struct btrfs_block_rsv block_rsv; /* * Cached values of inode properties @@ -267,6 +267,17 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) return false; } +static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, + int mod) +{ + lockdep_assert_held(&inode->lock); + inode->outstanding_extents += mod; + if (btrfs_is_free_space_inode(inode)) + return; + trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode), + mod); +} + static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) { int ret = 0; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 7d5a9b51f0d7..7d51b5a5b505 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -613,7 +613,7 @@ static void btrfsic_dev_state_hashtable_add( struct btrfsic_dev_state_hashtable *h) { const unsigned int hashval = - (((unsigned int)((uintptr_t)ds->bdev)) & + (((unsigned int)((uintptr_t)ds->bdev->bd_dev)) & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); list_add(&ds->collision_resolving_node, h->table + hashval); @@ -2803,7 +2803,7 @@ static void __btrfsic_submit_bio(struct bio *bio) mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bio() is also called before * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bio_dev(bio)); + dev_state = btrfsic_dev_state_lookup(bio_dev(bio) + bio->bi_partno); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { unsigned int i = 0; @@ -2913,7 +2913,7 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, state = kvzalloc(sizeof(*state), GFP_KERNEL); if (!state) { pr_info("btrfs check-integrity: allocation failed!\n"); - return -1; + return -ENOMEM; } if (!btrfsic_is_initialized) { @@ -2945,7 +2945,7 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, if (NULL == ds) { pr_info("btrfs check-integrity: kmalloc() failed!\n"); mutex_unlock(&btrfsic_mutex); - return -1; + return -ENOMEM; } ds->bdev = device->bdev; ds->state = state; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 280384bf34f1..b35ce16b3df3 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -33,6 +33,8 @@ #include <linux/bit_spinlock.h> #include <linux/slab.h> #include <linux/sched/mm.h> +#include <linux/sort.h> +#include <linux/log2.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -255,7 +257,8 @@ static void end_compressed_bio_write(struct bio *bio) cb->start, cb->start + cb->len - 1, NULL, - bio->bi_status ? 0 : 1); + bio->bi_status ? + BLK_STS_OK : BLK_STS_NOTSUPP); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); @@ -706,7 +709,86 @@ out: return ret; } -static struct { +/* + * Heuristic uses systematic sampling to collect data from the input data + * range, the logic can be tuned by the following constants: + * + * @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample + * @SAMPLING_INTERVAL - range from which the sampled data can be collected + */ +#define SAMPLING_READ_SIZE (16) +#define SAMPLING_INTERVAL (256) + +/* + * For statistical analysis of the input data we consider bytes that form a + * Galois Field of 256 objects. Each object has an attribute count, ie. how + * many times the object appeared in the sample. + */ +#define BUCKET_SIZE (256) + +/* + * The size of the sample is based on a statistical sampling rule of thumb. + * The common way is to perform sampling tests as long as the number of + * elements in each cell is at least 5. + * + * Instead of 5, we choose 32 to obtain more accurate results. + * If the data contain the maximum number of symbols, which is 256, we obtain a + * sample size bound by 8192. + * + * For a sample of at most 8KB of data per data range: 16 consecutive bytes + * from up to 512 locations. + */ +#define MAX_SAMPLE_SIZE (BTRFS_MAX_UNCOMPRESSED * \ + SAMPLING_READ_SIZE / SAMPLING_INTERVAL) + +struct bucket_item { + u32 count; +}; + +struct heuristic_ws { + /* Partial copy of input data */ + u8 *sample; + u32 sample_size; + /* Buckets store counters for each byte value */ + struct bucket_item *bucket; + struct list_head list; +}; + +static void free_heuristic_ws(struct list_head *ws) +{ + struct heuristic_ws *workspace; + + workspace = list_entry(ws, struct heuristic_ws, list); + + kvfree(workspace->sample); + kfree(workspace->bucket); + kfree(workspace); +} + +static struct list_head *alloc_heuristic_ws(void) +{ + struct heuristic_ws *ws; + + ws = kzalloc(sizeof(*ws), GFP_KERNEL); + if (!ws) + return ERR_PTR(-ENOMEM); + + ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL); + if (!ws->sample) + goto fail; + + ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL); + if (!ws->bucket) + goto fail; + + INIT_LIST_HEAD(&ws->list); + return &ws->list; +fail: + free_heuristic_ws(&ws->list); + return ERR_PTR(-ENOMEM); +} + +struct workspaces_list { struct list_head idle_ws; spinlock_t ws_lock; /* Number of free workspaces */ @@ -715,7 +797,11 @@ static struct { atomic_t total_ws; /* Waiters for a free workspace */ wait_queue_head_t ws_wait; -} btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; +}; + +static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; + +static struct workspaces_list btrfs_heuristic_ws; static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zlib_compress, @@ -725,11 +811,25 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { void __init btrfs_init_compress(void) { + struct list_head *workspace; int i; - for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - struct list_head *workspace; + INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws); + spin_lock_init(&btrfs_heuristic_ws.ws_lock); + atomic_set(&btrfs_heuristic_ws.total_ws, 0); + init_waitqueue_head(&btrfs_heuristic_ws.ws_wait); + + workspace = alloc_heuristic_ws(); + if (IS_ERR(workspace)) { + pr_warn( + "BTRFS: cannot preallocate heuristic workspace, will try later\n"); + } else { + atomic_set(&btrfs_heuristic_ws.total_ws, 1); + btrfs_heuristic_ws.free_ws = 1; + list_add(workspace, &btrfs_heuristic_ws.idle_ws); + } + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws); spin_lock_init(&btrfs_comp_ws[i].ws_lock); atomic_set(&btrfs_comp_ws[i].total_ws, 0); @@ -756,18 +856,32 @@ void __init btrfs_init_compress(void) * Preallocation makes a forward progress guarantees and we do not return * errors. */ -static struct list_head *find_workspace(int type) +static struct list_head *__find_workspace(int type, bool heuristic) { struct list_head *workspace; int cpus = num_online_cpus(); int idx = type - 1; unsigned nofs_flag; + struct list_head *idle_ws; + spinlock_t *ws_lock; + atomic_t *total_ws; + wait_queue_head_t *ws_wait; + int *free_ws; + + if (heuristic) { + idle_ws = &btrfs_heuristic_ws.idle_ws; + ws_lock = &btrfs_heuristic_ws.ws_lock; + total_ws = &btrfs_heuristic_ws.total_ws; + ws_wait = &btrfs_heuristic_ws.ws_wait; + free_ws = &btrfs_heuristic_ws.free_ws; + } else { + idle_ws = &btrfs_comp_ws[idx].idle_ws; + ws_lock = &btrfs_comp_ws[idx].ws_lock; + total_ws = &btrfs_comp_ws[idx].total_ws; + ws_wait = &btrfs_comp_ws[idx].ws_wait; + free_ws = &btrfs_comp_ws[idx].free_ws; + } - struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; - spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; - atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws; - wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; - int *free_ws = &btrfs_comp_ws[idx].free_ws; again: spin_lock(ws_lock); if (!list_empty(idle_ws)) { @@ -797,7 +911,10 @@ again: * context of btrfs_compress_bio/btrfs_compress_pages */ nofs_flag = memalloc_nofs_save(); - workspace = btrfs_compress_op[idx]->alloc_workspace(); + if (heuristic) + workspace = alloc_heuristic_ws(); + else + workspace = btrfs_compress_op[idx]->alloc_workspace(); memalloc_nofs_restore(nofs_flag); if (IS_ERR(workspace)) { @@ -828,18 +945,38 @@ again: return workspace; } +static struct list_head *find_workspace(int type) +{ + return __find_workspace(type, false); +} + /* * put a workspace struct back on the list or free it if we have enough * idle ones sitting around */ -static void free_workspace(int type, struct list_head *workspace) +static void __free_workspace(int type, struct list_head *workspace, + bool heuristic) { int idx = type - 1; - struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; - spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; - atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws; - wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; - int *free_ws = &btrfs_comp_ws[idx].free_ws; + struct list_head *idle_ws; + spinlock_t *ws_lock; + atomic_t *total_ws; + wait_queue_head_t *ws_wait; + int *free_ws; + + if (heuristic) { + idle_ws = &btrfs_heuristic_ws.idle_ws; + ws_lock = &btrfs_heuristic_ws.ws_lock; + total_ws = &btrfs_heuristic_ws.total_ws; + ws_wait = &btrfs_heuristic_ws.ws_wait; + free_ws = &btrfs_heuristic_ws.free_ws; + } else { + idle_ws = &btrfs_comp_ws[idx].idle_ws; + ws_lock = &btrfs_comp_ws[idx].ws_lock; + total_ws = &btrfs_comp_ws[idx].total_ws; + ws_wait = &btrfs_comp_ws[idx].ws_wait; + free_ws = &btrfs_comp_ws[idx].free_ws; + } spin_lock(ws_lock); if (*free_ws <= num_online_cpus()) { @@ -850,7 +987,10 @@ static void free_workspace(int type, struct list_head *workspace) } spin_unlock(ws_lock); - btrfs_compress_op[idx]->free_workspace(workspace); + if (heuristic) + free_heuristic_ws(workspace); + else + btrfs_compress_op[idx]->free_workspace(workspace); atomic_dec(total_ws); wake: /* @@ -861,6 +1001,11 @@ wake: wake_up(ws_wait); } +static void free_workspace(int type, struct list_head *ws) +{ + return __free_workspace(type, ws, false); +} + /* * cleanup function for module exit */ @@ -869,6 +1014,13 @@ static void free_workspaces(void) struct list_head *workspace; int i; + while (!list_empty(&btrfs_heuristic_ws.idle_ws)) { + workspace = btrfs_heuristic_ws.idle_ws.next; + list_del(workspace); + free_heuristic_ws(workspace); + atomic_dec(&btrfs_heuristic_ws.total_ws); + } + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { while (!list_empty(&btrfs_comp_ws[i].idle_ws)) { workspace = btrfs_comp_ws[i].idle_ws.next; @@ -883,6 +1035,11 @@ static void free_workspaces(void) * Given an address space and start and length, compress the bytes into @pages * that are allocated on demand. * + * @type_level is encoded algorithm and level, where level 0 means whatever + * default the algorithm chooses and is opaque here; + * - compression algo are 0-3 + * - the level are bits 4-7 + * * @out_pages is an in/out parameter, holds maximum number of pages to allocate * and returns number of actually allocated pages * @@ -897,7 +1054,7 @@ static void free_workspaces(void) * @max_out tells us the max number of bytes that we're allowed to * stuff into pages */ -int btrfs_compress_pages(int type, struct address_space *mapping, +int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, @@ -905,9 +1062,11 @@ int btrfs_compress_pages(int type, struct address_space *mapping, { struct list_head *workspace; int ret; + int type = type_level & 0xF; workspace = find_workspace(type); + btrfs_compress_op[type - 1]->set_level(workspace, type_level); ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, start, pages, out_pages, @@ -1066,6 +1225,211 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, } /* + * Shannon Entropy calculation + * + * Pure byte distribution analysis fails to determine compressiability of data. + * Try calculating entropy to estimate the average minimum number of bits + * needed to encode the sampled data. + * + * For convenience, return the percentage of needed bits, instead of amount of + * bits directly. + * + * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy + * and can be compressible with high probability + * + * @ENTROPY_LVL_HIGH - data are not compressible with high probability + * + * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate. + */ +#define ENTROPY_LVL_ACEPTABLE (65) +#define ENTROPY_LVL_HIGH (80) + +/* + * For increasead precision in shannon_entropy calculation, + * let's do pow(n, M) to save more digits after comma: + * + * - maximum int bit length is 64 + * - ilog2(MAX_SAMPLE_SIZE) -> 13 + * - 13 * 4 = 52 < 64 -> M = 4 + * + * So use pow(n, 4). + */ +static inline u32 ilog2_w(u64 n) +{ + return ilog2(n * n * n * n); +} + +static u32 shannon_entropy(struct heuristic_ws *ws) +{ + const u32 entropy_max = 8 * ilog2_w(2); + u32 entropy_sum = 0; + u32 p, p_base, sz_base; + u32 i; + + sz_base = ilog2_w(ws->sample_size); + for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) { + p = ws->bucket[i].count; + p_base = ilog2_w(p); + entropy_sum += p * (sz_base - p_base); + } + + entropy_sum /= ws->sample_size; + return entropy_sum * 100 / entropy_max; +} + +/* Compare buckets by size, ascending */ +static int bucket_comp_rev(const void *lv, const void *rv) +{ + const struct bucket_item *l = (const struct bucket_item *)lv; + const struct bucket_item *r = (const struct bucket_item *)rv; + + return r->count - l->count; +} + +/* + * Size of the core byte set - how many bytes cover 90% of the sample + * + * There are several types of structured binary data that use nearly all byte + * values. The distribution can be uniform and counts in all buckets will be + * nearly the same (eg. encrypted data). Unlikely to be compressible. + * + * Other possibility is normal (Gaussian) distribution, where the data could + * be potentially compressible, but we have to take a few more steps to decide + * how much. + * + * @BYTE_CORE_SET_LOW - main part of byte values repeated frequently, + * compression algo can easy fix that + * @BYTE_CORE_SET_HIGH - data have uniform distribution and with high + * probability is not compressible + */ +#define BYTE_CORE_SET_LOW (64) +#define BYTE_CORE_SET_HIGH (200) + +static int byte_core_set_size(struct heuristic_ws *ws) +{ + u32 i; + u32 coreset_sum = 0; + const u32 core_set_threshold = ws->sample_size * 90 / 100; + struct bucket_item *bucket = ws->bucket; + + /* Sort in reverse order */ + sort(bucket, BUCKET_SIZE, sizeof(*bucket), &bucket_comp_rev, NULL); + + for (i = 0; i < BYTE_CORE_SET_LOW; i++) + coreset_sum += bucket[i].count; + + if (coreset_sum > core_set_threshold) + return i; + + for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) { + coreset_sum += bucket[i].count; + if (coreset_sum > core_set_threshold) + break; + } + + return i; +} + +/* + * Count byte values in buckets. + * This heuristic can detect textual data (configs, xml, json, html, etc). + * Because in most text-like data byte set is restricted to limited number of + * possible characters, and that restriction in most cases makes data easy to + * compress. + * + * @BYTE_SET_THRESHOLD - consider all data within this byte set size: + * less - compressible + * more - need additional analysis + */ +#define BYTE_SET_THRESHOLD (64) + +static u32 byte_set_size(const struct heuristic_ws *ws) +{ + u32 i; + u32 byte_set_size = 0; + + for (i = 0; i < BYTE_SET_THRESHOLD; i++) { + if (ws->bucket[i].count > 0) + byte_set_size++; + } + + /* + * Continue collecting count of byte values in buckets. If the byte + * set size is bigger then the threshold, it's pointless to continue, + * the detection technique would fail for this type of data. + */ + for (; i < BUCKET_SIZE; i++) { + if (ws->bucket[i].count > 0) { + byte_set_size++; + if (byte_set_size > BYTE_SET_THRESHOLD) + return byte_set_size; + } + } + + return byte_set_size; +} + +static bool sample_repeated_patterns(struct heuristic_ws *ws) +{ + const u32 half_of_sample = ws->sample_size / 2; + const u8 *data = ws->sample; + + return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0; +} + +static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, + struct heuristic_ws *ws) +{ + struct page *page; + u64 index, index_end; + u32 i, curr_sample_pos; + u8 *in_data; + + /* + * Compression handles the input data by chunks of 128KiB + * (defined by BTRFS_MAX_UNCOMPRESSED) + * + * We do the same for the heuristic and loop over the whole range. + * + * MAX_SAMPLE_SIZE - calculated under assumption that heuristic will + * process no more than BTRFS_MAX_UNCOMPRESSED at a time. + */ + if (end - start > BTRFS_MAX_UNCOMPRESSED) + end = start + BTRFS_MAX_UNCOMPRESSED; + + index = start >> PAGE_SHIFT; + index_end = end >> PAGE_SHIFT; + + /* Don't miss unaligned end */ + if (!IS_ALIGNED(end, PAGE_SIZE)) + index_end++; + + curr_sample_pos = 0; + while (index < index_end) { + page = find_get_page(inode->i_mapping, index); + in_data = kmap(page); + /* Handle case where the start is not aligned to PAGE_SIZE */ + i = start % PAGE_SIZE; + while (i < PAGE_SIZE - SAMPLING_READ_SIZE) { + /* Don't sample any garbage from the last page */ + if (start > end - SAMPLING_READ_SIZE) + break; + memcpy(&ws->sample[curr_sample_pos], &in_data[i], + SAMPLING_READ_SIZE); + i += SAMPLING_INTERVAL; + start += SAMPLING_INTERVAL; + curr_sample_pos += SAMPLING_READ_SIZE; + } + kunmap(page); + put_page(page); + + index++; + } + + ws->sample_size = curr_sample_pos; +} + +/* * Compression heuristic. * * For now is's a naive and optimistic 'return true', we'll extend the logic to @@ -1082,18 +1446,87 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, */ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) { - u64 index = start >> PAGE_SHIFT; - u64 end_index = end >> PAGE_SHIFT; - struct page *page; - int ret = 1; + struct list_head *ws_list = __find_workspace(0, true); + struct heuristic_ws *ws; + u32 i; + u8 byte; + int ret = 0; - while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); - kmap(page); - kunmap(page); - put_page(page); - index++; + ws = list_entry(ws_list, struct heuristic_ws, list); + + heuristic_collect_sample(inode, start, end, ws); + + if (sample_repeated_patterns(ws)) { + ret = 1; + goto out; + } + + memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE); + + for (i = 0; i < ws->sample_size; i++) { + byte = ws->sample[i]; + ws->bucket[byte].count++; + } + + i = byte_set_size(ws); + if (i < BYTE_SET_THRESHOLD) { + ret = 2; + goto out; + } + + i = byte_core_set_size(ws); + if (i <= BYTE_CORE_SET_LOW) { + ret = 3; + goto out; } + if (i >= BYTE_CORE_SET_HIGH) { + ret = 0; + goto out; + } + + i = shannon_entropy(ws); + if (i <= ENTROPY_LVL_ACEPTABLE) { + ret = 4; + goto out; + } + + /* + * For the levels below ENTROPY_LVL_HIGH, additional analysis would be + * needed to give green light to compression. + * + * For now just assume that compression at that level is not worth the + * resources because: + * + * 1. it is possible to defrag the data later + * + * 2. the data would turn out to be hardly compressible, eg. 150 byte + * values, every bucket has counter at level ~54. The heuristic would + * be confused. This can happen when data have some internal repeated + * patterns like "abbacbbc...". This can be detected by analyzing + * pairs of bytes, which is too costly. + */ + if (i < ENTROPY_LVL_HIGH) { + ret = 5; + goto out; + } else { + ret = 0; + goto out; + } + +out: + __free_workspace(0, ws_list, true); return ret; } + +unsigned int btrfs_compress_str2level(const char *str) +{ + if (strncmp(str, "zlib", 4) != 0) + return 0; + + /* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */ + if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0) + return str[5] - '0'; + + return 0; +} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index d2781ff8f994..da20755ebf21 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -76,7 +76,7 @@ struct compressed_bio { void btrfs_init_compress(void); void btrfs_exit_compress(void); -int btrfs_compress_pages(int type, struct address_space *mapping, +int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, @@ -95,6 +95,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); +unsigned btrfs_compress_str2level(const char *str); + enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, BTRFS_COMPRESS_ZLIB = 1, @@ -124,6 +126,8 @@ struct btrfs_compress_op { struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); + + void (*set_level)(struct list_head *ws, unsigned int type); }; extern const struct btrfs_compress_op btrfs_zlib_compress; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6d49db7d86be..531e0a8645b0 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -192,7 +192,7 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) * tree until you end up with a lock on the root. A locked buffer * is returned, with a reference held. */ -static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) { struct extent_buffer *eb; @@ -5496,8 +5496,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } else if (left_end_reached) { if (right_level == 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, ctx); @@ -5508,8 +5507,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, continue; } else if (right_end_reached) { if (left_level == 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, ctx); @@ -5523,8 +5521,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, if (left_level == 0 && right_level == 0) { cmp = btrfs_comp_cpu_keys(&left_key, &right_key); if (cmp < 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, ctx); @@ -5532,8 +5529,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; advance_left = ADVANCE; } else if (cmp > 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, ctx); @@ -5550,8 +5546,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, result = BTRFS_COMPARE_TREE_CHANGED; else result = BTRFS_COMPARE_TREE_SAME; - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &left_key, result, ctx); if (ret < 0) goto out; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8fc690384c58..f7df5536ab61 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -523,7 +523,7 @@ struct btrfs_caching_control { }; /* Once caching_thread() finds this much free space, it will wake up waiters. */ -#define CACHING_CTL_WAKE_UP (1024 * 1024 * 2) +#define CACHING_CTL_WAKE_UP SZ_2M struct btrfs_io_ctl { void *cur, *orig; @@ -763,8 +763,6 @@ struct btrfs_fs_info { * delayed dir index item */ struct btrfs_block_rsv global_block_rsv; - /* block reservation for delay allocation */ - struct btrfs_block_rsv delalloc_block_rsv; /* block reservation for metadata operations */ struct btrfs_block_rsv trans_block_rsv; /* block reservation for chunk tree */ @@ -790,6 +788,7 @@ struct btrfs_fs_info { */ unsigned long pending_changes; unsigned long compress_type:4; + unsigned int compress_level; int commit_interval; /* * It is a suggestive number, the read side is safe even it gets a @@ -878,9 +877,6 @@ struct btrfs_fs_info { rwlock_t tree_mod_log_lock; struct rb_root tree_mod_log; - atomic_t nr_async_submits; - atomic_t async_submit_draining; - atomic_t nr_async_bios; atomic_t async_delalloc_pages; atomic_t open_ioctl_trans; @@ -1100,6 +1096,11 @@ struct btrfs_fs_info { u32 nodesize; u32 sectorsize; u32 stripesize; + +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + spinlock_t ref_verify_lock; + struct rb_root block_tree; +#endif }; static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) @@ -1338,6 +1339,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) #define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) #define BTRFS_MOUNT_NOLOGREPLAY (1 << 27) +#define BTRFS_MOUNT_REF_VERIFY (1 << 28) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) @@ -2639,7 +2641,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, u64 parent, int last_ref); int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - u64 root_objectid, u64 owner, + struct btrfs_root *root, u64 owner, u64 offset, u64 ram_bytes, struct btrfs_key *ins); int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, @@ -2658,7 +2660,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 flags, int level, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset); @@ -2670,7 +2672,7 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset); @@ -2744,6 +2746,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, u64 *qgroup_reserved, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); + int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_space(struct inode *inode, @@ -2751,6 +2755,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode, void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv, + unsigned short type); void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv); @@ -2809,6 +2816,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, u64 min_trans); @@ -2821,9 +2829,7 @@ enum btrfs_compare_tree_result { BTRFS_COMPARE_TREE_CHANGED, BTRFS_COMPARE_TREE_SAME, }; -typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root, - struct btrfs_root *right_root, - struct btrfs_path *left_path, +typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path, struct btrfs_path *right_path, struct btrfs_key *key, enum btrfs_compare_tree_result result, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 19e4ad2f3f2e..5d73f79ded8b 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -581,7 +581,6 @@ static int btrfs_delayed_inode_reserve_metadata( struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; - bool release = false; src_rsv = trans->block_rsv; dst_rsv = &fs_info->delayed_block_rsv; @@ -589,36 +588,13 @@ static int btrfs_delayed_inode_reserve_metadata( num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); /* - * If our block_rsv is the delalloc block reserve then check and see if - * we have our extra reservation for updating the inode. If not fall - * through and try to reserve space quickly. - * - * We used to try and steal from the delalloc block rsv or the global - * reserve, but we'd steal a full reservation, which isn't kind. We are - * here through delalloc which means we've likely just cowed down close - * to the leaf that contains the inode, so we would steal less just - * doing the fallback inode update, so if we do end up having to steal - * from the global block rsv we hopefully only steal one or two blocks - * worth which is less likely to hurt us. - */ - if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) { - spin_lock(&inode->lock); - if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &inode->runtime_flags)) - release = true; - else - src_rsv = NULL; - spin_unlock(&inode->lock); - } - - /* * btrfs_dirty_inode will update the inode under btrfs_join_transaction * which doesn't reserve space for speed. This is a problem since we * still need to reserve space for this update, so try to reserve the * space. * * Now if src_rsv == delalloc_block_rsv we'll let it just steal since - * we're accounted for. + * we always reserve enough to update the inode item. */ if (!src_rsv || (!trans->bytes_reserved && src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { @@ -643,32 +619,12 @@ static int btrfs_delayed_inode_reserve_metadata( } ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); - - /* - * Migrate only takes a reservation, it doesn't touch the size of the - * block_rsv. This is to simplify people who don't normally have things - * migrated from their block rsv. If they go to release their - * reservation, that will decrease the size as well, so if migrate - * reduced size we'd end up with a negative size. But for the - * delalloc_meta_reserved stuff we will only know to drop 1 reservation, - * but we could in fact do this reserve/migrate dance several times - * between the time we did the original reservation and we'd clean it - * up. So to take care of this, release the space for the meta - * reservation here. I think it may be time for a documentation page on - * how block rsvs. work. - */ if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_inode", btrfs_ino(inode), num_bytes, 1); node->bytes_reserved = num_bytes; } - if (release) { - trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(inode), num_bytes, 0); - btrfs_block_rsv_release(fs_info, src_rsv, num_bytes); - } - return ret; } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 93ffa898df6d..83be8f9fd906 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -40,10 +40,10 @@ struct kmem_cache *btrfs_delayed_extent_op_cachep; /* * compare two delayed tree backrefs with same bytenr and type */ -static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, - struct btrfs_delayed_tree_ref *ref1, int type) +static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1, + struct btrfs_delayed_tree_ref *ref2) { - if (type == BTRFS_TREE_BLOCK_REF_KEY) { + if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { if (ref1->root < ref2->root) return -1; if (ref1->root > ref2->root) @@ -60,8 +60,8 @@ static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, /* * compare two delayed data backrefs with same bytenr and type */ -static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, - struct btrfs_delayed_data_ref *ref1) +static int comp_data_refs(struct btrfs_delayed_data_ref *ref1, + struct btrfs_delayed_data_ref *ref2) { if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { if (ref1->root < ref2->root) @@ -85,6 +85,34 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, return 0; } +static int comp_refs(struct btrfs_delayed_ref_node *ref1, + struct btrfs_delayed_ref_node *ref2, + bool check_seq) +{ + int ret = 0; + + if (ref1->type < ref2->type) + return -1; + if (ref1->type > ref2->type) + return 1; + if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || + ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) + ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1), + btrfs_delayed_node_to_tree_ref(ref2)); + else + ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1), + btrfs_delayed_node_to_data_ref(ref2)); + if (ret) + return ret; + if (check_seq) { + if (ref1->seq < ref2->seq) + return -1; + if (ref1->seq > ref2->seq) + return 1; + } + return 0; +} + /* insert a new ref to head ref rbtree */ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, struct rb_node *node) @@ -96,15 +124,43 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, u64 bytenr; ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); - bytenr = ins->node.bytenr; + bytenr = ins->bytenr; while (*p) { parent_node = *p; entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, href_node); - if (bytenr < entry->node.bytenr) + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + return NULL; +} + +static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root, + struct btrfs_delayed_ref_node *ins) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *node = &ins->ref_node; + struct rb_node *parent_node = NULL; + struct btrfs_delayed_ref_node *entry; + + while (*p) { + int comp; + + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, + ref_node); + comp = comp_refs(ins, entry, true); + if (comp < 0) p = &(*p)->rb_left; - else if (bytenr > entry->node.bytenr) + else if (comp > 0) p = &(*p)->rb_right; else return entry; @@ -133,15 +189,15 @@ find_ref_head(struct rb_root *root, u64 bytenr, while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - if (bytenr < entry->node.bytenr) + if (bytenr < entry->bytenr) n = n->rb_left; - else if (bytenr > entry->node.bytenr) + else if (bytenr > entry->bytenr) n = n->rb_right; else return entry; } if (entry && return_bigger) { - if (bytenr > entry->node.bytenr) { + if (bytenr > entry->bytenr) { n = rb_next(&entry->href_node); if (!n) n = rb_first(root); @@ -164,17 +220,17 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, if (mutex_trylock(&head->mutex)) return 0; - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); spin_lock(&delayed_refs->lock); - if (!head->node.in_tree) { + if (RB_EMPTY_NODE(&head->href_node)) { mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return -EAGAIN; } - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return 0; } @@ -183,15 +239,11 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { - if (btrfs_delayed_ref_is_head(ref)) { - head = btrfs_delayed_node_to_head(ref); - rb_erase(&head->href_node, &delayed_refs->href_root); - } else { - assert_spin_locked(&head->lock); - list_del(&ref->list); - if (!list_empty(&ref->add_list)) - list_del(&ref->add_list); - } + assert_spin_locked(&head->lock); + rb_erase(&ref->ref_node, &head->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); + if (!list_empty(&ref->add_list)) + list_del(&ref->add_list); ref->in_tree = 0; btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); @@ -206,36 +258,18 @@ static bool merge_ref(struct btrfs_trans_handle *trans, u64 seq) { struct btrfs_delayed_ref_node *next; + struct rb_node *node = rb_next(&ref->ref_node); bool done = false; - next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, - list); - while (!done && &next->list != &head->ref_list) { + while (!done && node) { int mod; - struct btrfs_delayed_ref_node *next2; - - next2 = list_next_entry(next, list); - - if (next == ref) - goto next; + next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); + node = rb_next(node); if (seq && next->seq >= seq) - goto next; - - if (next->type != ref->type) - goto next; - - if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY || - ref->type == BTRFS_SHARED_BLOCK_REF_KEY) && - comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref), - btrfs_delayed_node_to_tree_ref(next), - ref->type)) - goto next; - if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY || - ref->type == BTRFS_SHARED_DATA_REF_KEY) && - comp_data_refs(btrfs_delayed_node_to_data_ref(ref), - btrfs_delayed_node_to_data_ref(next))) - goto next; + break; + if (comp_refs(ref, next, false)) + break; if (ref->action == next->action) { mod = next->ref_mod; @@ -259,8 +293,6 @@ static bool merge_ref(struct btrfs_trans_handle *trans, WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || ref->type == BTRFS_SHARED_BLOCK_REF_KEY); } -next: - next = next2; } return done; @@ -272,11 +304,12 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_ref_node *ref; + struct rb_node *node; u64 seq = 0; assert_spin_locked(&head->lock); - if (list_empty(&head->ref_list)) + if (RB_EMPTY_ROOT(&head->ref_tree)) return; /* We don't have too many refs to merge for data. */ @@ -293,22 +326,13 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, } spin_unlock(&fs_info->tree_mod_seq_lock); - ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, - list); - while (&ref->list != &head->ref_list) { +again: + for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) - goto next; - - if (merge_ref(trans, delayed_refs, head, ref, seq)) { - if (list_empty(&head->ref_list)) - break; - ref = list_first_entry(&head->ref_list, - struct btrfs_delayed_ref_node, - list); continue; - } -next: - ref = list_next_entry(ref, list); + if (merge_ref(trans, delayed_refs, head, ref, seq)) + goto again; } } @@ -380,8 +404,8 @@ again: head->processing = 1; WARN_ON(delayed_refs->num_heads_ready == 0); delayed_refs->num_heads_ready--; - delayed_refs->run_delayed_start = head->node.bytenr + - head->node.num_bytes; + delayed_refs->run_delayed_start = head->bytenr + + head->num_bytes; return head; } @@ -391,37 +415,19 @@ again: * Return 0 for insert. * Return >0 for merge. */ -static int -add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *root, - struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *ref) +static int insert_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *root, + struct btrfs_delayed_ref_head *href, + struct btrfs_delayed_ref_node *ref) { struct btrfs_delayed_ref_node *exist; int mod; int ret = 0; spin_lock(&href->lock); - /* Check whether we can merge the tail node with ref */ - if (list_empty(&href->ref_list)) - goto add_tail; - exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node, - list); - /* No need to compare bytenr nor is_head */ - if (exist->type != ref->type || exist->seq != ref->seq) - goto add_tail; - - if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY || - exist->type == BTRFS_SHARED_BLOCK_REF_KEY) && - comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist), - btrfs_delayed_node_to_tree_ref(ref), - ref->type)) - goto add_tail; - if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY || - exist->type == BTRFS_SHARED_DATA_REF_KEY) && - comp_data_refs(btrfs_delayed_node_to_data_ref(exist), - btrfs_delayed_node_to_data_ref(ref))) - goto add_tail; + exist = tree_insert(&href->ref_tree, ref); + if (!exist) + goto inserted; /* Now we are sure we can merge */ ret = 1; @@ -452,9 +458,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, drop_delayed_ref(trans, root, href, exist); spin_unlock(&href->lock); return ret; - -add_tail: - list_add_tail(&ref->list, &href->ref_list); +inserted: if (ref->action == BTRFS_ADD_DELAYED_REF) list_add_tail(&ref->add_list, &href->ref_add_list); atomic_inc(&root->num_entries); @@ -469,20 +473,16 @@ add_tail: */ static noinline void update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update, + struct btrfs_delayed_ref_head *existing, + struct btrfs_delayed_ref_head *update, int *old_ref_mod_ret) { - struct btrfs_delayed_ref_head *existing_ref; - struct btrfs_delayed_ref_head *ref; int old_ref_mod; - existing_ref = btrfs_delayed_node_to_head(existing); - ref = btrfs_delayed_node_to_head(update); - BUG_ON(existing_ref->is_data != ref->is_data); + BUG_ON(existing->is_data != update->is_data); - spin_lock(&existing_ref->lock); - if (ref->must_insert_reserved) { + spin_lock(&existing->lock); + if (update->must_insert_reserved) { /* if the extent was freed and then * reallocated before the delayed ref * entries were processed, we can end up @@ -490,7 +490,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * the must_insert_reserved flag set. * Set it again here */ - existing_ref->must_insert_reserved = ref->must_insert_reserved; + existing->must_insert_reserved = update->must_insert_reserved; /* * update the num_bytes so we make sure the accounting @@ -500,22 +500,22 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, } - if (ref->extent_op) { - if (!existing_ref->extent_op) { - existing_ref->extent_op = ref->extent_op; + if (update->extent_op) { + if (!existing->extent_op) { + existing->extent_op = update->extent_op; } else { - if (ref->extent_op->update_key) { - memcpy(&existing_ref->extent_op->key, - &ref->extent_op->key, - sizeof(ref->extent_op->key)); - existing_ref->extent_op->update_key = true; + if (update->extent_op->update_key) { + memcpy(&existing->extent_op->key, + &update->extent_op->key, + sizeof(update->extent_op->key)); + existing->extent_op->update_key = true; } - if (ref->extent_op->update_flags) { - existing_ref->extent_op->flags_to_set |= - ref->extent_op->flags_to_set; - existing_ref->extent_op->update_flags = true; + if (update->extent_op->update_flags) { + existing->extent_op->flags_to_set |= + update->extent_op->flags_to_set; + existing->extent_op->update_flags = true; } - btrfs_free_delayed_extent_op(ref->extent_op); + btrfs_free_delayed_extent_op(update->extent_op); } } /* @@ -523,23 +523,23 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * only need the lock for this case cause we could be processing it * currently, for refs we just added we know we're a-ok. */ - old_ref_mod = existing_ref->total_ref_mod; + old_ref_mod = existing->total_ref_mod; if (old_ref_mod_ret) *old_ref_mod_ret = old_ref_mod; existing->ref_mod += update->ref_mod; - existing_ref->total_ref_mod += update->ref_mod; + existing->total_ref_mod += update->ref_mod; /* * If we are going to from a positive ref mod to a negative or vice * versa we need to make sure to adjust pending_csums accordingly. */ - if (existing_ref->is_data) { - if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0) + if (existing->is_data) { + if (existing->total_ref_mod >= 0 && old_ref_mod < 0) delayed_refs->pending_csums -= existing->num_bytes; - if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0) + if (existing->total_ref_mod < 0 && old_ref_mod >= 0) delayed_refs->pending_csums += existing->num_bytes; } - spin_unlock(&existing_ref->lock); + spin_unlock(&existing->lock); } /* @@ -550,14 +550,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, + struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, int action, int is_data, int *qrecord_inserted_ret, int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_ref_head *existing; - struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; int count_mod = 1; int must_insert_reserved = 0; @@ -593,26 +592,21 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; - /* first set the basic ref node struct up */ - refcount_set(&ref->refs, 1); - ref->bytenr = bytenr; - ref->num_bytes = num_bytes; - ref->ref_mod = count_mod; - ref->type = 0; - ref->action = 0; - ref->is_head = 1; - ref->in_tree = 1; - ref->seq = 0; - - head_ref = btrfs_delayed_node_to_head(ref); + refcount_set(&head_ref->refs, 1); + head_ref->bytenr = bytenr; + head_ref->num_bytes = num_bytes; + head_ref->ref_mod = count_mod; head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; - INIT_LIST_HEAD(&head_ref->ref_list); + head_ref->ref_tree = RB_ROOT; INIT_LIST_HEAD(&head_ref->ref_add_list); + RB_CLEAR_NODE(&head_ref->href_node); head_ref->processing = 0; head_ref->total_ref_mod = count_mod; head_ref->qgroup_reserved = 0; head_ref->qgroup_ref_root = 0; + spin_lock_init(&head_ref->lock); + mutex_init(&head_ref->mutex); /* Record qgroup extent info if provided */ if (qrecord) { @@ -632,17 +626,14 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, qrecord_inserted = 1; } - spin_lock_init(&head_ref->lock); - mutex_init(&head_ref->mutex); - - trace_add_delayed_ref_head(fs_info, ref, head_ref, action); + trace_add_delayed_ref_head(fs_info, head_ref, action); existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { WARN_ON(ref_root && reserved && existing->qgroup_ref_root && existing->qgroup_reserved); - update_existing_head_ref(delayed_refs, &existing->node, ref, + update_existing_head_ref(delayed_refs, existing, head_ref, old_ref_mod); /* * we've updated the existing ref, free the newly @@ -699,7 +690,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; ref->seq = seq; - INIT_LIST_HEAD(&ref->list); + RB_CLEAR_NODE(&ref->ref_node); INIT_LIST_HEAD(&ref->add_list); full_ref = btrfs_delayed_node_to_tree_ref(ref); @@ -713,7 +704,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_tree_ref(fs_info, ref, full_ref, action); - ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); + ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref); /* * XXX: memory should be freed at the same level allocated. @@ -756,7 +747,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; ref->seq = seq; - INIT_LIST_HEAD(&ref->list); + RB_CLEAR_NODE(&ref->ref_node); INIT_LIST_HEAD(&ref->add_list); full_ref = btrfs_delayed_node_to_data_ref(ref); @@ -772,8 +763,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_data_ref(fs_info, ref, full_ref, action); - ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); - + ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref); if (ret > 0) kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); } @@ -821,7 +811,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, + head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, bytenr, num_bytes, 0, 0, action, 0, &qrecord_inserted, old_ref_mod, new_ref_mod); @@ -888,7 +878,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, + head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, bytenr, num_bytes, ref_root, reserved, action, 1, &qrecord_inserted, old_ref_mod, new_ref_mod); @@ -920,7 +910,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, + add_delayed_ref_head(fs_info, trans, head_ref, NULL, bytenr, num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data, NULL, NULL, NULL); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index ce88e4ac5276..a43af432f859 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -26,18 +26,8 @@ #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ #define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ -/* - * XXX: Qu: I really hate the design that ref_head and tree/data ref shares the - * same ref_node structure. - * Ref_head is in a higher logic level than tree/data ref, and duplicated - * bytenr/num_bytes in ref_node is really a waste or memory, they should be - * referred from ref_head. - * This gets more disgusting after we use list to store tree/data ref in - * ref_head. Must clean this mess up later. - */ struct btrfs_delayed_ref_node { - /*data/tree ref use list, stored in ref_head->ref_list. */ - struct list_head list; + struct rb_node ref_node; /* * If action is BTRFS_ADD_DELAYED_REF, also link this node to * ref_head->ref_add_list, then we do not need to iterate the @@ -91,8 +81,9 @@ struct btrfs_delayed_extent_op { * reference count modifications we've queued up. */ struct btrfs_delayed_ref_head { - struct btrfs_delayed_ref_node node; - + u64 bytenr; + u64 num_bytes; + refcount_t refs; /* * the mutex is held while running the refs, and it is also * held when checking the sum of reference modifications. @@ -100,7 +91,7 @@ struct btrfs_delayed_ref_head { struct mutex mutex; spinlock_t lock; - struct list_head ref_list; + struct rb_root ref_tree; /* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */ struct list_head ref_add_list; @@ -116,6 +107,14 @@ struct btrfs_delayed_ref_head { int total_ref_mod; /* + * This is the current outstanding mod references for this bytenr. This + * is used with lookup_extent_info to get an accurate reference count + * for a bytenr, so it is adjusted as delayed refs are run so that any + * on disk reference count + ref_mod is accurate. + */ + int ref_mod; + + /* * For qgroup reserved space freeing. * * ref_root and reserved will be recorded after @@ -234,15 +233,18 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) case BTRFS_SHARED_DATA_REF_KEY: kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); break; - case 0: - kmem_cache_free(btrfs_delayed_ref_head_cachep, ref); - break; default: BUG(); } } } +static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head) +{ + if (refcount_dec_and_test(&head->refs)) + kmem_cache_free(btrfs_delayed_ref_head_cachep, head); +} + int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, @@ -283,35 +285,17 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); /* - * a node might live in a head or a regular ref, this lets you - * test for the proper type to use. - */ -static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) -{ - return node->is_head; -} - -/* * helper functions to cast a node into its container */ static inline struct btrfs_delayed_tree_ref * btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) { - WARN_ON(btrfs_delayed_ref_is_head(node)); return container_of(node, struct btrfs_delayed_tree_ref, node); } static inline struct btrfs_delayed_data_ref * btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) { - WARN_ON(btrfs_delayed_ref_is_head(node)); return container_of(node, struct btrfs_delayed_data_ref, node); } - -static inline struct btrfs_delayed_ref_head * -btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) -{ - WARN_ON(!btrfs_delayed_ref_is_head(node)); - return container_of(node, struct btrfs_delayed_ref_head, node); -} #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dfdab849037b..efce9a2fa9be 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,6 +50,8 @@ #include "sysfs.h" #include "qgroup.h" #include "compression.h" +#include "tree-checker.h" +#include "ref-verify.h" #ifdef CONFIG_X86 #include <asm/cpufeature.h> @@ -543,146 +545,6 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, return ret; } -#define CORRUPT(reason, eb, root, slot) \ - btrfs_crit(root->fs_info, \ - "corrupt %s, %s: block=%llu, root=%llu, slot=%d", \ - btrfs_header_level(eb) == 0 ? "leaf" : "node", \ - reason, btrfs_header_bytenr(eb), root->objectid, slot) - -static noinline int check_leaf(struct btrfs_root *root, - struct extent_buffer *leaf) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_key key; - struct btrfs_key leaf_key; - u32 nritems = btrfs_header_nritems(leaf); - int slot; - - /* - * Extent buffers from a relocation tree have a owner field that - * corresponds to the subvolume tree they are based on. So just from an - * extent buffer alone we can not find out what is the id of the - * corresponding subvolume tree, so we can not figure out if the extent - * buffer corresponds to the root of the relocation tree or not. So skip - * this check for relocation trees. - */ - if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { - struct btrfs_root *check_root; - - key.objectid = btrfs_header_owner(leaf); - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - check_root = btrfs_get_fs_root(fs_info, &key, false); - /* - * The only reason we also check NULL here is that during - * open_ctree() some roots has not yet been set up. - */ - if (!IS_ERR_OR_NULL(check_root)) { - struct extent_buffer *eb; - - eb = btrfs_root_node(check_root); - /* if leaf is the root, then it's fine */ - if (leaf != eb) { - CORRUPT("non-root leaf's nritems is 0", - leaf, check_root, 0); - free_extent_buffer(eb); - return -EIO; - } - free_extent_buffer(eb); - } - return 0; - } - - if (nritems == 0) - return 0; - - /* Check the 0 item */ - if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != - BTRFS_LEAF_DATA_SIZE(fs_info)) { - CORRUPT("invalid item offset size pair", leaf, root, 0); - return -EIO; - } - - /* - * Check to make sure each items keys are in the correct order and their - * offsets make sense. We only have to loop through nritems-1 because - * we check the current slot against the next slot, which verifies the - * next slot's offset+size makes sense and that the current's slot - * offset is correct. - */ - for (slot = 0; slot < nritems - 1; slot++) { - btrfs_item_key_to_cpu(leaf, &leaf_key, slot); - btrfs_item_key_to_cpu(leaf, &key, slot + 1); - - /* Make sure the keys are in the right order */ - if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) { - CORRUPT("bad key order", leaf, root, slot); - return -EIO; - } - - /* - * Make sure the offset and ends are right, remember that the - * item data starts at the end of the leaf and grows towards the - * front. - */ - if (btrfs_item_offset_nr(leaf, slot) != - btrfs_item_end_nr(leaf, slot + 1)) { - CORRUPT("slot offset bad", leaf, root, slot); - return -EIO; - } - - /* - * Check to make sure that we don't point outside of the leaf, - * just in case all the items are consistent to each other, but - * all point outside of the leaf. - */ - if (btrfs_item_end_nr(leaf, slot) > - BTRFS_LEAF_DATA_SIZE(fs_info)) { - CORRUPT("slot end outside of leaf", leaf, root, slot); - return -EIO; - } - } - - return 0; -} - -static int check_node(struct btrfs_root *root, struct extent_buffer *node) -{ - unsigned long nr = btrfs_header_nritems(node); - struct btrfs_key key, next_key; - int slot; - u64 bytenr; - int ret = 0; - - if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) { - btrfs_crit(root->fs_info, - "corrupt node: block %llu root %llu nritems %lu", - node->start, root->objectid, nr); - return -EIO; - } - - for (slot = 0; slot < nr - 1; slot++) { - bytenr = btrfs_node_blockptr(node, slot); - btrfs_node_key_to_cpu(node, &key, slot); - btrfs_node_key_to_cpu(node, &next_key, slot + 1); - - if (!bytenr) { - CORRUPT("invalid item slot", node, root, slot); - ret = -EIO; - goto out; - } - - if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { - CORRUPT("bad key order", node, root, slot); - ret = -EIO; - goto out; - } - } -out: - return ret; -} - static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror) @@ -748,12 +610,12 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, * that we don't try and read the other copies of this block, just * return -EIO. */ - if (found_level == 0 && check_leaf(root, eb)) { + if (found_level == 0 && btrfs_check_leaf(root, eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } - if (found_level > 0 && check_node(root, eb)) + if (found_level > 0 && btrfs_check_node(root, eb)) ret = -EIO; if (!ret) @@ -879,22 +741,9 @@ static void run_one_async_start(struct btrfs_work *work) static void run_one_async_done(struct btrfs_work *work) { - struct btrfs_fs_info *fs_info; struct async_submit_bio *async; - int limit; async = container_of(work, struct async_submit_bio, work); - fs_info = async->fs_info; - - limit = btrfs_async_submit_limit(fs_info); - limit = limit * 2 / 3; - - /* - * atomic_dec_return implies a barrier for waitqueue_active - */ - if (atomic_dec_return(&fs_info->nr_async_submits) < limit && - waitqueue_active(&fs_info->async_submit_wait)) - wake_up(&fs_info->async_submit_wait); /* If an error occurred we just want to clean up the bio and move on */ if (async->status) { @@ -942,19 +791,10 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, async->status = 0; - atomic_inc(&fs_info->nr_async_submits); - if (op_is_sync(bio->bi_opf)) btrfs_set_work_high_priority(&async->work); btrfs_queue_work(fs_info->workers, &async->work); - - while (atomic_read(&fs_info->async_submit_draining) && - atomic_read(&fs_info->nr_async_submits)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) == 0)); - } - return 0; } @@ -1005,9 +845,9 @@ static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio, return ret; } -static int check_async_write(unsigned long bio_flags) +static int check_async_write(struct btrfs_inode *bi) { - if (bio_flags & EXTENT_BIO_TREE_LOG) + if (atomic_read(&bi->sync_writers)) return 0; #ifdef CONFIG_X86 if (static_cpu_has(X86_FEATURE_XMM4_2)) @@ -1022,7 +862,7 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, { struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int async = check_async_write(bio_flags); + int async = check_async_write(BTRFS_I(inode)); blk_status_t ret; if (bio_op(bio) != REQ_OP_WRITE) { @@ -2607,14 +2447,6 @@ int open_ctree(struct super_block *sb, goto fail_delalloc_bytes; } - fs_info->btree_inode = new_inode(sb); - if (!fs_info->btree_inode) { - err = -ENOMEM; - goto fail_bio_counter; - } - - mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); @@ -2647,17 +2479,12 @@ int open_ctree(struct super_block *sb, btrfs_mapping_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); - btrfs_init_block_rsv(&fs_info->delalloc_block_rsv, - BTRFS_BLOCK_RSV_DELALLOC); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, BTRFS_BLOCK_RSV_DELOPS); - atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); - atomic_set(&fs_info->async_submit_draining, 0); - atomic_set(&fs_info->nr_async_bios, 0); atomic_set(&fs_info->defrag_running, 0); atomic_set(&fs_info->qgroup_op_seq, 0); atomic_set(&fs_info->reada_works_cnt, 0); @@ -2673,12 +2500,21 @@ int open_ctree(struct super_block *sb, /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); spin_lock_init(&fs_info->reada_lock); + btrfs_init_ref_verify(fs_info); fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); INIT_LIST_HEAD(&fs_info->ordered_roots); spin_lock_init(&fs_info->ordered_root_lock); + + fs_info->btree_inode = new_inode(sb); + if (!fs_info->btree_inode) { + err = -ENOMEM; + goto fail_bio_counter; + } + mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); + fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), GFP_KERNEL); if (!fs_info->delayed_root) { @@ -2895,12 +2731,13 @@ int open_ctree(struct super_block *sb, sb->s_bdi->congested_fn = btrfs_congested_fn; sb->s_bdi->congested_data = fs_info; sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE; sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); + memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE); mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(fs_info); @@ -3083,6 +2920,9 @@ retry_root_backup: if (ret) goto fail_trans_kthread; + if (btrfs_build_ref_tree(fs_info)) + btrfs_err(fs_info, "couldn't build ref tree"); + /* do not make disk changes in broken FS or nologreplay is given */ if (btrfs_super_log_root(disk_super) != 0 && !btrfs_test_opt(fs_info, NOLOGREPLAY)) { @@ -3948,6 +3788,7 @@ void close_ctree(struct btrfs_fs_info *fs_info) cleanup_srcu_struct(&fs_info->subvol_srcu); btrfs_free_stripe_hash_table(fs_info); + btrfs_free_ref_cache(fs_info); __btrfs_free_block_rsv(root->orphan_block_rsv); root->orphan_block_rsv = NULL; @@ -4007,7 +3848,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) buf->len, fs_info->dirty_metadata_batch); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { + if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(root, buf)) { btrfs_print_leaf(buf); ASSERT(0); } @@ -4272,26 +4113,28 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, while ((node = rb_first(&delayed_refs->href_root)) != NULL) { struct btrfs_delayed_ref_head *head; - struct btrfs_delayed_ref_node *tmp; + struct rb_node *n; bool pin_bytes = false; head = rb_entry(node, struct btrfs_delayed_ref_head, href_node); if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); spin_lock(&delayed_refs->lock); continue; } spin_lock(&head->lock); - list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list, - list) { + while ((n = rb_first(&head->ref_tree)) != NULL) { + ref = rb_entry(n, struct btrfs_delayed_ref_node, + ref_node); ref->in_tree = 0; - list_del(&ref->list); + rb_erase(&ref->ref_node, &head->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); atomic_dec(&delayed_refs->num_entries); @@ -4304,16 +4147,16 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (head->processing == 0) delayed_refs->num_heads_ready--; atomic_dec(&delayed_refs->num_entries); - head->node.in_tree = 0; rb_erase(&head->href_node, &delayed_refs->href_root); + RB_CLEAR_NODE(&head->href_node); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); if (pin_bytes) - btrfs_pin_extent(fs_info, head->node.bytenr, - head->node.num_bytes, 1); - btrfs_put_delayed_ref(&head->node); + btrfs_pin_extent(fs_info, head->bytenr, + head->num_bytes, 1); + btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); } diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index fa66980726c9..3aeb5770f896 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/types.h> #include "ctree.h" diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index 074348a95841..91b3908e7c54 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef BTRFS_EXPORT_H #define BTRFS_EXPORT_H diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e2d7e86b51d1..673ac4e01dd0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -26,6 +26,7 @@ #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/percpu_counter.h> +#include <linux/lockdep.h> #include "hash.h" #include "tree-log.h" #include "disk-io.h" @@ -38,6 +39,7 @@ #include "math.h" #include "sysfs.h" #include "qgroup.h" +#include "ref-verify.h" #undef SCRAMBLE_DELAYED_REFS @@ -61,9 +63,6 @@ enum { CHUNK_ALLOC_FORCE = 2, }; -static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *node, u64 parent, @@ -91,17 +90,8 @@ static int find_next_key(struct btrfs_path *path, int level, static void dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, int dump_block_groups); -static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 ram_bytes, u64 num_bytes, int delalloc); -static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int delalloc); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); -static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk); static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes); @@ -652,7 +642,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->cached = BTRFS_CACHE_FAST; spin_unlock(&cache->lock); - if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { + if (btrfs_test_opt(fs_info, SPACE_CACHE)) { mutex_lock(&caching_ctl->mutex); ret = load_free_space_cache(fs_info, cache); @@ -923,7 +913,7 @@ search_again: head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -934,7 +924,7 @@ search_again: */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); goto search_again; } spin_lock(&head->lock); @@ -943,7 +933,7 @@ search_again: else BUG_ON(num_refs == 0); - num_refs += head->node.ref_mod; + num_refs += head->ref_mod; spin_unlock(&head->lock); mutex_unlock(&head->mutex); } @@ -2189,16 +2179,20 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, /* Can return -ENOMEM */ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { + struct btrfs_fs_info *fs_info = root->fs_info; int old_ref_mod, new_ref_mod; int ret; BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && root_objectid == BTRFS_TREE_LOG_OBJECTID); + btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid, + owner, offset, BTRFS_ADD_DELAYED_REF); + if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, @@ -2344,7 +2338,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_ref_head *head, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_key key; @@ -2366,14 +2360,14 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - key.objectid = node->bytenr; + key.objectid = head->bytenr; if (metadata) { key.type = BTRFS_METADATA_ITEM_KEY; key.offset = extent_op->level; } else { key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = node->num_bytes; + key.offset = head->num_bytes; } again: @@ -2390,17 +2384,17 @@ again: path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == node->bytenr && + if (key.objectid == head->bytenr && key.type == BTRFS_EXTENT_ITEM_KEY && - key.offset == node->num_bytes) + key.offset == head->num_bytes) ret = 0; } if (ret > 0) { btrfs_release_path(path); metadata = 0; - key.objectid = node->bytenr; - key.offset = node->num_bytes; + key.objectid = head->bytenr; + key.offset = head->num_bytes; key.type = BTRFS_EXTENT_ITEM_KEY; goto again; } @@ -2507,44 +2501,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, return 0; } - if (btrfs_delayed_ref_is_head(node)) { - struct btrfs_delayed_ref_head *head; - /* - * we've hit the end of the chain and we were supposed - * to insert this extent into the tree. But, it got - * deleted before we ever needed to insert it, so all - * we have to do is clean up the accounting - */ - BUG_ON(extent_op); - head = btrfs_delayed_node_to_head(node); - trace_run_delayed_ref_head(fs_info, node, head, node->action); - - if (head->total_ref_mod < 0) { - struct btrfs_block_group_cache *cache; - - cache = btrfs_lookup_block_group(fs_info, node->bytenr); - ASSERT(cache); - percpu_counter_add(&cache->space_info->total_bytes_pinned, - -node->num_bytes); - btrfs_put_block_group(cache); - } - - if (insert_reserved) { - btrfs_pin_extent(fs_info, node->bytenr, - node->num_bytes, 1); - if (head->is_data) { - ret = btrfs_del_csums(trans, fs_info, - node->bytenr, - node->num_bytes); - } - } - - /* Also free its reserved qgroup space */ - btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, - head->qgroup_reserved); - return ret; - } - if (node->type == BTRFS_TREE_BLOCK_REF_KEY || node->type == BTRFS_SHARED_BLOCK_REF_KEY) ret = run_delayed_tree_ref(trans, fs_info, node, extent_op, @@ -2563,7 +2519,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_ref_node *ref; - if (list_empty(&head->ref_list)) + if (RB_EMPTY_ROOT(&head->ref_tree)) return NULL; /* @@ -2576,12 +2532,114 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) return list_first_entry(&head->ref_add_list, struct btrfs_delayed_ref_node, add_list); - ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, - list); + ref = rb_entry(rb_first(&head->ref_tree), + struct btrfs_delayed_ref_node, ref_node); ASSERT(list_empty(&ref->add_list)); return ref; } +static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + spin_lock(&delayed_refs->lock); + head->processing = 0; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); + btrfs_delayed_ref_unlock(head); +} + +static int cleanup_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_extent_op *extent_op = head->extent_op; + int ret; + + if (!extent_op) + return 0; + head->extent_op = NULL; + if (head->must_insert_reserved) { + btrfs_free_delayed_extent_op(extent_op); + return 0; + } + spin_unlock(&head->lock); + ret = run_delayed_extent_op(trans, fs_info, head, extent_op); + btrfs_free_delayed_extent_op(extent_op); + return ret ? ret : 1; +} + +static int cleanup_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + delayed_refs = &trans->transaction->delayed_refs; + + ret = cleanup_extent_op(trans, fs_info, head); + if (ret < 0) { + unselect_delayed_ref_head(delayed_refs, head); + btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); + return ret; + } else if (ret) { + return ret; + } + + /* + * Need to drop our head ref lock and re-acquire the delayed ref lock + * and then re-check to make sure nobody got added. + */ + spin_unlock(&head->lock); + spin_lock(&delayed_refs->lock); + spin_lock(&head->lock); + if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) { + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + return 1; + } + delayed_refs->num_heads--; + rb_erase(&head->href_node, &delayed_refs->href_root); + RB_CLEAR_NODE(&head->href_node); + spin_unlock(&delayed_refs->lock); + spin_unlock(&head->lock); + atomic_dec(&delayed_refs->num_entries); + + trace_run_delayed_ref_head(fs_info, head, 0); + + if (head->total_ref_mod < 0) { + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(fs_info, head->bytenr); + ASSERT(cache); + percpu_counter_add(&cache->space_info->total_bytes_pinned, + -head->num_bytes); + btrfs_put_block_group(cache); + + if (head->is_data) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= head->num_bytes; + spin_unlock(&delayed_refs->lock); + } + } + + if (head->must_insert_reserved) { + btrfs_pin_extent(fs_info, head->bytenr, + head->num_bytes, 1); + if (head->is_data) { + ret = btrfs_del_csums(trans, fs_info, head->bytenr, + head->num_bytes); + } + } + + /* Also free its reserved qgroup space */ + btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, + head->qgroup_reserved); + btrfs_delayed_ref_unlock(head); + btrfs_put_delayed_ref_head(head); + return 0; +} + /* * Returns 0 on success or if called with an already aborted transaction. * Returns -ENOMEM or -EIO on failure and will abort the transaction. @@ -2655,11 +2713,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (ref && ref->seq && btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { spin_unlock(&locked_ref->lock); - spin_lock(&delayed_refs->lock); - locked_ref->processing = 0; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - btrfs_delayed_ref_unlock(locked_ref); + unselect_delayed_ref_head(delayed_refs, locked_ref); locked_ref = NULL; cond_resched(); count++; @@ -2667,102 +2721,55 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } /* - * record the must insert reserved flag before we - * drop the spin lock. + * We're done processing refs in this ref_head, clean everything + * up and move on to the next ref_head. */ - must_insert_reserved = locked_ref->must_insert_reserved; - locked_ref->must_insert_reserved = 0; - - extent_op = locked_ref->extent_op; - locked_ref->extent_op = NULL; - if (!ref) { - - - /* All delayed refs have been processed, Go ahead - * and send the head node to run_one_delayed_ref, - * so that any accounting fixes can happen - */ - ref = &locked_ref->node; - - if (extent_op && must_insert_reserved) { - btrfs_free_delayed_extent_op(extent_op); - extent_op = NULL; - } - - if (extent_op) { - spin_unlock(&locked_ref->lock); - ret = run_delayed_extent_op(trans, fs_info, - ref, extent_op); - btrfs_free_delayed_extent_op(extent_op); - - if (ret) { - /* - * Need to reset must_insert_reserved if - * there was an error so the abort stuff - * can cleanup the reserved space - * properly. - */ - if (must_insert_reserved) - locked_ref->must_insert_reserved = 1; - spin_lock(&delayed_refs->lock); - locked_ref->processing = 0; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - btrfs_debug(fs_info, - "run_delayed_extent_op returned %d", - ret); - btrfs_delayed_ref_unlock(locked_ref); - return ret; - } + ret = cleanup_ref_head(trans, fs_info, locked_ref); + if (ret > 0 ) { + /* We dropped our lock, we need to loop. */ + ret = 0; continue; + } else if (ret) { + return ret; } + locked_ref = NULL; + count++; + continue; + } - /* - * Need to drop our head ref lock and re-acquire the - * delayed ref lock and then re-check to make sure - * nobody got added. - */ - spin_unlock(&locked_ref->lock); - spin_lock(&delayed_refs->lock); - spin_lock(&locked_ref->lock); - if (!list_empty(&locked_ref->ref_list) || - locked_ref->extent_op) { - spin_unlock(&locked_ref->lock); - spin_unlock(&delayed_refs->lock); - continue; - } - ref->in_tree = 0; - delayed_refs->num_heads--; - rb_erase(&locked_ref->href_node, - &delayed_refs->href_root); - spin_unlock(&delayed_refs->lock); - } else { - actual_count++; - ref->in_tree = 0; - list_del(&ref->list); - if (!list_empty(&ref->add_list)) - list_del(&ref->add_list); + actual_count++; + ref->in_tree = 0; + rb_erase(&ref->ref_node, &locked_ref->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); + if (!list_empty(&ref->add_list)) + list_del(&ref->add_list); + /* + * When we play the delayed ref, also correct the ref_mod on + * head + */ + switch (ref->action) { + case BTRFS_ADD_DELAYED_REF: + case BTRFS_ADD_DELAYED_EXTENT: + locked_ref->ref_mod -= ref->ref_mod; + break; + case BTRFS_DROP_DELAYED_REF: + locked_ref->ref_mod += ref->ref_mod; + break; + default: + WARN_ON(1); } atomic_dec(&delayed_refs->num_entries); - if (!btrfs_delayed_ref_is_head(ref)) { - /* - * when we play the delayed ref, also correct the - * ref_mod on head - */ - switch (ref->action) { - case BTRFS_ADD_DELAYED_REF: - case BTRFS_ADD_DELAYED_EXTENT: - locked_ref->node.ref_mod -= ref->ref_mod; - break; - case BTRFS_DROP_DELAYED_REF: - locked_ref->node.ref_mod += ref->ref_mod; - break; - default: - WARN_ON(1); - } - } + /* + * Record the must-insert_reserved flag before we drop the spin + * lock. + */ + must_insert_reserved = locked_ref->must_insert_reserved; + locked_ref->must_insert_reserved = 0; + + extent_op = locked_ref->extent_op; + locked_ref->extent_op = NULL; spin_unlock(&locked_ref->lock); ret = run_one_delayed_ref(trans, fs_info, ref, extent_op, @@ -2770,33 +2777,13 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, btrfs_free_delayed_extent_op(extent_op); if (ret) { - spin_lock(&delayed_refs->lock); - locked_ref->processing = 0; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - btrfs_delayed_ref_unlock(locked_ref); + unselect_delayed_ref_head(delayed_refs, locked_ref); btrfs_put_delayed_ref(ref); btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); return ret; } - /* - * If this node is a head, that means all the refs in this head - * have been dealt with, and we will pick the next head to deal - * with, so we must unlock the head and drop it from the cluster - * list before we release it. - */ - if (btrfs_delayed_ref_is_head(ref)) { - if (locked_ref->is_data && - locked_ref->total_ref_mod < 0) { - spin_lock(&delayed_refs->lock); - delayed_refs->pending_csums -= ref->num_bytes; - spin_unlock(&delayed_refs->lock); - } - btrfs_delayed_ref_unlock(locked_ref); - locked_ref = NULL; - } btrfs_put_delayed_ref(ref); count++; cond_resched(); @@ -3100,33 +3087,16 @@ again: spin_unlock(&delayed_refs->lock); goto out; } + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + refcount_inc(&head->refs); + spin_unlock(&delayed_refs->lock); - while (node) { - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); - if (btrfs_delayed_ref_is_head(&head->node)) { - struct btrfs_delayed_ref_node *ref; - - ref = &head->node; - refcount_inc(&ref->refs); - - spin_unlock(&delayed_refs->lock); - /* - * Mutex was contended, block until it's - * released and try again - */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); + /* Mutex was contended, block until it's released and retry. */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); - cond_resched(); - goto again; - } else { - WARN_ON(1); - } - node = rb_next(node); - } - spin_unlock(&delayed_refs->lock); + btrfs_put_delayed_ref_head(head); cond_resched(); goto again; } @@ -3169,6 +3139,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, struct btrfs_delayed_data_ref *data_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_transaction *cur_trans; + struct rb_node *node; int ret = 0; cur_trans = root->fs_info->running_transaction; @@ -3184,7 +3155,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, } if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -3195,13 +3166,18 @@ static noinline int check_delayed_ref(struct btrfs_root *root, */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return -EAGAIN; } spin_unlock(&delayed_refs->lock); spin_lock(&head->lock); - list_for_each_entry(ref, &head->ref_list, list) { + /* + * XXX: We should replace this with a proper search function in the + * future. + */ + for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { ret = 1; @@ -3351,7 +3327,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int level; int ret = 0; int (*process_func)(struct btrfs_trans_handle *, - struct btrfs_fs_info *, + struct btrfs_root *, u64, u64, u64, u64, u64, u64); @@ -3391,7 +3367,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); key.offset -= btrfs_file_extent_offset(buf, fi); - ret = process_func(trans, fs_info, bytenr, num_bytes, + ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, key.offset); if (ret) @@ -3399,7 +3375,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = fs_info->nodesize; - ret = process_func(trans, fs_info, bytenr, num_bytes, + ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, level - 1, 0); if (ret) goto fail; @@ -4843,7 +4819,6 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, u64 orig, bool wait_ordered) { - struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; @@ -4859,8 +4834,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, to_reclaim = items * EXTENT_SIZE_PER_ITEM; trans = (struct btrfs_trans_handle *)current->journal_info; - block_rsv = &fs_info->delalloc_block_rsv; - space_info = block_rsv->space_info; + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); @@ -4919,6 +4893,13 @@ skip_async: } } +struct reserve_ticket { + u64 bytes; + int error; + struct list_head list; + wait_queue_head_t wait; +}; + /** * maybe_commit_transaction - possibly commit the transaction if its ok to * @root - the root we're allocating for @@ -4930,18 +4911,29 @@ skip_async: * will return -ENOSPC. */ static int may_commit_transaction(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 bytes, int force) + struct btrfs_space_info *space_info) { + struct reserve_ticket *ticket = NULL; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; struct btrfs_trans_handle *trans; + u64 bytes; trans = (struct btrfs_trans_handle *)current->journal_info; if (trans) return -EAGAIN; - if (force) - goto commit; + spin_lock(&space_info->lock); + if (!list_empty(&space_info->priority_tickets)) + ticket = list_first_entry(&space_info->priority_tickets, + struct reserve_ticket, list); + else if (!list_empty(&space_info->tickets)) + ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + bytes = (ticket) ? ticket->bytes : 0; + spin_unlock(&space_info->lock); + + if (!bytes) + return 0; /* See if there is enough pinned space to make this reservation */ if (percpu_counter_compare(&space_info->total_bytes_pinned, @@ -4956,8 +4948,12 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, return -ENOSPC; spin_lock(&delayed_rsv->lock); + if (delayed_rsv->size > bytes) + bytes = 0; + else + bytes -= delayed_rsv->size; if (percpu_counter_compare(&space_info->total_bytes_pinned, - bytes - delayed_rsv->size) < 0) { + bytes) < 0) { spin_unlock(&delayed_rsv->lock); return -ENOSPC; } @@ -4971,13 +4967,6 @@ commit: return btrfs_commit_transaction(trans); } -struct reserve_ticket { - u64 bytes; - int error; - struct list_head list; - wait_queue_head_t wait; -}; - /* * Try to flush some data based on policy set by @state. This is only advisory * and may fail for various reasons. The caller is supposed to examine the @@ -5027,8 +5016,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, ret = 0; break; case COMMIT_TRANS: - ret = may_commit_transaction(fs_info, space_info, - num_bytes, 0); + ret = may_commit_transaction(fs_info, space_info); break; default: ret = -ENOSPC; @@ -5582,11 +5570,12 @@ again: } } -static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, +static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) { struct btrfs_space_info *space_info = block_rsv->space_info; + u64 ret; spin_lock(&block_rsv->lock); if (num_bytes == (u64)-1) @@ -5601,6 +5590,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, } spin_unlock(&block_rsv->lock); + ret = num_bytes; if (num_bytes > 0) { if (dest) { spin_lock(&dest->lock); @@ -5620,6 +5610,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, space_info_add_old_bytes(fs_info, space_info, num_bytes); } + return ret; } int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, @@ -5643,6 +5634,15 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) rsv->type = type; } +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv, + unsigned short type) +{ + btrfs_init_block_rsv(rsv, type); + rsv->space_info = __find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); +} + struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type) { @@ -5652,9 +5652,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, if (!block_rsv) return NULL; - btrfs_init_block_rsv(block_rsv, type); - block_rsv->space_info = __find_space_info(fs_info, - BTRFS_BLOCK_GROUP_METADATA); + btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); return block_rsv; } @@ -5737,6 +5735,66 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } +/** + * btrfs_inode_rsv_refill - refill the inode block rsv. + * @inode - the inode we are refilling. + * @flush - the flusing restriction. + * + * Essentially the same as btrfs_block_rsv_refill, except it uses the + * block_rsv->size as the minimum size. We'll either refill the missing amount + * or return if we already have enough space. This will also handle the resreve + * tracepoint for the reserved amount. + */ +int btrfs_inode_rsv_refill(struct btrfs_inode *inode, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_root *root = inode->root; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 num_bytes = 0; + int ret = -ENOSPC; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) + num_bytes = block_rsv->size - block_rsv->reserved; + spin_unlock(&block_rsv->lock); + + if (num_bytes == 0) + return 0; + + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 0); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), num_bytes, 1); + } + return ret; +} + +/** + * btrfs_inode_rsv_release - release any excessive reservation. + * @inode - the inode we need to release from. + * + * This is the same as btrfs_block_rsv_release, except that it handles the + * tracepoint for the reservation. + */ +void btrfs_inode_rsv_release(struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 released = 0; + + /* + * Since we statically set the block_rsv->size we just want to say we + * are releasing 0 bytes, and then we'll just get the reservation over + * the size free'd. + */ + released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0); + if (released > 0) + trace_btrfs_space_reservation(fs_info, "delalloc", + btrfs_ino(inode), released, 0); +} + void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes) @@ -5808,7 +5866,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); fs_info->global_block_rsv.space_info = space_info; - fs_info->delalloc_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; fs_info->delayed_block_rsv.space_info = space_info; @@ -5828,8 +5885,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) { block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, (u64)-1); - WARN_ON(fs_info->delalloc_block_rsv.size > 0); - WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); WARN_ON(fs_info->trans_block_rsv.size > 0); WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); @@ -5841,12 +5896,15 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { - if (!trans->block_rsv) + if (!trans->block_rsv) { + ASSERT(!trans->bytes_reserved); return; + } if (!trans->bytes_reserved) return; + ASSERT(trans->block_rsv == &fs_info->trans_block_rsv); trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 0); btrfs_block_rsv_release(fs_info, trans->block_rsv, @@ -5968,104 +6026,37 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, btrfs_block_rsv_release(fs_info, rsv, (u64)-1); } -/** - * drop_outstanding_extent - drop an outstanding extent - * @inode: the inode we're dropping the extent for - * @num_bytes: the number of bytes we're releasing. - * - * This is called when we are freeing up an outstanding extent, either called - * after an error or after an extent is written. This will return the number of - * reserved extents that need to be freed. This must be called with - * BTRFS_I(inode)->lock held. - */ -static unsigned drop_outstanding_extent(struct btrfs_inode *inode, - u64 num_bytes) -{ - unsigned drop_inode_space = 0; - unsigned dropped_extents = 0; - unsigned num_extents; - - num_extents = count_max_extents(num_bytes); - ASSERT(num_extents); - ASSERT(inode->outstanding_extents >= num_extents); - inode->outstanding_extents -= num_extents; - - if (inode->outstanding_extents == 0 && - test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &inode->runtime_flags)) - drop_inode_space = 1; - - /* - * If we have more or the same amount of outstanding extents than we have - * reserved then we need to leave the reserved extents count alone. - */ - if (inode->outstanding_extents >= inode->reserved_extents) - return drop_inode_space; - - dropped_extents = inode->reserved_extents - inode->outstanding_extents; - inode->reserved_extents -= dropped_extents; - return dropped_extents + drop_inode_space; -} - -/** - * calc_csum_metadata_size - return the amount of metadata space that must be - * reserved/freed for the given bytes. - * @inode: the inode we're manipulating - * @num_bytes: the number of bytes in question - * @reserve: 1 if we are reserving space, 0 if we are freeing space - * - * This adjusts the number of csum_bytes in the inode and then returns the - * correct amount of metadata that must either be reserved or freed. We - * calculate how many checksums we can fit into one leaf and then divide the - * number of bytes that will need to be checksumed by this value to figure out - * how many checksums will be required. If we are adding bytes then the number - * may go up and we will return the number of additional bytes that must be - * reserved. If it is going down we will return the number of bytes that must - * be freed. - * - * This must be called with BTRFS_I(inode)->lock held. - */ -static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes, - int reserve) +static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - u64 old_csums, num_csums; - - if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0) - return 0; - - old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); - if (reserve) - inode->csum_bytes += num_bytes; - else - inode->csum_bytes -= num_bytes; - num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); - - /* No change, no need to reserve more */ - if (old_csums == num_csums) - return 0; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 reserve_size = 0; + u64 csum_leaves; + unsigned outstanding_extents; - if (reserve) - return btrfs_calc_trans_metadata_size(fs_info, - num_csums - old_csums); + lockdep_assert_held(&inode->lock); + outstanding_extents = inode->outstanding_extents; + if (outstanding_extents) + reserve_size = btrfs_calc_trans_metadata_size(fs_info, + outstanding_extents + 1); + csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, + inode->csum_bytes); + reserve_size += btrfs_calc_trans_metadata_size(fs_info, + csum_leaves); - return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums); + spin_lock(&block_rsv->lock); + block_rsv->size = reserve_size; + spin_unlock(&block_rsv->lock); } int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); struct btrfs_root *root = inode->root; - struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv; - u64 to_reserve = 0; - u64 csum_bytes; unsigned nr_extents; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; bool delalloc_lock = true; - u64 to_free = 0; - unsigned dropped; - bool release_extra = false; /* If we are a free space inode we need to not flush since we will be in * the middle of a transaction commit. We also don't need the delalloc @@ -6091,19 +6082,12 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, fs_info->sectorsize); + /* Add our new extents and calculate the new rsv size. */ spin_lock(&inode->lock); nr_extents = count_max_extents(num_bytes); - inode->outstanding_extents += nr_extents; - - nr_extents = 0; - if (inode->outstanding_extents > inode->reserved_extents) - nr_extents += inode->outstanding_extents - - inode->reserved_extents; - - /* We always want to reserve a slot for updating the inode. */ - to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1); - to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); - csum_bytes = inode->csum_bytes; + btrfs_mod_outstanding_extents(inode, nr_extents); + inode->csum_bytes += num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { @@ -6113,92 +6097,26 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) goto out_fail; } - ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush); + ret = btrfs_inode_rsv_refill(inode, flush); if (unlikely(ret)) { btrfs_qgroup_free_meta(root, nr_extents * fs_info->nodesize); goto out_fail; } - spin_lock(&inode->lock); - if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &inode->runtime_flags)) { - to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1); - release_extra = true; - } - inode->reserved_extents += nr_extents; - spin_unlock(&inode->lock); - if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); - - if (to_reserve) - trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(inode), to_reserve, 1); - if (release_extra) - btrfs_block_rsv_release(fs_info, block_rsv, - btrfs_calc_trans_metadata_size(fs_info, 1)); return 0; out_fail: spin_lock(&inode->lock); - dropped = drop_outstanding_extent(inode, num_bytes); - /* - * If the inodes csum_bytes is the same as the original - * csum_bytes then we know we haven't raced with any free()ers - * so we can just reduce our inodes csum bytes and carry on. - */ - if (inode->csum_bytes == csum_bytes) { - calc_csum_metadata_size(inode, num_bytes, 0); - } else { - u64 orig_csum_bytes = inode->csum_bytes; - u64 bytes; - - /* - * This is tricky, but first we need to figure out how much we - * freed from any free-ers that occurred during this - * reservation, so we reset ->csum_bytes to the csum_bytes - * before we dropped our lock, and then call the free for the - * number of bytes that were freed while we were trying our - * reservation. - */ - bytes = csum_bytes - inode->csum_bytes; - inode->csum_bytes = csum_bytes; - to_free = calc_csum_metadata_size(inode, bytes, 0); - - - /* - * Now we need to see how much we would have freed had we not - * been making this reservation and our ->csum_bytes were not - * artificially inflated. - */ - inode->csum_bytes = csum_bytes - num_bytes; - bytes = csum_bytes - orig_csum_bytes; - bytes = calc_csum_metadata_size(inode, bytes, 0); - - /* - * Now reset ->csum_bytes to what it should be. If bytes is - * more than to_free then we would have freed more space had we - * not had an artificially high ->csum_bytes, so we need to free - * the remainder. If bytes is the same or less then we don't - * need to do anything, the other free-ers did the correct - * thing. - */ - inode->csum_bytes = orig_csum_bytes - num_bytes; - if (bytes > to_free) - to_free = bytes - to_free; - else - to_free = 0; - } + nr_extents = count_max_extents(num_bytes); + btrfs_mod_outstanding_extents(inode, -nr_extents); + inode->csum_bytes -= num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); - if (dropped) - to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); - if (to_free) { - btrfs_block_rsv_release(fs_info, block_rsv, to_free); - trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(inode), to_free, 0); - } + btrfs_inode_rsv_release(inode); if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); return ret; @@ -6206,36 +6124,55 @@ out_fail: /** * btrfs_delalloc_release_metadata - release a metadata reservation for an inode - * @inode: the inode to release the reservation for - * @num_bytes: the number of bytes we're releasing + * @inode: the inode to release the reservation for. + * @num_bytes: the number of bytes we are releasing. * * This will release the metadata reservation for an inode. This can be called * once we complete IO for a given set of bytes to release their metadata - * reservations. + * reservations, or on error for the same reason. */ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - u64 to_free = 0; - unsigned dropped; num_bytes = ALIGN(num_bytes, fs_info->sectorsize); spin_lock(&inode->lock); - dropped = drop_outstanding_extent(inode, num_bytes); - - if (num_bytes) - to_free = calc_csum_metadata_size(inode, num_bytes, 0); + inode->csum_bytes -= num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); - if (dropped > 0) - to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); if (btrfs_is_testing(fs_info)) return; - trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), - to_free, 0); + btrfs_inode_rsv_release(inode); +} + +/** + * btrfs_delalloc_release_extents - release our outstanding_extents + * @inode: the inode to balance the reservation for. + * @num_bytes: the number of bytes we originally reserved with + * + * When we reserve space we increase outstanding_extents for the extents we may + * add. Once we've set the range as delalloc or created our ordered extents we + * have outstanding_extents to track the real usage, so we use this to free our + * temporarily tracked outstanding_extents. This _must_ be used in conjunction + * with btrfs_delalloc_reserve_metadata. + */ +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + unsigned num_extents; + + spin_lock(&inode->lock); + num_extents = count_max_extents(num_bytes); + btrfs_mod_outstanding_extents(inode, -num_extents); + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + if (btrfs_is_testing(fs_info)) + return; - btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free); + btrfs_inode_rsv_release(inode); } /** @@ -6282,10 +6219,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, * @inode: inode we're releasing space for * @start: start position of the space already reserved * @len: the len of the space already reserved - * - * This must be matched with a call to btrfs_delalloc_reserve_space. This is - * called in the case that we don't need the metadata AND data reservations - * anymore. So if there is an error or we insert an inline extent. + * @release_bytes: the len of the space we consumed or didn't use * * This function will release the metadata space that was not used and will * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes @@ -6293,7 +6227,8 @@ int btrfs_delalloc_reserve_space(struct inode *inode, * Also it will handle the qgroup reserved space. */ void btrfs_delalloc_release_space(struct inode *inode, - struct extent_changeset *reserved, u64 start, u64 len) + struct extent_changeset *reserved, + u64 start, u64 len) { btrfs_delalloc_release_metadata(BTRFS_I(inode), len); btrfs_free_reserved_data_space(inode, reserved, start, len); @@ -6958,7 +6893,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(!is_data && refs_to_drop != 1); if (is_data) - skinny_metadata = 0; + skinny_metadata = false; ret = lookup_extent_backref(trans, info, path, &iref, bytenr, num_bytes, parent, @@ -7213,7 +7148,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, goto out_delayed_unlock; spin_lock(&head->lock); - if (!list_empty(&head->ref_list)) + if (!RB_EMPTY_ROOT(&head->ref_tree)) goto out; if (head->extent_op) { @@ -7234,9 +7169,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, * at this point we have a head with no other entries. Go * ahead and process it. */ - head->node.in_tree = 0; rb_erase(&head->href_node, &delayed_refs->href_root); - + RB_CLEAR_NODE(&head->href_node); atomic_dec(&delayed_refs->num_entries); /* @@ -7255,7 +7189,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, ret = 1; mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return ret; out: spin_unlock(&head->lock); @@ -7277,6 +7211,10 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { int old_ref_mod, new_ref_mod; + btrfs_ref_tree_mod(root, buf->start, buf->len, parent, + root->root_key.objectid, + btrfs_header_level(buf), 0, + BTRFS_DROP_DELAYED_REF); ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start, buf->len, parent, root->root_key.objectid, @@ -7329,16 +7267,21 @@ out: /* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { + struct btrfs_fs_info *fs_info = root->fs_info; int old_ref_mod, new_ref_mod; int ret; if (btrfs_is_testing(fs_info)) return 0; + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) + btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, + root_objectid, owner, offset, + BTRFS_DROP_DELAYED_REF); /* * tree log blocks never actually go into the extent allocation @@ -8306,17 +8249,22 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, } int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - u64 root_objectid, u64 owner, + struct btrfs_root *root, u64 owner, u64 offset, u64 ram_bytes, struct btrfs_key *ins) { - struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_fs_info *fs_info = root->fs_info; int ret; - BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); + BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + + btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0, + root->root_key.objectid, owner, offset, + BTRFS_ADD_DELAYED_EXTENT); ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid, - ins->offset, 0, root_objectid, owner, + ins->offset, 0, + root->root_key.objectid, owner, offset, ram_bytes, BTRFS_ADD_DELAYED_EXTENT, NULL, NULL); return ret; @@ -8538,6 +8486,9 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->is_data = false; extent_op->level = level; + btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent, + root_objectid, level, 0, + BTRFS_ADD_DELAYED_EXTENT); ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid, ins.offset, parent, root_objectid, level, @@ -8894,7 +8845,7 @@ skip: ret); } } - ret = btrfs_free_extent(trans, fs_info, bytenr, blocksize, + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, root->root_key.objectid, level - 1, 0); if (ret) @@ -9311,7 +9262,7 @@ out: * don't have it in the radix (like when we recover after a power fail * or unmount) so we don't leak memory. */ - if (!for_reloc && root_dropped == false) + if (!for_reloc && !root_dropped) btrfs_add_dead_root(root); if (err && err != -EAGAIN) btrfs_handle_fs_error(fs_info, err, NULL); @@ -9968,9 +9919,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) return 0; } -static void __link_block_group(struct btrfs_space_info *space_info, - struct btrfs_block_group_cache *cache) +static void link_block_group(struct btrfs_block_group_cache *cache) { + struct btrfs_space_info *space_info = cache->space_info; int index = get_block_group_index(cache); bool first = false; @@ -10178,7 +10129,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) cache->space_info = space_info; - __link_block_group(space_info, cache); + link_block_group(cache); set_avail_alloc_bits(info, cache->flags); if (btrfs_chunk_readonly(info, cache->key.objectid)) { @@ -10337,7 +10288,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->bytes_super, &cache->space_info); update_global_block_rsv(fs_info); - __link_block_group(cache->space_info, cache); + link_block_group(cache); list_add_tail(&cache->bg_list, &trans->new_bgs); @@ -10387,6 +10338,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * remove it. */ free_excluded_extents(fs_info, block_group); + btrfs_free_ref_tree_range(fs_info, block_group->key.objectid, + block_group->key.offset); memcpy(&key, &block_group->key, sizeof(key)); index = get_block_group_index(block_group); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 970190cd347e..adbbc017191c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/bitops.h> #include <linux/slab.h> #include <linux/bio.h> @@ -109,7 +110,6 @@ struct extent_page_data { struct bio *bio; struct extent_io_tree *tree; get_extent_t *get_extent; - unsigned long bio_flags; /* tells writepage not to lock the state bits for this range * it still does the unlocking @@ -2761,8 +2761,8 @@ static int merge_bio(struct extent_io_tree *tree, struct page *page, */ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, struct writeback_control *wbc, - struct page *page, sector_t sector, - size_t size, unsigned long offset, + struct page *page, u64 offset, + size_t size, unsigned long pg_offset, struct block_device *bdev, struct bio **bio_ret, bio_end_io_t end_io_func, @@ -2776,6 +2776,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, int contig = 0; int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; size_t page_size = min_t(size_t, size, PAGE_SIZE); + sector_t sector = offset >> 9; if (bio_ret && *bio_ret) { bio = *bio_ret; @@ -2786,8 +2787,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, if (prev_bio_flags != bio_flags || !contig || force_bio_submit || - merge_bio(tree, page, offset, page_size, bio, bio_flags) || - bio_add_page(bio, page, page_size, offset) < page_size) { + merge_bio(tree, page, pg_offset, page_size, bio, bio_flags) || + bio_add_page(bio, page, page_size, pg_offset) < page_size) { ret = submit_one_bio(bio, mirror_num, prev_bio_flags); if (ret < 0) { *bio_ret = NULL; @@ -2801,8 +2802,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, } } - bio = btrfs_bio_alloc(bdev, (u64)sector << 9); - bio_add_page(bio, page, page_size, offset); + bio = btrfs_bio_alloc(bdev, offset); + bio_add_page(bio, page, page_size, pg_offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; bio->bi_write_hint = page->mapping->host->i_write_hint; @@ -2892,7 +2893,6 @@ static int __do_readpage(struct extent_io_tree *tree, u64 last_byte = i_size_read(inode); u64 block_start; u64 cur_end; - sector_t sector; struct extent_map *em; struct block_device *bdev; int ret = 0; @@ -2928,6 +2928,7 @@ static int __do_readpage(struct extent_io_tree *tree, } while (cur <= end) { bool force_bio_submit = false; + u64 offset; if (cur >= last_byte) { char *userpage; @@ -2967,9 +2968,9 @@ static int __do_readpage(struct extent_io_tree *tree, iosize = ALIGN(iosize, blocksize); if (this_bio_flag & EXTENT_BIO_COMPRESSED) { disk_io_size = em->block_len; - sector = em->block_start >> 9; + offset = em->block_start; } else { - sector = (em->block_start + extent_offset) >> 9; + offset = em->block_start + extent_offset; disk_io_size = iosize; } bdev = em->bdev; @@ -3062,8 +3063,8 @@ static int __do_readpage(struct extent_io_tree *tree, } ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL, - page, sector, disk_io_size, pg_offset, - bdev, bio, + page, offset, disk_io_size, + pg_offset, bdev, bio, end_bio_extent_readpage, mirror_num, *bio_flags, this_bio_flag, @@ -3324,7 +3325,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, u64 extent_offset; u64 block_start; u64 iosize; - sector_t sector; struct extent_map *em; struct block_device *bdev; size_t pg_offset = 0; @@ -3367,6 +3367,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, while (cur <= end) { u64 em_end; + u64 offset; if (cur >= i_size) { if (tree->ops && tree->ops->writepage_end_io_hook) @@ -3388,7 +3389,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, BUG_ON(end < cur); iosize = min(em_end - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); - sector = (em->block_start + extent_offset) >> 9; + offset = em->block_start + extent_offset; bdev = em->bdev; block_start = em->block_start; compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); @@ -3431,7 +3432,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, } ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, - page, sector, iosize, pg_offset, + page, offset, iosize, pg_offset, bdev, &epd->bio, end_bio_extent_writepage, 0, 0, 0, false); @@ -3715,7 +3716,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, u64 offset = eb->start; u32 nritems; unsigned long i, num_pages; - unsigned long bio_flags = 0; unsigned long start, end; unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; int ret = 0; @@ -3723,8 +3723,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); atomic_set(&eb->io_pages, num_pages); - if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) - bio_flags = EXTENT_BIO_TREE_LOG; /* set btree blocks beyond nritems with 0 to avoid stale content. */ nritems = btrfs_header_nritems(eb); @@ -3748,11 +3746,10 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, - p, offset >> 9, PAGE_SIZE, 0, bdev, + p, offset, PAGE_SIZE, 0, bdev, &epd->bio, end_bio_extent_buffer_writepage, - 0, epd->bio_flags, bio_flags, false); - epd->bio_flags = bio_flags; + 0, 0, 0, false); if (ret) { set_btree_ioerr(p); if (PageWriteback(p)) @@ -3789,7 +3786,6 @@ int btree_write_cache_pages(struct address_space *mapping, .tree = tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, - .bio_flags = 0, }; int ret = 0; int done = 0; @@ -4062,7 +4058,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd) if (epd->bio) { int ret; - ret = submit_one_bio(epd->bio, 0, epd->bio_flags); + ret = submit_one_bio(epd->bio, 0, 0); BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; } @@ -4085,7 +4081,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, - .bio_flags = 0, }; ret = __extent_writepage(page, wbc, &epd); @@ -4110,7 +4105,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, .get_extent = get_extent, .extent_locked = 1, .sync_io = mode == WB_SYNC_ALL, - .bio_flags = 0, }; struct writeback_control wbc_writepages = { .sync_mode = mode, @@ -4150,7 +4144,6 @@ int extent_writepages(struct extent_io_tree *tree, .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, - .bio_flags = 0, }; ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index faffa28ba707..4a8861379d3e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __EXTENTIO__ #define __EXTENTIO__ @@ -33,7 +34,6 @@ * type for this bio */ #define EXTENT_BIO_COMPRESSED 1 -#define EXTENT_BIO_TREE_LOG 2 #define EXTENT_BIO_FLAG_SHIFT 16 /* these are bit numbers for test/set bit */ diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 69850155870c..2e348fb0b280 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/err.h> #include <linux/slab.h> #include <linux/spinlock.h> diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index a67b2def5413..64365bbc9b16 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __EXTENTMAP__ #define __EXTENTMAP__ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index aafcc785f840..f80254d82f40 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -856,7 +856,7 @@ next_slot: btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) { - ret = btrfs_inc_extent_ref(trans, fs_info, + ret = btrfs_inc_extent_ref(trans, root, disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, @@ -940,7 +940,7 @@ delete_extent_item: extent_end = ALIGN(extent_end, fs_info->sectorsize); } else if (update_refs && disk_bytenr > 0) { - ret = btrfs_free_extent(trans, fs_info, + ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, 0, root->root_key.objectid, key.objectid, key.offset - @@ -1234,7 +1234,7 @@ again: extent_end - split); btrfs_mark_buffer_dirty(leaf); - ret = btrfs_inc_extent_ref(trans, fs_info, bytenr, num_bytes, + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset); if (ret) { @@ -1268,7 +1268,7 @@ again: extent_end = other_end; del_slot = path->slots[0] + 1; del_nr++; - ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset); if (ret) { @@ -1288,7 +1288,7 @@ again: key.offset = other_start; del_slot = path->slots[0]; del_nr++; - ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset); if (ret) { @@ -1590,7 +1590,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, int ret = 0; bool only_release_metadata = false; bool force_page_uptodate = false; - bool need_unlock; nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), PAGE_SIZE / (sizeof(struct page *))); @@ -1613,6 +1612,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, size_t copied; size_t dirty_sectors; size_t num_sectors; + int extents_locked; WARN_ON(num_pages > nrptrs); @@ -1656,6 +1656,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } } + WARN_ON(reserve_bytes == 0); ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), reserve_bytes); if (ret) { @@ -1669,7 +1670,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } release_bytes = reserve_bytes; - need_unlock = false; again: /* * This is going to setup the pages array with the number of @@ -1679,19 +1679,23 @@ again: ret = prepare_pages(inode, pages, num_pages, pos, write_bytes, force_page_uptodate); - if (ret) + if (ret) { + btrfs_delalloc_release_extents(BTRFS_I(inode), + reserve_bytes); break; + } - ret = lock_and_cleanup_extent_if_need(BTRFS_I(inode), pages, + extents_locked = lock_and_cleanup_extent_if_need( + BTRFS_I(inode), pages, num_pages, pos, write_bytes, &lockstart, &lockend, &cached_state); - if (ret < 0) { - if (ret == -EAGAIN) + if (extents_locked < 0) { + if (extents_locked == -EAGAIN) goto again; + btrfs_delalloc_release_extents(BTRFS_I(inode), + reserve_bytes); + ret = extents_locked; break; - } else if (ret > 0) { - need_unlock = true; - ret = 0; } copied = btrfs_copy_from_user(pos, write_bytes, pages, i); @@ -1718,23 +1722,10 @@ again: PAGE_SIZE); } - /* - * If we had a short copy we need to release the excess delaloc - * bytes we reserved. We need to increment outstanding_extents - * because btrfs_delalloc_release_space and - * btrfs_delalloc_release_metadata will decrement it, but - * we still have an outstanding extent for the chunk we actually - * managed to copy. - */ if (num_sectors > dirty_sectors) { /* release everything except the sectors we dirtied */ release_bytes -= dirty_sectors << fs_info->sb->s_blocksize_bits; - if (copied > 0) { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); - } if (only_release_metadata) { btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes); @@ -1756,10 +1747,11 @@ again: if (copied > 0) ret = btrfs_dirty_pages(inode, pages, dirty_pages, pos, copied, NULL); - if (need_unlock) + if (extents_locked) unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { btrfs_drop_pages(pages, num_pages); break; @@ -2046,7 +2038,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; - bool full_sync = 0; + bool full_sync = false; u64 len; /* diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 684f12247db7..fe5e0324dca9 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1286,12 +1286,8 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) { - u64 start, end; int ret; - start = block_group->key.objectid; - end = block_group->key.objectid + block_group->key.offset; - block_group->needs_free_space = 0; ret = add_new_free_space_info(trans, fs_info, block_group, path); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d02019747d00..022b19336fee 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -500,11 +500,12 @@ again: ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); goto out_put; } ret = btrfs_write_out_ino_cache(root, trans, path, inode); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); out_put: iput(inode); out_release: diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h index c8e864b2d530..6734ec92a1e9 100644 --- a/fs/btrfs/inode-map.h +++ b/fs/btrfs/inode-map.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __BTRFS_INODE_MAP #define __BTRFS_INODE_MAP diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d94e3f68b9b1..b93fe05a39c7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -42,6 +42,7 @@ #include <linux/blkdev.h> #include <linux/posix_acl_xattr.h> #include <linux/uio.h> +#include <linux/magic.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -67,7 +68,6 @@ struct btrfs_iget_args { }; struct btrfs_dio_data { - u64 outstanding_extents; u64 reserve; u64 unsubmitted_oe_range_start; u64 unsubmitted_oe_range_end; @@ -316,7 +316,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, btrfs_free_path(path); return PTR_ERR(trans); } - trans->block_rsv = &fs_info->delalloc_block_rsv; + trans->block_rsv = &BTRFS_I(inode)->block_rsv; if (compressed_size && compressed_pages) extent_item_size = btrfs_file_extent_calc_inline_size( @@ -348,7 +348,6 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, } set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - btrfs_delalloc_release_metadata(BTRFS_I(inode), end + 1 - start); btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0); out: /* @@ -458,7 +457,6 @@ static noinline void compress_file_range(struct inode *inode, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; - u64 num_bytes; u64 blocksize = fs_info->sectorsize; u64 actual_end; u64 isize = i_size_read(inode); @@ -508,8 +506,6 @@ again: total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); - num_bytes = ALIGN(end - start + 1, blocksize); - num_bytes = max(blocksize, num_bytes); total_in = 0; ret = 0; @@ -542,7 +538,10 @@ again: */ extent_range_clear_dirty_for_io(inode, start, end); redirty = 1; - ret = btrfs_compress_pages(compress_type, + + /* Compression level is applied here and only here */ + ret = btrfs_compress_pages( + compress_type | (fs_info->compress_level << 4), inode->i_mapping, start, pages, &nr_pages, @@ -570,7 +569,7 @@ again: cont: if (start == 0) { /* lets try to make an inline extent */ - if (ret || total_in < (actual_end - start)) { + if (ret || total_in < actual_end) { /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ @@ -584,16 +583,21 @@ cont: } if (ret <= 0) { unsigned long clear_flags = EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | EXTENT_DEFRAG; + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | + EXTENT_DO_ACCOUNTING; unsigned long page_error_op; - clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; page_error_op = ret < 0 ? PAGE_SET_ERROR : 0; /* * inline extent creation worked or returned error, * we don't need to create any more async work items. * Unlock and free up our temp pages. + * + * We use DO_ACCOUNTING here because we need the + * delalloc_release_metadata to be done _after_ we drop + * our outstanding extent for clearing delalloc for this + * range. */ extent_clear_unlock_delalloc(inode, start, end, end, NULL, clear_flags, @@ -602,10 +606,6 @@ cont: PAGE_SET_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); - if (ret == 0) - btrfs_free_reserved_data_space_noquota(inode, - start, - end - start + 1); goto free_pages_out; } } @@ -625,7 +625,6 @@ cont: */ total_in = ALIGN(total_in, PAGE_SIZE); if (total_compressed + blocksize <= total_in) { - num_bytes = total_in; *num_added += 1; /* @@ -633,12 +632,12 @@ cont: * allocation on disk for these compressed pages, and * will submit them to the elevator. */ - add_async_extent(async_cow, start, num_bytes, + add_async_extent(async_cow, start, total_in, total_compressed, pages, nr_pages, compress_type); - if (start + num_bytes < end) { - start += num_bytes; + if (start + total_in < end) { + start += total_in; pages = NULL; cond_resched(); goto again; @@ -982,15 +981,19 @@ static noinline int cow_file_range(struct inode *inode, ret = cow_file_range_inline(root, inode, start, end, 0, BTRFS_COMPRESS_NONE, NULL); if (ret == 0) { + /* + * We use DO_ACCOUNTING here because we need the + * delalloc_release_metadata to be run _after_ we drop + * our outstanding extent for clearing delalloc for this + * range. + */ extent_clear_unlock_delalloc(inode, start, end, delalloc_end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | - EXTENT_DEFRAG, PAGE_UNLOCK | + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | + EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); - btrfs_free_reserved_data_space_noquota(inode, start, - end - start + 1); *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; @@ -1226,13 +1229,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work); - while (atomic_read(&fs_info->async_submit_draining) && - atomic_read(&fs_info->async_delalloc_pages)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->async_delalloc_pages) == - 0)); - } - *nr_written += nr_pages; start = cur_end + 1; } @@ -1635,7 +1631,7 @@ static void btrfs_split_extent_hook(void *private_data, } spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; + btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); spin_unlock(&BTRFS_I(inode)->lock); } @@ -1665,7 +1661,7 @@ static void btrfs_merge_extent_hook(void *private_data, /* we're not bigger than the max, unreserve the space and go */ if (new_size <= BTRFS_MAX_EXTENT_SIZE) { spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents--; + btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); spin_unlock(&BTRFS_I(inode)->lock); return; } @@ -1696,7 +1692,7 @@ static void btrfs_merge_extent_hook(void *private_data, return; spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents--; + btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); spin_unlock(&BTRFS_I(inode)->lock); } @@ -1766,15 +1762,12 @@ static void btrfs_set_bit_hook(void *private_data, if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; + u32 num_extents = count_max_extents(len); bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); - if (*bits & EXTENT_FIRST_DELALLOC) { - *bits &= ~EXTENT_FIRST_DELALLOC; - } else { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); - } + spin_lock(&BTRFS_I(inode)->lock); + btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents); + spin_unlock(&BTRFS_I(inode)->lock); /* For sanity tests */ if (btrfs_is_testing(fs_info)) @@ -1828,13 +1821,9 @@ static void btrfs_clear_bit_hook(void *private_data, struct btrfs_root *root = inode->root; bool do_list = !btrfs_is_free_space_inode(inode); - if (*bits & EXTENT_FIRST_DELALLOC) { - *bits &= ~EXTENT_FIRST_DELALLOC; - } else if (!(*bits & EXTENT_CLEAR_META_RESV)) { - spin_lock(&inode->lock); - inode->outstanding_extents -= num_extents; - spin_unlock(&inode->lock); - } + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, -num_extents); + spin_unlock(&inode->lock); /* * We don't reserve metadata space for space cache inodes so we @@ -2105,6 +2094,7 @@ again: 0); ClearPageChecked(page); set_page_dirty(page); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -2229,8 +2219,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, if (ret < 0) goto out; qg_released = ret; - ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), file_pos, qg_released, &ins); + ret = btrfs_alloc_reserved_file_extent(trans, root, + btrfs_ino(BTRFS_I(inode)), + file_pos, qg_released, &ins); out: btrfs_free_path(path); @@ -2464,7 +2455,7 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path, ret = iterate_inodes_from_logical(old->bytenr + old->extent_offset, fs_info, path, record_one_backref, - old); + old, false); if (ret < 0 && ret != -ENOENT) return false; @@ -2682,7 +2673,7 @@ again: inode_add_bytes(inode, len); btrfs_release_path(path); - ret = btrfs_inc_extent_ref(trans, fs_info, new->bytenr, + ret = btrfs_inc_extent_ref(trans, root, new->bytenr, new->disk_len, 0, backref->root_id, backref->inum, new->file_pos); /* start - extent_offset */ @@ -2964,7 +2955,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans = NULL; goto out; } - trans->block_rsv = &fs_info->delalloc_block_rsv; + trans->block_rsv = &BTRFS_I(inode)->block_rsv; ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); @@ -3000,7 +2991,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - trans->block_rsv = &fs_info->delalloc_block_rsv; + trans->block_rsv = &BTRFS_I(inode)->block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; @@ -3058,9 +3049,6 @@ out: 0, &cached_state, GFP_NOFS); } - if (root != fs_info->tree_root) - btrfs_delalloc_release_metadata(BTRFS_I(inode), - ordered_extent->len); if (trans) btrfs_end_transaction(trans); @@ -4372,47 +4360,11 @@ static int truncate_space_check(struct btrfs_trans_handle *trans, } -static int truncate_inline_extent(struct inode *inode, - struct btrfs_path *path, - struct btrfs_key *found_key, - const u64 item_end, - const u64 new_size) -{ - struct extent_buffer *leaf = path->nodes[0]; - int slot = path->slots[0]; - struct btrfs_file_extent_item *fi; - u32 size = (u32)(new_size - found_key->offset); - struct btrfs_root *root = BTRFS_I(inode)->root; - - fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - - if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) { - loff_t offset = new_size; - loff_t page_end = ALIGN(offset, PAGE_SIZE); - - /* - * Zero out the remaining of the last page of our inline extent, - * instead of directly truncating our inline extent here - that - * would be much more complex (decompressing all the data, then - * compressing the truncated data, which might be bigger than - * the size of the inline extent, resize the extent, etc). - * We release the path because to get the page we might need to - * read the extent item from disk (data not in the page cache). - */ - btrfs_release_path(path); - return btrfs_truncate_block(inode, offset, page_end - offset, - 0); - } - - btrfs_set_file_extent_ram_bytes(leaf, fi, size); - size = btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(root->fs_info, path, size, 1); - - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - inode_sub_bytes(inode, item_end + 1 - new_size); - - return 0; -} +/* + * Return this if we need to call truncate_block for the last bit of the + * truncate. + */ +#define NEED_TRUNCATE_BLOCK 1 /* * this can truncate away extent items, csum items and directory items. @@ -4451,9 +4403,9 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int err = 0; u64 ino = btrfs_ino(BTRFS_I(inode)); u64 bytes_deleted = 0; - bool be_nice = 0; - bool should_throttle = 0; - bool should_end = 0; + bool be_nice = false; + bool should_throttle = false; + bool should_end = false; BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); @@ -4463,7 +4415,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, */ if (!btrfs_is_free_space_inode(BTRFS_I(inode)) && test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - be_nice = 1; + be_nice = true; path = btrfs_alloc_path(); if (!path) @@ -4573,11 +4525,6 @@ search_again: if (found_type != BTRFS_EXTENT_DATA_KEY) goto delete; - if (del_item) - last_size = found_key.offset; - else - last_size = new_size; - if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); @@ -4619,40 +4566,30 @@ search_again: */ if (!del_item && btrfs_file_extent_encryption(leaf, fi) == 0 && - btrfs_file_extent_other_encoding(leaf, fi) == 0) { - + btrfs_file_extent_other_encoding(leaf, fi) == 0 && + btrfs_file_extent_compression(leaf, fi) == 0) { + u32 size = (u32)(new_size - found_key.offset); + + btrfs_set_file_extent_ram_bytes(leaf, fi, size); + size = btrfs_file_extent_calc_inline_size(size); + btrfs_truncate_item(root->fs_info, path, size, 1); + } else if (!del_item) { /* - * Need to release path in order to truncate a - * compressed extent. So delete any accumulated - * extent items so far. + * We have to bail so the last_size is set to + * just before this extent. */ - if (btrfs_file_extent_compression(leaf, fi) != - BTRFS_COMPRESS_NONE && pending_del_nr) { - err = btrfs_del_items(trans, root, path, - pending_del_slot, - pending_del_nr); - if (err) { - btrfs_abort_transaction(trans, - err); - goto error; - } - pending_del_nr = 0; - } + err = NEED_TRUNCATE_BLOCK; + break; + } - err = truncate_inline_extent(inode, path, - &found_key, - item_end, - new_size); - if (err) { - btrfs_abort_transaction(trans, err); - goto error; - } - } else if (test_bit(BTRFS_ROOT_REF_COWS, - &root->state)) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) inode_sub_bytes(inode, item_end + 1 - new_size); - } } delete: + if (del_item) + last_size = found_key.offset; + else + last_size = new_size; if (del_item) { if (!pending_del_nr) { /* no pending yet, add ourselves */ @@ -4669,14 +4606,14 @@ delete: } else { break; } - should_throttle = 0; + should_throttle = false; if (found_extent && (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || root == fs_info->tree_root)) { btrfs_set_path_blocking(path); bytes_deleted += extent_num_bytes; - ret = btrfs_free_extent(trans, fs_info, extent_start, + ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), ino, extent_offset); @@ -4688,11 +4625,11 @@ delete: if (be_nice) { if (truncate_space_check(trans, root, extent_num_bytes)) { - should_end = 1; + should_end = true; } if (btrfs_should_throttle_delayed_refs(trans, fs_info)) - should_throttle = 1; + should_throttle = true; } } @@ -4801,8 +4738,11 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, (!len || ((len & (blocksize - 1)) == 0))) goto out; + block_start = round_down(from, blocksize); + block_end = block_start + blocksize - 1; + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, - round_down(from, blocksize), blocksize); + block_start, blocksize); if (ret) goto out; @@ -4810,15 +4750,12 @@ again: page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, data_reserved, - round_down(from, blocksize), - blocksize); + block_start, blocksize); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); ret = -ENOMEM; goto out; } - block_start = round_down(from, blocksize); - block_end = block_start + blocksize - 1; - if (!PageUptodate(page)) { ret = btrfs_readpage(NULL, page); lock_page(page); @@ -4883,6 +4820,7 @@ out_unlock: if (ret) btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); unlock_page(page); put_page(page); out: @@ -7797,33 +7735,6 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, return em; } -static void adjust_dio_outstanding_extents(struct inode *inode, - struct btrfs_dio_data *dio_data, - const u64 len) -{ - unsigned num_extents = count_max_extents(len); - - /* - * If we have an outstanding_extents count still set then we're - * within our reservation, otherwise we need to adjust our inode - * counter appropriately. - */ - if (dio_data->outstanding_extents >= num_extents) { - dio_data->outstanding_extents -= num_extents; - } else { - /* - * If dio write length has been split due to no large enough - * contiguous space, we need to compensate our inode counter - * appropriately. - */ - u64 num_needed = num_extents - dio_data->outstanding_extents; - - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents += num_needed; - spin_unlock(&BTRFS_I(inode)->lock); - } -} - static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -7985,7 +7896,6 @@ unlock: if (!dio_data->overwrite && start + len > i_size_read(inode)) i_size_write(inode, start + len); - adjust_dio_outstanding_extents(inode, dio_data, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; dio_data->unsubmitted_oe_range_end = start + len; @@ -8015,14 +7925,6 @@ unlock_err: err: if (dio_data) current->journal_info = dio_data; - /* - * Compensate the delalloc release we do in btrfs_direct_IO() when we - * write less data then expected, so that we don't underflow our inode's - * outstanding extents counter. - */ - if (create && dio_data) - adjust_dio_outstanding_extents(inode, dio_data, len); - return ret; } @@ -8495,7 +8397,7 @@ static void btrfs_end_dio_bio(struct bio *bio) if (dip->errors) { bio_io_error(dip->orig_bio); } else { - dip->dio_bio->bi_status = 0; + dip->dio_bio->bi_status = BLK_STS_OK; bio_endio(dip->orig_bio); } out: @@ -8577,7 +8479,7 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, goto err; } map: - ret = btrfs_map_bio(fs_info, bio, 0, async_submit); + ret = btrfs_map_bio(fs_info, bio, 0, 0); err: bio_put(bio); return ret; @@ -8786,7 +8688,6 @@ free_ordered: } static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, - struct kiocb *iocb, const struct iov_iter *iter, loff_t offset) { int seg; @@ -8833,7 +8734,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) bool relock = false; ssize_t ret; - if (check_direct_IO(fs_info, iocb, iter, offset)) + if (check_direct_IO(fs_info, iter, offset)) return 0; inode_dio_begin(inode); @@ -8868,7 +8769,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) offset, count); if (ret) goto out; - dio_data.outstanding_extents = count_max_extents(count); /* * We need to know how many extents we reserved so that we can @@ -8915,6 +8815,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, data_reserved, offset, count - (size_t)ret); + btrfs_delalloc_release_extents(BTRFS_I(inode), count); } out: if (wakeup) @@ -9232,9 +9133,6 @@ again: fs_info->sectorsize); if (reserved_space < PAGE_SIZE) { end = page_start + reserved_space - 1; - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE - reserved_space); } @@ -9286,12 +9184,14 @@ again: out_unlock: if (!ret) { + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; } unlock_page(page); out: + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); btrfs_delalloc_release_space(inode, data_reserved, page_start, reserved_space); out_noreserve: @@ -9387,12 +9287,12 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); + trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) { err = ret; break; } - trans->block_rsv = &fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); if (ret) { err = ret; @@ -9416,6 +9316,27 @@ static int btrfs_truncate(struct inode *inode) trans->block_rsv = rsv; } + /* + * We can't call btrfs_truncate_block inside a trans handle as we could + * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know + * we've truncated everything except the last little bit, and can do + * btrfs_truncate_block and then update the disk_i_size. + */ + if (ret == NEED_TRUNCATE_BLOCK) { + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); + + ret = btrfs_truncate_block(inode, inode->i_size, 0, 0); + if (ret) + goto out; + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + } + if (ret == 0 && inode->i_nlink > 0) { trans->block_rsv = root->orphan_block_rsv; ret = btrfs_orphan_del(trans, BTRFS_I(inode)); @@ -9480,6 +9401,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct inode *btrfs_alloc_inode(struct super_block *sb) { + struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_inode *ei; struct inode *inode; @@ -9506,8 +9428,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) spin_lock_init(&ei->lock); ei->outstanding_extents = 0; - ei->reserved_extents = 0; - + if (sb->s_magic != BTRFS_TEST_MAGIC) + btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, + BTRFS_BLOCK_RSV_DELALLOC); ei->runtime_flags = 0; ei->prop_compress = BTRFS_COMPRESS_NONE; ei->defrag_compress = BTRFS_COMPRESS_NONE; @@ -9557,8 +9480,9 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(!hlist_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); + WARN_ON(BTRFS_I(inode)->block_rsv.reserved); + WARN_ON(BTRFS_I(inode)->block_rsv.size); WARN_ON(BTRFS_I(inode)->outstanding_extents); - WARN_ON(BTRFS_I(inode)->reserved_extents); WARN_ON(BTRFS_I(inode)->delalloc_bytes); WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); WARN_ON(BTRFS_I(inode)->csum_bytes); @@ -10337,19 +10261,6 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) ret = __start_delalloc_inodes(root, delay_iput, -1); if (ret > 0) ret = 0; - /* - * the filemap_flush will queue IO into the worker threads, but - * we have to make sure the IO is actually started and that - * ordered extents get created before we return - */ - atomic_inc(&fs_info->async_submit_draining); - while (atomic_read(&fs_info->nr_async_submits) || - atomic_read(&fs_info->async_delalloc_pages)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) == 0 && - atomic_read(&fs_info->async_delalloc_pages) == 0)); - } - atomic_dec(&fs_info->async_submit_draining); return ret; } @@ -10391,14 +10302,6 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, spin_unlock(&fs_info->delalloc_root_lock); ret = 0; - atomic_inc(&fs_info->async_submit_draining); - while (atomic_read(&fs_info->nr_async_submits) || - atomic_read(&fs_info->async_delalloc_pages)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) == 0 && - atomic_read(&fs_info->async_delalloc_pages) == 0)); - } - atomic_dec(&fs_info->async_submit_draining); out: if (!list_empty_careful(&splice)) { spin_lock(&fs_info->delalloc_root_lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6c7a49faf4e0..fd172a93d11a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -86,6 +86,19 @@ struct btrfs_ioctl_received_subvol_args_32 { struct btrfs_ioctl_received_subvol_args_32) #endif +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) +struct btrfs_ioctl_send_args_32 { + __s64 send_fd; /* in */ + __u64 clone_sources_count; /* in */ + compat_uptr_t clone_sources; /* in */ + __u64 parent_root; /* in */ + __u64 flags; /* in */ + __u64 reserved[4]; /* in */ +} __attribute__ ((__packed__)); + +#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ + struct btrfs_ioctl_send_args_32) +#endif static int btrfs_clone(struct inode *src, struct inode *inode, u64 off, u64 olen, u64 olen_aligned, u64 destoff, @@ -609,23 +622,6 @@ fail_free: return ret; } -static void btrfs_wait_for_no_snapshotting_writes(struct btrfs_root *root) -{ - s64 writers; - DEFINE_WAIT(wait); - - do { - prepare_to_wait(&root->subv_writers->wait, &wait, - TASK_UNINTERRUPTIBLE); - - writers = percpu_counter_sum(&root->subv_writers->counter); - if (writers) - schedule(); - - finish_wait(&root->subv_writers->wait, &wait); - } while (writers); -} - static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct dentry *dentry, u64 *async_transid, bool readonly, @@ -654,7 +650,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, atomic_inc(&root->will_be_snapshotted); smp_mb__after_atomic(); - btrfs_wait_for_no_snapshotting_writes(root); + /* wait for no snapshot writes */ + wait_event(root->subv_writers->wait, + percpu_counter_sum(&root->subv_writers->counter) == 0); ret = btrfs_start_delalloc_inodes(root, 0); if (ret) @@ -1219,6 +1217,7 @@ again: unlock_page(pages[i]); put_page(pages[i]); } + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return i_done; out: @@ -1229,6 +1228,7 @@ out: btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return ret; @@ -1420,21 +1420,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, filemap_flush(inode->i_mapping); } - if (do_compress) { - /* the filemap_flush will queue IO into the worker threads, but - * we have to make sure the IO is actually started and that - * ordered extents get created before we return - */ - atomic_inc(&fs_info->async_submit_draining); - while (atomic_read(&fs_info->nr_async_submits) || - atomic_read(&fs_info->async_delalloc_pages)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) == 0 && - atomic_read(&fs_info->async_delalloc_pages) == 0)); - } - atomic_dec(&fs_info->async_submit_draining); - } - if (range->compress_type == BTRFS_COMPRESS_LZO) { btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) { @@ -1842,8 +1827,13 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, ret = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); + if (ret < 0) { + btrfs_end_transaction(trans); + goto out_reset; + } + + ret = btrfs_commit_transaction(trans); - btrfs_commit_transaction(trans); out_reset: if (ret) btrfs_set_root_flags(&root->root_item, root_flags); @@ -2179,7 +2169,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file, inode = file_inode(file); ret = search_ioctl(inode, &args.key, &buf_size, - (char *)(&uarg->buf[0])); + (char __user *)(&uarg->buf[0])); if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) ret = -EFAULT; else if (ret == -EOVERFLOW && @@ -3706,7 +3696,7 @@ process_slot: if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, - fs_info, + root, disko, diskl, 0, root->root_key.objectid, btrfs_ino(BTRFS_I(inode)), @@ -4129,10 +4119,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_space_info *dest_orig; struct btrfs_ioctl_space_info __user *user_dest; struct btrfs_space_info *info; - u64 types[] = {BTRFS_BLOCK_GROUP_DATA, - BTRFS_BLOCK_GROUP_SYSTEM, - BTRFS_BLOCK_GROUP_METADATA, - BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; + static const u64 types[] = { + BTRFS_BLOCK_GROUP_DATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA + }; int num_types = 4; int alloc_size; int ret = 0; @@ -4504,8 +4496,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) ipath->fspath->val[i] = rel_ptr; } - ret = copy_to_user((void *)(unsigned long)ipa->fspath, - (void *)(unsigned long)ipath->fspath, size); + ret = copy_to_user((void __user *)(unsigned long)ipa->fspath, + ipath->fspath, size); if (ret) { ret = -EFAULT; goto out; @@ -4540,13 +4532,14 @@ static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) } static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, - void __user *arg) + void __user *arg, int version) { int ret = 0; int size; struct btrfs_ioctl_logical_ino_args *loi; struct btrfs_data_container *inodes = NULL; struct btrfs_path *path = NULL; + bool ignore_offset; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4555,13 +4548,30 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, if (IS_ERR(loi)) return PTR_ERR(loi); + if (version == 1) { + ignore_offset = false; + size = min_t(u32, loi->size, SZ_64K); + } else { + /* All reserved bits must be 0 for now */ + if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) { + ret = -EINVAL; + goto out_loi; + } + /* Only accept flags we have defined so far */ + if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) { + ret = -EINVAL; + goto out_loi; + } + ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET; + size = min_t(u32, loi->size, SZ_16M); + } + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } - size = min_t(u32, loi->size, SZ_64K); inodes = init_data_container(size); if (IS_ERR(inodes)) { ret = PTR_ERR(inodes); @@ -4570,20 +4580,21 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, - build_ino_list, inodes); + build_ino_list, inodes, ignore_offset); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) goto out; - ret = copy_to_user((void *)(unsigned long)loi->inodes, - (void *)(unsigned long)inodes, size); + ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes, + size); if (ret) ret = -EFAULT; out: btrfs_free_path(path); kvfree(inodes); +out_loi: kfree(loi); return ret; @@ -5160,15 +5171,11 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, root->root_key.objectid); if (ret < 0 && ret != -EEXIST) { btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); goto out; } } ret = btrfs_commit_transaction(trans); - if (ret < 0) { - btrfs_abort_transaction(trans, ret); - goto out; - } - out: up_write(&fs_info->subvol_sem); mnt_drop_write_file(file); @@ -5490,6 +5497,41 @@ out_drop_write: return ret; } +static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) +{ + struct btrfs_ioctl_send_args *arg; + int ret; + + if (compat) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_send_args_32 args32; + + ret = copy_from_user(&args32, argp, sizeof(args32)); + if (ret) + return -EFAULT; + arg = kzalloc(sizeof(*arg), GFP_KERNEL); + if (!arg) + return -ENOMEM; + arg->send_fd = args32.send_fd; + arg->clone_sources_count = args32.clone_sources_count; + arg->clone_sources = compat_ptr(args32.clone_sources); + arg->parent_root = args32.parent_root; + arg->flags = args32.flags; + memcpy(arg->reserved, args32.reserved, + sizeof(args32.reserved)); +#else + return -ENOTTY; +#endif + } else { + arg = memdup_user(argp, sizeof(*arg)); + if (IS_ERR(arg)) + return PTR_ERR(arg); + } + ret = btrfs_ioctl_send(file, arg); + kfree(arg); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -5554,7 +5596,9 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_INO_PATHS: return btrfs_ioctl_ino_to_path(root, argp); case BTRFS_IOC_LOGICAL_INO: - return btrfs_ioctl_logical_to_ino(fs_info, argp); + return btrfs_ioctl_logical_to_ino(fs_info, argp, 1); + case BTRFS_IOC_LOGICAL_INO_V2: + return btrfs_ioctl_logical_to_ino(fs_info, argp, 2); case BTRFS_IOC_SPACE_INFO: return btrfs_ioctl_space_info(fs_info, argp); case BTRFS_IOC_SYNC: { @@ -5595,7 +5639,11 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: - return btrfs_ioctl_send(file, argp); + return _btrfs_ioctl_send(file, argp, false); +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_SEND_32: + return _btrfs_ioctl_send(file, argp, true); +#endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); case BTRFS_IOC_QUOTA_CTL: diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index d433e75d489a..6c7f18cd3b61 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -430,10 +430,15 @@ out: return ret; } +static void lzo_set_level(struct list_head *ws, unsigned int type) +{ +} + const struct btrfs_compress_op btrfs_lzo_compress = { .alloc_workspace = lzo_alloc_workspace, .free_workspace = lzo_free_workspace, .compress_pages = lzo_compress_pages, .decompress_bio = lzo_decompress_bio, .decompress = lzo_decompress, + .set_level = lzo_set_level, }; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a3aca495e33e..5b311aeddcc8 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -242,6 +242,15 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, } spin_unlock(&root->ordered_extent_lock); + /* + * We don't need the count_max_extents here, we can assume that all of + * that work has been done at higher layers, so this is truly the + * smallest the extent is going to get. + */ + spin_lock(&BTRFS_I(inode)->lock); + btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); + spin_unlock(&BTRFS_I(inode)->lock); + return 0; } @@ -591,11 +600,19 @@ void btrfs_remove_ordered_extent(struct inode *inode, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_inode_tree *tree; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); + struct btrfs_root *root = btrfs_inode->root; struct rb_node *node; bool dec_pending_ordered = false; - tree = &BTRFS_I(inode)->ordered_tree; + /* This is paired with btrfs_add_ordered_extent. */ + spin_lock(&btrfs_inode->lock); + btrfs_mod_outstanding_extents(btrfs_inode, -1); + spin_unlock(&btrfs_inode->lock); + if (root != fs_info->tree_root) + btrfs_delalloc_release_metadata(btrfs_inode, entry->len); + + tree = &btrfs_inode->ordered_tree; spin_lock_irq(&tree->lock); node = &entry->rb_node; rb_erase(node, &tree->tree); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index e172d4843eae..168fd03ca3ac 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1441,7 +1441,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info, u64 bytenr = qrecord->bytenr; int ret; - ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root); + ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false); if (ret < 0) return ret; @@ -2031,7 +2031,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, /* Search commit root to find old_roots */ ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0, - &record->old_roots); + &record->old_roots, false); if (ret < 0) goto cleanup; } @@ -2042,7 +2042,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, * root. It's safe inside commit_transaction(). */ ret = btrfs_find_all_roots(trans, fs_info, - record->bytenr, SEQ_LAST, &new_roots); + record->bytenr, SEQ_LAST, &new_roots, false); if (ret < 0) goto cleanup; if (qgroup_to_skip) { @@ -2570,7 +2570,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, num_bytes = found.offset; ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, - &roots); + &roots, false); if (ret < 0) goto out; /* For rescan, just pass old_roots as NULL */ diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 24a62224b24b..a7f79254ecca 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1326,6 +1326,9 @@ write_data: cleanup: rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); } /* @@ -1582,6 +1585,10 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) cleanup: rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + return -EIO; finish: @@ -2107,6 +2114,10 @@ cleanup: if (rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + return -EIO; } @@ -2231,12 +2242,18 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, ASSERT(!bio->bi_iter.bi_size); rbio->operation = BTRFS_RBIO_PARITY_SCRUB; - for (i = 0; i < rbio->real_stripes; i++) { + /* + * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted + * to the end position, so this search can start from the first parity + * stripe. + */ + for (i = rbio->nr_data; i < rbio->real_stripes; i++) { if (bbio->stripes[i].dev == scrub_dev) { rbio->scrubp = i; break; } } + ASSERT(i < rbio->real_stripes); /* Now we just support the sectorsize equals to page size */ ASSERT(fs_info->sectorsize == PAGE_SIZE); @@ -2454,6 +2471,9 @@ submit_write: cleanup: rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); } static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) @@ -2563,12 +2583,12 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) int stripe; struct bio *bio; + bio_list_init(&bio_list); + ret = alloc_rbio_essential_pages(rbio); if (ret) goto cleanup; - bio_list_init(&bio_list); - atomic_set(&rbio->error, 0); /* * build a list of bios to read all the missing parts of this @@ -2636,6 +2656,10 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) cleanup: rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + return; finish: diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c new file mode 100644 index 000000000000..34878699d363 --- /dev/null +++ b/fs/btrfs/ref-verify.c @@ -0,0 +1,1031 @@ +/* + * Copyright (C) 2014 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/stacktrace.h> +#include "ctree.h" +#include "disk-io.h" +#include "locking.h" +#include "delayed-ref.h" +#include "ref-verify.h" + +/* + * Used to keep track the roots and number of refs each root has for a given + * bytenr. This just tracks the number of direct references, no shared + * references. + */ +struct root_entry { + u64 root_objectid; + u64 num_refs; + struct rb_node node; +}; + +/* + * These are meant to represent what should exist in the extent tree, these can + * be used to verify the extent tree is consistent as these should all match + * what the extent tree says. + */ +struct ref_entry { + u64 root_objectid; + u64 parent; + u64 owner; + u64 offset; + u64 num_refs; + struct rb_node node; +}; + +#define MAX_TRACE 16 + +/* + * Whenever we add/remove a reference we record the action. The action maps + * back to the delayed ref action. We hold the ref we are changing in the + * action so we can account for the history properly, and we record the root we + * were called with since it could be different from ref_root. We also store + * stack traces because thats how I roll. + */ +struct ref_action { + int action; + u64 root; + struct ref_entry ref; + struct list_head list; + unsigned long trace[MAX_TRACE]; + unsigned int trace_len; +}; + +/* + * One of these for every block we reference, it holds the roots and references + * to it as well as all of the ref actions that have occured to it. We never + * free it until we unmount the file system in order to make sure re-allocations + * are happening properly. + */ +struct block_entry { + u64 bytenr; + u64 len; + u64 num_refs; + int metadata; + int from_disk; + struct rb_root roots; + struct rb_root refs; + struct rb_node node; + struct list_head actions; +}; + +static struct block_entry *insert_block_entry(struct rb_root *root, + struct block_entry *be) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct block_entry *entry; + + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct block_entry, node); + if (entry->bytenr > be->bytenr) + p = &(*p)->rb_left; + else if (entry->bytenr < be->bytenr) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(&be->node, parent_node, p); + rb_insert_color(&be->node, root); + return NULL; +} + +static struct block_entry *lookup_block_entry(struct rb_root *root, u64 bytenr) +{ + struct rb_node *n; + struct block_entry *entry = NULL; + + n = root->rb_node; + while (n) { + entry = rb_entry(n, struct block_entry, node); + if (entry->bytenr < bytenr) + n = n->rb_right; + else if (entry->bytenr > bytenr) + n = n->rb_left; + else + return entry; + } + return NULL; +} + +static struct root_entry *insert_root_entry(struct rb_root *root, + struct root_entry *re) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct root_entry *entry; + + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct root_entry, node); + if (entry->root_objectid > re->root_objectid) + p = &(*p)->rb_left; + else if (entry->root_objectid < re->root_objectid) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(&re->node, parent_node, p); + rb_insert_color(&re->node, root); + return NULL; + +} + +static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2) +{ + if (ref1->root_objectid < ref2->root_objectid) + return -1; + if (ref1->root_objectid > ref2->root_objectid) + return 1; + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + if (ref1->owner < ref2->owner) + return -1; + if (ref1->owner > ref2->owner) + return 1; + if (ref1->offset < ref2->offset) + return -1; + if (ref1->offset > ref2->offset) + return 1; + return 0; +} + +static struct ref_entry *insert_ref_entry(struct rb_root *root, + struct ref_entry *ref) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct ref_entry *entry; + int cmp; + + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct ref_entry, node); + cmp = comp_refs(entry, ref); + if (cmp > 0) + p = &(*p)->rb_left; + else if (cmp < 0) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(&ref->node, parent_node, p); + rb_insert_color(&ref->node, root); + return NULL; + +} + +static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid) +{ + struct rb_node *n; + struct root_entry *entry = NULL; + + n = root->rb_node; + while (n) { + entry = rb_entry(n, struct root_entry, node); + if (entry->root_objectid < objectid) + n = n->rb_right; + else if (entry->root_objectid > objectid) + n = n->rb_left; + else + return entry; + } + return NULL; +} + +#ifdef CONFIG_STACKTRACE +static void __save_stack_trace(struct ref_action *ra) +{ + struct stack_trace stack_trace; + + stack_trace.max_entries = MAX_TRACE; + stack_trace.nr_entries = 0; + stack_trace.entries = ra->trace; + stack_trace.skip = 2; + save_stack_trace(&stack_trace); + ra->trace_len = stack_trace.nr_entries; +} + +static void __print_stack_trace(struct btrfs_fs_info *fs_info, + struct ref_action *ra) +{ + struct stack_trace trace; + + if (ra->trace_len == 0) { + btrfs_err(fs_info, " ref-verify: no stacktrace"); + return; + } + trace.nr_entries = ra->trace_len; + trace.entries = ra->trace; + print_stack_trace(&trace, 2); +} +#else +static void inline __save_stack_trace(struct ref_action *ra) +{ +} + +static void inline __print_stack_trace(struct btrfs_fs_info *fs_info, + struct ref_action *ra) +{ + btrfs_err(fs_info, " ref-verify: no stacktrace support"); +} +#endif + +static void free_block_entry(struct block_entry *be) +{ + struct root_entry *re; + struct ref_entry *ref; + struct ref_action *ra; + struct rb_node *n; + + while ((n = rb_first(&be->roots))) { + re = rb_entry(n, struct root_entry, node); + rb_erase(&re->node, &be->roots); + kfree(re); + } + + while((n = rb_first(&be->refs))) { + ref = rb_entry(n, struct ref_entry, node); + rb_erase(&ref->node, &be->refs); + kfree(ref); + } + + while (!list_empty(&be->actions)) { + ra = list_first_entry(&be->actions, struct ref_action, + list); + list_del(&ra->list); + kfree(ra); + } + kfree(be); +} + +static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info, + u64 bytenr, u64 len, + u64 root_objectid) +{ + struct block_entry *be = NULL, *exist; + struct root_entry *re = NULL; + + re = kzalloc(sizeof(struct root_entry), GFP_KERNEL); + be = kzalloc(sizeof(struct block_entry), GFP_KERNEL); + if (!be || !re) { + kfree(re); + kfree(be); + return ERR_PTR(-ENOMEM); + } + be->bytenr = bytenr; + be->len = len; + + re->root_objectid = root_objectid; + re->num_refs = 0; + + spin_lock(&fs_info->ref_verify_lock); + exist = insert_block_entry(&fs_info->block_tree, be); + if (exist) { + if (root_objectid) { + struct root_entry *exist_re; + + exist_re = insert_root_entry(&exist->roots, re); + if (exist_re) + kfree(re); + } + kfree(be); + return exist; + } + + be->num_refs = 0; + be->metadata = 0; + be->from_disk = 0; + be->roots = RB_ROOT; + be->refs = RB_ROOT; + INIT_LIST_HEAD(&be->actions); + if (root_objectid) + insert_root_entry(&be->roots, re); + else + kfree(re); + return be; +} + +static int add_tree_block(struct btrfs_fs_info *fs_info, u64 ref_root, + u64 parent, u64 bytenr, int level) +{ + struct block_entry *be; + struct root_entry *re; + struct ref_entry *ref = NULL, *exist; + + ref = kmalloc(sizeof(struct ref_entry), GFP_KERNEL); + if (!ref) + return -ENOMEM; + + if (parent) + ref->root_objectid = 0; + else + ref->root_objectid = ref_root; + ref->parent = parent; + ref->owner = level; + ref->offset = 0; + ref->num_refs = 1; + + be = add_block_entry(fs_info, bytenr, fs_info->nodesize, ref_root); + if (IS_ERR(be)) { + kfree(ref); + return PTR_ERR(be); + } + be->num_refs++; + be->from_disk = 1; + be->metadata = 1; + + if (!parent) { + ASSERT(ref_root); + re = lookup_root_entry(&be->roots, ref_root); + ASSERT(re); + re->num_refs++; + } + exist = insert_ref_entry(&be->refs, ref); + if (exist) { + exist->num_refs++; + kfree(ref); + } + spin_unlock(&fs_info->ref_verify_lock); + + return 0; +} + +static int add_shared_data_ref(struct btrfs_fs_info *fs_info, + u64 parent, u32 num_refs, u64 bytenr, + u64 num_bytes) +{ + struct block_entry *be; + struct ref_entry *ref; + + ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL); + if (!ref) + return -ENOMEM; + be = add_block_entry(fs_info, bytenr, num_bytes, 0); + if (IS_ERR(be)) { + kfree(ref); + return PTR_ERR(be); + } + be->num_refs += num_refs; + + ref->parent = parent; + ref->num_refs = num_refs; + if (insert_ref_entry(&be->refs, ref)) { + spin_unlock(&fs_info->ref_verify_lock); + btrfs_err(fs_info, "existing shared ref when reading from disk?"); + kfree(ref); + return -EINVAL; + } + spin_unlock(&fs_info->ref_verify_lock); + return 0; +} + +static int add_extent_data_ref(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, + struct btrfs_extent_data_ref *dref, + u64 bytenr, u64 num_bytes) +{ + struct block_entry *be; + struct ref_entry *ref; + struct root_entry *re; + u64 ref_root = btrfs_extent_data_ref_root(leaf, dref); + u64 owner = btrfs_extent_data_ref_objectid(leaf, dref); + u64 offset = btrfs_extent_data_ref_offset(leaf, dref); + u32 num_refs = btrfs_extent_data_ref_count(leaf, dref); + + ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL); + if (!ref) + return -ENOMEM; + be = add_block_entry(fs_info, bytenr, num_bytes, ref_root); + if (IS_ERR(be)) { + kfree(ref); + return PTR_ERR(be); + } + be->num_refs += num_refs; + + ref->parent = 0; + ref->owner = owner; + ref->root_objectid = ref_root; + ref->offset = offset; + ref->num_refs = num_refs; + if (insert_ref_entry(&be->refs, ref)) { + spin_unlock(&fs_info->ref_verify_lock); + btrfs_err(fs_info, "existing ref when reading from disk?"); + kfree(ref); + return -EINVAL; + } + + re = lookup_root_entry(&be->roots, ref_root); + if (!re) { + spin_unlock(&fs_info->ref_verify_lock); + btrfs_err(fs_info, "missing root in new block entry?"); + return -EINVAL; + } + re->num_refs += num_refs; + spin_unlock(&fs_info->ref_verify_lock); + return 0; +} + +static int process_extent_item(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, struct btrfs_key *key, + int slot, int *tree_block_level) +{ + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct extent_buffer *leaf = path->nodes[0]; + u32 item_size = btrfs_item_size_nr(leaf, slot); + unsigned long end, ptr; + u64 offset, flags, count; + int type, ret; + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + + if ((key->type == BTRFS_EXTENT_ITEM_KEY) && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + + info = (struct btrfs_tree_block_info *)(ei + 1); + *tree_block_level = btrfs_tree_block_level(leaf, info); + iref = (struct btrfs_extent_inline_ref *)(info + 1); + } else { + if (key->type == BTRFS_METADATA_ITEM_KEY) + *tree_block_level = key->offset; + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(leaf, iref); + offset = btrfs_extent_inline_ref_offset(leaf, iref); + switch (type) { + case BTRFS_TREE_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, offset, 0, key->objectid, + *tree_block_level); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, 0, offset, key->objectid, + *tree_block_level); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + ret = add_extent_data_ref(fs_info, leaf, dref, + key->objectid, key->offset); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = (struct btrfs_shared_data_ref *)(iref + 1); + count = btrfs_shared_data_ref_count(leaf, sref); + ret = add_shared_data_ref(fs_info, offset, count, + key->objectid, key->offset); + break; + default: + btrfs_err(fs_info, "invalid key type in iref"); + ret = -EINVAL; + break; + } + if (ret) + break; + ptr += btrfs_extent_inline_ref_size(type); + } + return ret; +} + +static int process_leaf(struct btrfs_root *root, + struct btrfs_path *path, u64 *bytenr, u64 *num_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + u32 count; + int i = 0, tree_block_level = 0, ret; + struct btrfs_key key; + int nritems = btrfs_header_nritems(leaf); + + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(leaf, &key, i); + switch (key.type) { + case BTRFS_EXTENT_ITEM_KEY: + *num_bytes = key.offset; + case BTRFS_METADATA_ITEM_KEY: + *bytenr = key.objectid; + ret = process_extent_item(fs_info, path, &key, i, + &tree_block_level); + break; + case BTRFS_TREE_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, key.offset, 0, + key.objectid, tree_block_level); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, 0, key.offset, + key.objectid, tree_block_level); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = btrfs_item_ptr(leaf, i, + struct btrfs_extent_data_ref); + ret = add_extent_data_ref(fs_info, leaf, dref, *bytenr, + *num_bytes); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = btrfs_item_ptr(leaf, i, + struct btrfs_shared_data_ref); + count = btrfs_shared_data_ref_count(leaf, sref); + ret = add_shared_data_ref(fs_info, key.offset, count, + *bytenr, *num_bytes); + break; + default: + break; + } + if (ret) + break; + } + return ret; +} + +/* Walk down to the leaf from the given level */ +static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, + int level, u64 *bytenr, u64 *num_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *eb; + u64 block_bytenr, gen; + int ret = 0; + + while (level >= 0) { + if (level) { + block_bytenr = btrfs_node_blockptr(path->nodes[level], + path->slots[level]); + gen = btrfs_node_ptr_generation(path->nodes[level], + path->slots[level]); + eb = read_tree_block(fs_info, block_bytenr, gen); + if (IS_ERR(eb)) + return PTR_ERR(eb); + if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + return -EIO; + } + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + path->nodes[level-1] = eb; + path->slots[level-1] = 0; + path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING; + } else { + ret = process_leaf(root, path, bytenr, num_bytes); + if (ret) + break; + } + level--; + } + return ret; +} + +/* Walk up to the next node that needs to be processed */ +static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + int l; + + for (l = 0; l < BTRFS_MAX_LEVEL; l++) { + if (!path->nodes[l]) + continue; + if (l) { + path->slots[l]++; + if (path->slots[l] < + btrfs_header_nritems(path->nodes[l])) { + *level = l; + return 0; + } + } + btrfs_tree_unlock_rw(path->nodes[l], path->locks[l]); + free_extent_buffer(path->nodes[l]); + path->nodes[l] = NULL; + path->slots[l] = 0; + path->locks[l] = 0; + } + + return 1; +} + +static void dump_ref_action(struct btrfs_fs_info *fs_info, + struct ref_action *ra) +{ + btrfs_err(fs_info, +" Ref action %d, root %llu, ref_root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu", + ra->action, ra->root, ra->ref.root_objectid, ra->ref.parent, + ra->ref.owner, ra->ref.offset, ra->ref.num_refs); + __print_stack_trace(fs_info, ra); +} + +/* + * Dumps all the information from the block entry to printk, it's going to be + * awesome. + */ +static void dump_block_entry(struct btrfs_fs_info *fs_info, + struct block_entry *be) +{ + struct ref_entry *ref; + struct root_entry *re; + struct ref_action *ra; + struct rb_node *n; + + btrfs_err(fs_info, +"dumping block entry [%llu %llu], num_refs %llu, metadata %d, from disk %d", + be->bytenr, be->len, be->num_refs, be->metadata, + be->from_disk); + + for (n = rb_first(&be->refs); n; n = rb_next(n)) { + ref = rb_entry(n, struct ref_entry, node); + btrfs_err(fs_info, +" ref root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu", + ref->root_objectid, ref->parent, ref->owner, + ref->offset, ref->num_refs); + } + + for (n = rb_first(&be->roots); n; n = rb_next(n)) { + re = rb_entry(n, struct root_entry, node); + btrfs_err(fs_info, " root entry %llu, num_refs %llu", + re->root_objectid, re->num_refs); + } + + list_for_each_entry(ra, &be->actions, list) + dump_ref_action(fs_info, ra); +} + +/* + * btrfs_ref_tree_mod: called when we modify a ref for a bytenr + * @root: the root we are making this modification from. + * @bytenr: the bytenr we are modifying. + * @num_bytes: number of bytes. + * @parent: the parent bytenr. + * @ref_root: the original root owner of the bytenr. + * @owner: level in the case of metadata, inode in the case of data. + * @offset: 0 for metadata, file offset for data. + * @action: the action that we are doing, this is the same as the delayed ref + * action. + * + * This will add an action item to the given bytenr and do sanity checks to make + * sure we haven't messed something up. If we are making a new allocation and + * this block entry has history we will delete all previous actions as long as + * our sanity checks pass as they are no longer needed. + */ +int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, u64 owner, u64 offset, + int action) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct ref_entry *ref = NULL, *exist; + struct ref_action *ra = NULL; + struct block_entry *be = NULL; + struct root_entry *re = NULL; + int ret = 0; + bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; + + if (!btrfs_test_opt(root->fs_info, REF_VERIFY)) + return 0; + + ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS); + ra = kmalloc(sizeof(struct ref_action), GFP_NOFS); + if (!ra || !ref) { + kfree(ref); + kfree(ra); + ret = -ENOMEM; + goto out; + } + + if (parent) { + ref->parent = parent; + } else { + ref->root_objectid = ref_root; + ref->owner = owner; + ref->offset = offset; + } + ref->num_refs = (action == BTRFS_DROP_DELAYED_REF) ? -1 : 1; + + memcpy(&ra->ref, ref, sizeof(struct ref_entry)); + /* + * Save the extra info from the delayed ref in the ref action to make it + * easier to figure out what is happening. The real ref's we add to the + * ref tree need to reflect what we save on disk so it matches any + * on-disk refs we pre-loaded. + */ + ra->ref.owner = owner; + ra->ref.offset = offset; + ra->ref.root_objectid = ref_root; + __save_stack_trace(ra); + + INIT_LIST_HEAD(&ra->list); + ra->action = action; + ra->root = root->objectid; + + /* + * This is an allocation, preallocate the block_entry in case we haven't + * used it before. + */ + ret = -EINVAL; + if (action == BTRFS_ADD_DELAYED_EXTENT) { + /* + * For subvol_create we'll just pass in whatever the parent root + * is and the new root objectid, so let's not treat the passed + * in root as if it really has a ref for this bytenr. + */ + be = add_block_entry(root->fs_info, bytenr, num_bytes, ref_root); + if (IS_ERR(be)) { + kfree(ra); + ret = PTR_ERR(be); + goto out; + } + be->num_refs++; + if (metadata) + be->metadata = 1; + + if (be->num_refs != 1) { + btrfs_err(fs_info, + "re-allocated a block that still has references to it!"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + goto out_unlock; + } + + while (!list_empty(&be->actions)) { + struct ref_action *tmp; + + tmp = list_first_entry(&be->actions, struct ref_action, + list); + list_del(&tmp->list); + kfree(tmp); + } + } else { + struct root_entry *tmp; + + if (!parent) { + re = kmalloc(sizeof(struct root_entry), GFP_NOFS); + if (!re) { + kfree(ref); + kfree(ra); + ret = -ENOMEM; + goto out; + } + /* + * This is the root that is modifying us, so it's the + * one we want to lookup below when we modify the + * re->num_refs. + */ + ref_root = root->objectid; + re->root_objectid = root->objectid; + re->num_refs = 0; + } + + spin_lock(&root->fs_info->ref_verify_lock); + be = lookup_block_entry(&root->fs_info->block_tree, bytenr); + if (!be) { + btrfs_err(fs_info, +"trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!", + action, (unsigned long long)bytenr, + (unsigned long long)num_bytes); + dump_ref_action(fs_info, ra); + kfree(ref); + kfree(ra); + goto out_unlock; + } + + if (!parent) { + tmp = insert_root_entry(&be->roots, re); + if (tmp) { + kfree(re); + re = tmp; + } + } + } + + exist = insert_ref_entry(&be->refs, ref); + if (exist) { + if (action == BTRFS_DROP_DELAYED_REF) { + if (exist->num_refs == 0) { + btrfs_err(fs_info, +"dropping a ref for a existing root that doesn't have a ref on the block"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ra); + goto out_unlock; + } + exist->num_refs--; + if (exist->num_refs == 0) { + rb_erase(&exist->node, &be->refs); + kfree(exist); + } + } else if (!be->metadata) { + exist->num_refs++; + } else { + btrfs_err(fs_info, +"attempting to add another ref for an existing ref on a tree block"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ra); + goto out_unlock; + } + kfree(ref); + } else { + if (action == BTRFS_DROP_DELAYED_REF) { + btrfs_err(fs_info, +"dropping a ref for a root that doesn't have a ref on the block"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ra); + goto out_unlock; + } + } + + if (!parent && !re) { + re = lookup_root_entry(&be->roots, ref_root); + if (!re) { + /* + * This shouldn't happen because we will add our re + * above when we lookup the be with !parent, but just in + * case catch this case so we don't panic because I + * didn't thik of some other corner case. + */ + btrfs_err(fs_info, "failed to find root %llu for %llu", + root->objectid, be->bytenr); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ra); + goto out_unlock; + } + } + if (action == BTRFS_DROP_DELAYED_REF) { + if (re) + re->num_refs--; + be->num_refs--; + } else if (action == BTRFS_ADD_DELAYED_REF) { + be->num_refs++; + if (re) + re->num_refs++; + } + list_add_tail(&ra->list, &be->actions); + ret = 0; +out_unlock: + spin_unlock(&root->fs_info->ref_verify_lock); +out: + if (ret) + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + return ret; +} + +/* Free up the ref cache */ +void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info) +{ + struct block_entry *be; + struct rb_node *n; + + if (!btrfs_test_opt(fs_info, REF_VERIFY)) + return; + + spin_lock(&fs_info->ref_verify_lock); + while ((n = rb_first(&fs_info->block_tree))) { + be = rb_entry(n, struct block_entry, node); + rb_erase(&be->node, &fs_info->block_tree); + free_block_entry(be); + cond_resched_lock(&fs_info->ref_verify_lock); + } + spin_unlock(&fs_info->ref_verify_lock); +} + +void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, + u64 len) +{ + struct block_entry *be = NULL, *entry; + struct rb_node *n; + + if (!btrfs_test_opt(fs_info, REF_VERIFY)) + return; + + spin_lock(&fs_info->ref_verify_lock); + n = fs_info->block_tree.rb_node; + while (n) { + entry = rb_entry(n, struct block_entry, node); + if (entry->bytenr < start) { + n = n->rb_right; + } else if (entry->bytenr > start) { + n = n->rb_left; + } else { + be = entry; + break; + } + /* We want to get as close to start as possible */ + if (be == NULL || + (entry->bytenr < start && be->bytenr > start) || + (entry->bytenr < start && entry->bytenr > be->bytenr)) + be = entry; + } + + /* + * Could have an empty block group, maybe have something to check for + * this case to verify we were actually empty? + */ + if (!be) { + spin_unlock(&fs_info->ref_verify_lock); + return; + } + + n = &be->node; + while (n) { + be = rb_entry(n, struct block_entry, node); + n = rb_next(n); + if (be->bytenr < start && be->bytenr + be->len > start) { + btrfs_err(fs_info, + "block entry overlaps a block group [%llu,%llu]!", + start, len); + dump_block_entry(fs_info, be); + continue; + } + if (be->bytenr < start) + continue; + if (be->bytenr >= start + len) + break; + if (be->bytenr + be->len > start + len) { + btrfs_err(fs_info, + "block entry overlaps a block group [%llu,%llu]!", + start, len); + dump_block_entry(fs_info, be); + } + rb_erase(&be->node, &fs_info->block_tree); + free_block_entry(be); + } + spin_unlock(&fs_info->ref_verify_lock); +} + +/* Walk down all roots and build the ref tree, meant to be called at mount */ +int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_path *path; + struct btrfs_root *root; + struct extent_buffer *eb; + u64 bytenr = 0, num_bytes = 0; + int ret, level; + + if (!btrfs_test_opt(fs_info, REF_VERIFY)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + eb = btrfs_read_lock_root_node(fs_info->extent_root); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + level = btrfs_header_level(eb); + path->nodes[level] = eb; + path->slots[level] = 0; + path->locks[level] = BTRFS_READ_LOCK_BLOCKING; + + while (1) { + /* + * We have to keep track of the bytenr/num_bytes we last hit + * because we could have run out of space for an inline ref, and + * would have had to added a ref key item which may appear on a + * different leaf from the original extent item. + */ + ret = walk_down_tree(fs_info->extent_root, path, level, + &bytenr, &num_bytes); + if (ret) + break; + ret = walk_up_tree(root, path, &level); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + break; + } + } + if (ret) { + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + btrfs_free_ref_cache(fs_info); + } + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h new file mode 100644 index 000000000000..3bf02ce0e1e2 --- /dev/null +++ b/fs/btrfs/ref-verify.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2014 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#ifndef __REF_VERIFY__ +#define __REF_VERIFY__ + +#ifdef CONFIG_BTRFS_FS_REF_VERIFY +int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info); +void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info); +int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, u64 owner, u64 offset, + int action); +void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, + u64 len); + +static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info) +{ + spin_lock_init(&fs_info->ref_verify_lock); + fs_info->block_tree = RB_ROOT; +} +#else +static inline int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) +{ + return 0; +} + +static inline void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info) +{ +} + +static inline int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, + u64 owner, u64 offset, int action) +{ + return 0; +} + +static inline void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, + u64 start, u64 len) +{ +} + +static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info) +{ +} + +#endif /* CONFIG_BTRFS_FS_REF_VERIFY */ +#endif /* _REF_VERIFY__ */ diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 9841faef08ea..4cf2eb67eba6 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1742,7 +1742,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, dirty = 1; key.offset -= btrfs_file_extent_offset(leaf, fi); - ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr, + ret = btrfs_inc_extent_ref(trans, root, new_bytenr, num_bytes, parent, btrfs_header_owner(leaf), key.objectid, key.offset); @@ -1751,7 +1751,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, break; } - ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, parent, btrfs_header_owner(leaf), key.objectid, key.offset); if (ret) { @@ -1952,21 +1952,21 @@ again: path->slots[level], old_ptr_gen); btrfs_mark_buffer_dirty(path->nodes[level]); - ret = btrfs_inc_extent_ref(trans, fs_info, old_bytenr, + ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, path->nodes[level]->start, src->root_key.objectid, level - 1, 0); BUG_ON(ret); - ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr, + ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, 0); BUG_ON(ret); - ret = btrfs_free_extent(trans, fs_info, new_bytenr, blocksize, + ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, path->nodes[level]->start, src->root_key.objectid, level - 1, 0); BUG_ON(ret); - ret = btrfs_free_extent(trans, fs_info, old_bytenr, blocksize, + ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, 0); BUG_ON(ret); @@ -2808,7 +2808,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, trans->transid); btrfs_mark_buffer_dirty(upper->eb); - ret = btrfs_inc_extent_ref(trans, root->fs_info, + ret = btrfs_inc_extent_ref(trans, root, node->eb->start, blocksize, upper->eb->start, btrfs_header_owner(upper->eb), @@ -3246,6 +3246,8 @@ static int relocate_file_extent_cluster(struct inode *inode, put_page(page); btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), + PAGE_SIZE); ret = -EIO; goto out; } @@ -3275,6 +3277,7 @@ static int relocate_file_extent_cluster(struct inode *inode, put_page(page); index++; + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(fs_info); } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 95bcc3cce78f..3338407ef0f0 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -226,10 +226,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) struct btrfs_root *root; int err = 0; int ret; - bool can_recover = true; - - if (sb_rdonly(fs_info->sb)) - can_recover = false; path = btrfs_alloc_path(); if (!path) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e3f6c49e5c4d..b2f871d80982 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -231,7 +231,7 @@ struct scrub_warning { struct btrfs_path *path; u64 extent_item_size; const char *errstr; - sector_t sector; + u64 physical; u64 logical; struct btrfs_device *dev; }; @@ -797,10 +797,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) btrfs_warn_in_rcu(fs_info, - "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)", +"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), - (unsigned long long)swarn->sector, + swarn->physical, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, (char *)(unsigned long)ipath->fspath->val[i]); @@ -810,10 +810,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, err: btrfs_warn_in_rcu(fs_info, - "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", + "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), - (unsigned long long)swarn->sector, + swarn->physical, root, inum, offset, ret); free_ipath(ipath); @@ -845,7 +845,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) if (!path) return; - swarn.sector = (sblock->pagev[0]->physical) >> 9; + swarn.physical = sblock->pagev[0]->physical; swarn.logical = sblock->pagev[0]->logical; swarn.errstr = errstr; swarn.dev = NULL; @@ -868,10 +868,10 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) item_size, &ref_root, &ref_level); btrfs_warn_in_rcu(fs_info, - "%s at logical %llu on dev %s, sector %llu: metadata %s (level %d) in tree %llu", +"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", errstr, swarn.logical, rcu_str_deref(dev->name), - (unsigned long long)swarn.sector, + swarn.physical, ref_level ? "node" : "leaf", ret < 0 ? -1 : ref_level, ret < 0 ? -1 : ref_root); @@ -883,7 +883,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) swarn.dev = dev; iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, - scrub_print_warning_inode, &swarn); + scrub_print_warning_inode, &swarn, false); } out: @@ -1047,7 +1047,7 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) * can be found. */ ret = iterate_inodes_from_logical(fixup->logical, fs_info, path, - scrub_fixup_readpage, fixup); + scrub_fixup_readpage, fixup, false); if (ret < 0) { uncorrectable = 1; goto out; @@ -4390,7 +4390,7 @@ static void copy_nocow_pages_worker(struct btrfs_work *work) } ret = iterate_inodes_from_logical(logical, fs_info, path, - record_inode_for_nocow, nocow_ctx); + record_inode_for_nocow, nocow_ctx, false); if (ret != 0 && ret != -ENOENT) { btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d", diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8fd195cfe81b..c10e4c70f02d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -26,6 +26,7 @@ #include <linux/radix-tree.h> #include <linux/vmalloc.h> #include <linux/string.h> +#include <linux/compat.h> #include "send.h" #include "backref.h" @@ -992,7 +993,6 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key, * path must point to the dir item when called. */ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *found_key, iterate_dir_item_t iterate, void *ctx) { int ret = 0; @@ -1271,12 +1271,6 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) */ if (ino >= bctx->cur_objectid) return 0; -#if 0 - if (ino > bctx->cur_objectid) - return 0; - if (offset + bctx->extent_len > bctx->cur_offset) - return 0; -#endif } bctx->found++; @@ -1429,7 +1423,7 @@ static int find_extent_clone(struct send_ctx *sctx, extent_item_pos = 0; ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, __iterate_backrefs, - backref_ctx); + backref_ctx, false); if (ret < 0) goto out; @@ -4106,8 +4100,8 @@ out: return ret; } -static int record_ref(struct btrfs_root *root, int num, u64 dir, int index, - struct fs_path *name, void *ctx, struct list_head *refs) +static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name, + void *ctx, struct list_head *refs) { int ret = 0; struct send_ctx *sctx = ctx; @@ -4143,8 +4137,7 @@ static int __record_new_ref(int num, u64 dir, int index, void *ctx) { struct send_ctx *sctx = ctx; - return record_ref(sctx->send_root, num, dir, index, name, - ctx, &sctx->new_refs); + return record_ref(sctx->send_root, dir, name, ctx, &sctx->new_refs); } @@ -4153,8 +4146,8 @@ static int __record_deleted_ref(int num, u64 dir, int index, void *ctx) { struct send_ctx *sctx = ctx; - return record_ref(sctx->parent_root, num, dir, index, name, - ctx, &sctx->deleted_refs); + return record_ref(sctx->parent_root, dir, name, ctx, + &sctx->deleted_refs); } static int record_new_ref(struct send_ctx *sctx) @@ -4498,7 +4491,7 @@ static int process_new_xattr(struct send_ctx *sctx) int ret = 0; ret = iterate_dir_item(sctx->send_root, sctx->left_path, - sctx->cmp_key, __process_new_xattr, sctx); + __process_new_xattr, sctx); return ret; } @@ -4506,7 +4499,7 @@ static int process_new_xattr(struct send_ctx *sctx) static int process_deleted_xattr(struct send_ctx *sctx) { return iterate_dir_item(sctx->parent_root, sctx->right_path, - sctx->cmp_key, __process_deleted_xattr, sctx); + __process_deleted_xattr, sctx); } struct find_xattr_ctx { @@ -4551,7 +4544,7 @@ static int find_xattr(struct btrfs_root *root, ctx.found_data = NULL; ctx.found_data_len = 0; - ret = iterate_dir_item(root, path, key, __find_xattr, &ctx); + ret = iterate_dir_item(root, path, __find_xattr, &ctx); if (ret < 0) return ret; @@ -4621,11 +4614,11 @@ static int process_changed_xattr(struct send_ctx *sctx) int ret = 0; ret = iterate_dir_item(sctx->send_root, sctx->left_path, - sctx->cmp_key, __process_changed_new_xattr, sctx); + __process_changed_new_xattr, sctx); if (ret < 0) goto out; ret = iterate_dir_item(sctx->parent_root, sctx->right_path, - sctx->cmp_key, __process_changed_deleted_xattr, sctx); + __process_changed_deleted_xattr, sctx); out: return ret; @@ -4675,8 +4668,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx) goto out; } - ret = iterate_dir_item(root, path, &found_key, - __process_new_xattr, sctx); + ret = iterate_dir_item(root, path, __process_new_xattr, sctx); if (ret < 0) goto out; @@ -4723,16 +4715,27 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) /* initial readahead */ memset(&sctx->ra, 0, sizeof(struct file_ra_state)); file_ra_state_init(&sctx->ra, inode->i_mapping); - page_cache_sync_readahead(inode->i_mapping, &sctx->ra, NULL, index, - last_index - index + 1); while (index <= last_index) { unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); - page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); + + page = find_lock_page(inode->i_mapping, index); if (!page) { - ret = -ENOMEM; - break; + page_cache_sync_readahead(inode->i_mapping, &sctx->ra, + NULL, index, last_index + 1 - index); + + page = find_or_create_page(inode->i_mapping, index, + GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + break; + } + } + + if (PageReadahead(page)) { + page_cache_async_readahead(inode->i_mapping, &sctx->ra, + NULL, page, index, last_index + 1 - index); } if (!PageUptodate(page)) { @@ -6162,9 +6165,7 @@ out: * Updates compare related fields in sctx and simply forwards to the actual * changed_xxx functions. */ -static int changed_cb(struct btrfs_root *left_root, - struct btrfs_root *right_root, - struct btrfs_path *left_path, +static int changed_cb(struct btrfs_path *left_path, struct btrfs_path *right_path, struct btrfs_key *key, enum btrfs_compare_tree_result result, @@ -6246,8 +6247,8 @@ static int full_send_tree(struct send_ctx *sctx) slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); - ret = changed_cb(send_root, NULL, path, NULL, - &found_key, BTRFS_COMPARE_TREE_NEW, sctx); + ret = changed_cb(path, NULL, &found_key, + BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) goto out; @@ -6365,13 +6366,12 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) spin_unlock(&root->root_item_lock); } -long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) +long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) { int ret = 0; struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; - struct btrfs_ioctl_send_args *arg = NULL; struct btrfs_key key; struct send_ctx *sctx = NULL; u32 i; @@ -6407,13 +6407,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } - arg = memdup_user(arg_, sizeof(*arg)); - if (IS_ERR(arg)) { - ret = PTR_ERR(arg); - arg = NULL; - goto out; - } - /* * Check that we don't overflow at later allocations, we request * clone_sources_count + 1 items, and compare to unsigned long inside @@ -6654,7 +6647,6 @@ out: if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) btrfs_root_dec_send_in_progress(sctx->parent_root); - kfree(arg); kvfree(clone_sources_tmp); if (sctx) { diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 02e00166c4da..3aa4bc55754f 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -130,5 +130,5 @@ enum { #define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) #ifdef __KERNEL__ -long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); +long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg); #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 35a128acfbd1..65af029559b5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -202,7 +202,6 @@ static struct ratelimit_state printk_limits[] = { void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { - struct super_block *sb = fs_info->sb; char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; struct va_format vaf; va_list args; @@ -228,7 +227,8 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) vaf.va = &args; if (__ratelimit(ratelimit)) - printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); + printk("%sBTRFS %s (device %s): %pV\n", lvl, type, + fs_info ? fs_info->sb->s_id : "<unknown>", &vaf); va_end(args); } @@ -292,7 +292,7 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, vaf.va = &args; errstr = btrfs_decode_error(errno); - if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)) + if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", s_id, function, line, &vaf, errno, errstr); @@ -326,6 +326,9 @@ enum { #ifdef CONFIG_BTRFS_DEBUG Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, #endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + Opt_ref_verify, +#endif Opt_err, }; @@ -387,6 +390,9 @@ static const match_table_t tokens = { {Opt_fragment_metadata, "fragment=metadata"}, {Opt_fragment_all, "fragment=all"}, #endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + {Opt_ref_verify, "ref_verify"}, +#endif {Opt_err, NULL}, }; @@ -502,6 +508,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, strncmp(args[0].from, "zlib", 4) == 0) { compress_type = "zlib"; info->compress_type = BTRFS_COMPRESS_ZLIB; + info->compress_level = + btrfs_compress_str2level(args[0].from); btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); @@ -549,9 +557,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, compress_force != saved_compress_force)) || (!btrfs_test_opt(info, COMPRESS) && no_compress == 1)) { - btrfs_info(info, "%s %s compression", + btrfs_info(info, "%s %s compression, level %d", (compress_force) ? "force" : "use", - compress_type); + compress_type, info->compress_level); } compress_force = false; break; @@ -825,6 +833,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); break; #endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + case Opt_ref_verify: + btrfs_info(info, "doing ref verification"); + btrfs_set_opt(info->mount_opt, REF_VERIFY); + break; +#endif case Opt_err: btrfs_info(info, "unrecognized mount option '%s'", p); ret = -EINVAL; @@ -1135,7 +1149,7 @@ static int btrfs_fill_super(struct super_block *sb, #ifdef CONFIG_BTRFS_FS_POSIX_ACL sb->s_flags |= MS_POSIXACL; #endif - sb->s_flags |= MS_I_VERSION; + sb->s_flags |= SB_I_VERSION; sb->s_iflags |= SB_I_CGROUPWB; err = super_setup_bdi(sb); @@ -1205,8 +1219,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) * happens. The pending operations are delayed to the * next commit after thawing. */ - if (__sb_start_write(sb, SB_FREEZE_WRITE, false)) - __sb_end_write(sb, SB_FREEZE_WRITE); + if (sb_start_write_trylock(sb)) + sb_end_write(sb); else return 0; trans = btrfs_start_transaction(root, 0); @@ -1246,6 +1260,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_printf(seq, ",compress-force=%s", compress_type); else seq_printf(seq, ",compress=%s", compress_type); + if (info->compress_level) + seq_printf(seq, ":%d", info->compress_level); } if (btrfs_test_opt(info, NOSSD)) seq_puts(seq, ",nossd"); @@ -1305,6 +1321,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (btrfs_test_opt(info, FRAGMENT_METADATA)) seq_puts(seq, ",fragment=metadata"); #endif + if (btrfs_test_opt(info, REF_VERIFY)) + seq_puts(seq, ",ref_verify"); seq_printf(seq, ",subvolid=%llu", BTRFS_I(d_inode(dentry))->root->root_key.objectid); seq_puts(seq, ",subvol="); @@ -2112,7 +2130,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) * succeed even if the Avail is zero. But this is better than the other * way around. */ - thresh = 4 * 1024 * 1024; + thresh = SZ_4M; if (!mixed && total_free_meta - thresh < block_rsv->size) buf->f_bavail = 0; @@ -2319,6 +2337,9 @@ static void btrfs_print_mod_info(void) #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY ", integrity-checker=on" #endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + ", ref-verify=on" +#endif "\n", btrfs_crc32c_impl()); } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 883881b16c86..a28bba801264 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -247,7 +247,7 @@ static ssize_t global_rsv_size_show(struct kobject *kobj, struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); } -BTRFS_ATTR(global_rsv_size, global_rsv_size_show); +BTRFS_ATTR(allocation, global_rsv_size, global_rsv_size_show); static ssize_t global_rsv_reserved_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -256,15 +256,15 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj, struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); } -BTRFS_ATTR(global_rsv_reserved, global_rsv_reserved_show); +BTRFS_ATTR(allocation, global_rsv_reserved, global_rsv_reserved_show); #define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) #define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) static ssize_t raid_bytes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); -BTRFS_RAID_ATTR(total_bytes, raid_bytes_show); -BTRFS_RAID_ATTR(used_bytes, raid_bytes_show); +BTRFS_ATTR(raid, total_bytes, raid_bytes_show); +BTRFS_ATTR(raid, used_bytes, raid_bytes_show); static ssize_t raid_bytes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -277,7 +277,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj, down_read(&sinfo->groups_sem); list_for_each_entry(block_group, &sinfo->block_groups[index], list) { - if (&attr->attr == BTRFS_RAID_ATTR_PTR(total_bytes)) + if (&attr->attr == BTRFS_ATTR_PTR(raid, total_bytes)) val += block_group->key.offset; else val += btrfs_block_group_used(&block_group->item); @@ -287,8 +287,8 @@ static ssize_t raid_bytes_show(struct kobject *kobj, } static struct attribute *raid_attributes[] = { - BTRFS_RAID_ATTR_PTR(total_bytes), - BTRFS_RAID_ATTR_PTR(used_bytes), + BTRFS_ATTR_PTR(raid, total_bytes), + BTRFS_ATTR_PTR(raid, used_bytes), NULL }; @@ -311,7 +311,7 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ struct btrfs_space_info *sinfo = to_space_info(kobj); \ return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ } \ -BTRFS_ATTR(field, btrfs_space_info_show_##field) +BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field) static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, struct kobj_attribute *a, @@ -331,19 +331,20 @@ SPACE_INFO_ATTR(bytes_may_use); SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); -BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned); +BTRFS_ATTR(space_info, total_bytes_pinned, + btrfs_space_info_show_total_bytes_pinned); static struct attribute *space_info_attrs[] = { - BTRFS_ATTR_PTR(flags), - BTRFS_ATTR_PTR(total_bytes), - BTRFS_ATTR_PTR(bytes_used), - BTRFS_ATTR_PTR(bytes_pinned), - BTRFS_ATTR_PTR(bytes_reserved), - BTRFS_ATTR_PTR(bytes_may_use), - BTRFS_ATTR_PTR(bytes_readonly), - BTRFS_ATTR_PTR(disk_used), - BTRFS_ATTR_PTR(disk_total), - BTRFS_ATTR_PTR(total_bytes_pinned), + BTRFS_ATTR_PTR(space_info, flags), + BTRFS_ATTR_PTR(space_info, total_bytes), + BTRFS_ATTR_PTR(space_info, bytes_used), + BTRFS_ATTR_PTR(space_info, bytes_pinned), + BTRFS_ATTR_PTR(space_info, bytes_reserved), + BTRFS_ATTR_PTR(space_info, bytes_may_use), + BTRFS_ATTR_PTR(space_info, bytes_readonly), + BTRFS_ATTR_PTR(space_info, disk_used), + BTRFS_ATTR_PTR(space_info, disk_total), + BTRFS_ATTR_PTR(space_info, total_bytes_pinned), NULL, }; @@ -361,8 +362,8 @@ struct kobj_type space_info_ktype = { }; static const struct attribute *allocation_attrs[] = { - BTRFS_ATTR_PTR(global_rsv_reserved), - BTRFS_ATTR_PTR(global_rsv_size), + BTRFS_ATTR_PTR(allocation, global_rsv_reserved), + BTRFS_ATTR_PTR(allocation, global_rsv_size), NULL, }; @@ -415,7 +416,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj, return len; } -BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store); +BTRFS_ATTR_RW(, label, btrfs_label_show, btrfs_label_store); static ssize_t btrfs_nodesize_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -425,7 +426,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); } -BTRFS_ATTR(nodesize, btrfs_nodesize_show); +BTRFS_ATTR(, nodesize, btrfs_nodesize_show); static ssize_t btrfs_sectorsize_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -436,7 +437,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, fs_info->super_copy->sectorsize); } -BTRFS_ATTR(sectorsize, btrfs_sectorsize_show); +BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -447,7 +448,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, fs_info->super_copy->sectorsize); } -BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); +BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); static ssize_t quota_override_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -487,14 +488,14 @@ static ssize_t quota_override_store(struct kobject *kobj, return len; } -BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store); +BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store); static const struct attribute *btrfs_attrs[] = { - BTRFS_ATTR_PTR(label), - BTRFS_ATTR_PTR(nodesize), - BTRFS_ATTR_PTR(sectorsize), - BTRFS_ATTR_PTR(clone_alignment), - BTRFS_ATTR_PTR(quota_override), + BTRFS_ATTR_PTR(, label), + BTRFS_ATTR_PTR(, nodesize), + BTRFS_ATTR_PTR(, sectorsize), + BTRFS_ATTR_PTR(, clone_alignment), + BTRFS_ATTR_PTR(, quota_override), NULL, }; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index d7da1a4c2f6c..80457f31c29f 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BTRFS_SYSFS_H_ #define _BTRFS_SYSFS_H_ @@ -20,21 +21,16 @@ enum btrfs_feature_set { .store = _store, \ } -#define BTRFS_ATTR_RW(_name, _show, _store) \ - static struct kobj_attribute btrfs_attr_##_name = \ +#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ __INIT_KOBJ_ATTR(_name, 0644, _show, _store) -#define BTRFS_ATTR(_name, _show) \ - static struct kobj_attribute btrfs_attr_##_name = \ +#define BTRFS_ATTR(_prefix, _name, _show) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) -#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr) - -#define BTRFS_RAID_ATTR(_name, _show) \ - static struct kobj_attribute btrfs_raid_attr_##_name = \ - __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) - -#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr) +#define BTRFS_ATTR_PTR(_prefix, _name) \ + (&btrfs_attr_##_prefix##_##_name.attr) struct btrfs_feature_attr { @@ -43,15 +39,16 @@ struct btrfs_feature_attr { u64 feature_bit; }; -#define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit) \ -static struct btrfs_feature_attr btrfs_attr_##_name = { \ +#define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit) \ +static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ btrfs_feature_attr_show, \ btrfs_feature_attr_store), \ .feature_set = _feature_set, \ - .feature_bit = _prefix ##_## _feature_bit, \ + .feature_bit = _feature_prefix ##_## _feature_bit, \ } -#define BTRFS_FEAT_ATTR_PTR(_name) (&btrfs_attr_##_name.kobj_attr.attr) +#define BTRFS_FEAT_ATTR_PTR(_name) \ + (&btrfs_attr_features_##_name.kobj_attr.attr) #define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 1458bb0ea124..8444a018cca2 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -500,7 +500,8 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, path = btrfs_alloc_path(); if (!path) { test_msg("Couldn't allocate path\n"); - return -ENOMEM; + ret = -ENOMEM; + goto out; } ret = add_block_group_free_space(&trans, root->fs_info, cache); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 8c91d03cc82d..f797642c013d 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -770,7 +770,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, 4096 * 1024, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -968,7 +968,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) btrfs_test_inode_set_ops(inode); /* [BTRFS_MAX_EXTENT_SIZE] */ - BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, NULL, 0); if (ret) { @@ -983,7 +982,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ - BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, NULL, 0); @@ -1003,7 +1001,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, EXTENT_DELALLOC | EXTENT_DIRTY | - EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, + EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); @@ -1017,7 +1015,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ - BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, @@ -1035,12 +1032,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize] - * - * I'm artificially adding 2 to outstanding_extents because in the - * buffered IO case we'd add things up as we go, but I don't feel like - * doing that here, this isn't the interesting case we want to test. */ - BTRFS_I(inode)->outstanding_extents += 2; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, @@ -1059,7 +1051,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize] */ - BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); @@ -1079,7 +1070,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); @@ -1096,7 +1087,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) * Refill the hole again just for good measure, because I thought it * might fail and I'd rather satisfy my paranoia at this point. */ - BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); @@ -1114,7 +1104,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* Empty */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); @@ -1131,7 +1121,7 @@ out: if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); iput(inode); btrfs_free_dummy_root(root); diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 0f4ce970d195..90204b166643 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -240,7 +240,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root, * we can only call btrfs_qgroup_account_extent() directly to test * quota. */ - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -252,7 +253,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -275,7 +277,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root, old_roots = NULL; new_roots = NULL; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -286,7 +289,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root, if (ret) return -EINVAL; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -337,7 +341,8 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -349,7 +354,8 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -370,7 +376,8 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -382,7 +389,8 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -409,7 +417,8 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -421,7 +430,8 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f615d59b0489..5a8c2649af2f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -797,8 +797,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - if (fs_info->global_block_rsv.space_info->full && - btrfs_check_space_for_delayed_refs(trans, fs_info)) + if (btrfs_check_space_for_delayed_refs(trans, fs_info)) return 1; return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); @@ -950,6 +949,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; + atomic_inc(&BTRFS_I(fs_info->btree_inode)->sync_writers); while (!find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { bool wait_writeback = false; @@ -985,6 +985,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, cond_resched(); start = end + 1; } + atomic_dec(&BTRFS_I(fs_info->btree_inode)->sync_writers); return werr; } @@ -1915,8 +1916,17 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { + /* + * We use writeback_inodes_sb here because if we used + * btrfs_start_delalloc_roots we would deadlock with fs freeze. + * Currently are holding the fs freeze lock, if we do an async flush + * we'll do btrfs_join_transaction() and deadlock because we need to + * wait for the fs freeze lock. Using the direct flushing we benefit + * from already being in a transaction and our join_transaction doesn't + * have to re-take the fs freeze lock. + */ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) - return btrfs_start_delalloc_roots(fs_info, 1, -1); + writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); return 0; } diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c new file mode 100644 index 000000000000..114fc5f0ecc5 --- /dev/null +++ b/fs/btrfs/tree-checker.c @@ -0,0 +1,425 @@ +/* + * Copyright (C) Qu Wenruo 2017. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program. + */ + +/* + * The module is used to catch unexpected/corrupted tree block data. + * Such behavior can be caused either by a fuzzed image or bugs. + * + * The objective is to do leaf/node validation checks when tree block is read + * from disk, and check *every* possible member, so other code won't + * need to checking them again. + * + * Due to the potential and unwanted damage, every checker needs to be + * carefully reviewed otherwise so it does not prevent mount of valid images. + */ + +#include "ctree.h" +#include "tree-checker.h" +#include "disk-io.h" +#include "compression.h" + +/* + * Error message should follow the following format: + * corrupt <type>: <identifier>, <reason>[, <bad_value>] + * + * @type: leaf or node + * @identifier: the necessary info to locate the leaf/node. + * It's recommened to decode key.objecitd/offset if it's + * meaningful. + * @reason: describe the error + * @bad_value: optional, it's recommened to output bad value and its + * expected value (range). + * + * Since comma is used to separate the components, only space is allowed + * inside each component. + */ + +/* + * Append generic "corrupt leaf/node root=%llu block=%llu slot=%d: " to @fmt. + * Allows callers to customize the output. + */ +__printf(4, 5) +static void generic_err(const struct btrfs_root *root, + const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(root->fs_info, + "corrupt %s: root=%llu block=%llu slot=%d, %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + root->objectid, btrfs_header_bytenr(eb), slot, &vaf); + va_end(args); +} + +/* + * Customized reporter for extent data item, since its key objectid and + * offset has its own meaning. + */ +__printf(4, 5) +static void file_extent_err(const struct btrfs_root *root, + const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct btrfs_key key; + struct va_format vaf; + va_list args; + + btrfs_item_key_to_cpu(eb, &key, slot); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(root->fs_info, + "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid, + btrfs_header_bytenr(eb), slot, key.objectid, key.offset, &vaf); + va_end(args); +} + +/* + * Return 0 if the btrfs_file_extent_##name is aligned to @alignment + * Else return 1 + */ +#define CHECK_FE_ALIGNED(root, leaf, slot, fi, name, alignment) \ +({ \ + if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \ + file_extent_err((root), (leaf), (slot), \ + "invalid %s for file extent, have %llu, should be aligned to %u", \ + (#name), btrfs_file_extent_##name((leaf), (fi)), \ + (alignment)); \ + (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))); \ +}) + +static int check_extent_data_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_file_extent_item *fi; + u32 sectorsize = root->fs_info->sectorsize; + u32 item_size = btrfs_item_size_nr(leaf, slot); + + if (!IS_ALIGNED(key->offset, sectorsize)) { + file_extent_err(root, leaf, slot, +"unaligned file_offset for file extent, have %llu should be aligned to %u", + key->offset, sectorsize); + return -EUCLEAN; + } + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) { + file_extent_err(root, leaf, slot, + "invalid type for file extent, have %u expect range [0, %u]", + btrfs_file_extent_type(leaf, fi), + BTRFS_FILE_EXTENT_TYPES); + return -EUCLEAN; + } + + /* + * Support for new compression/encrption must introduce incompat flag, + * and must be caught in open_ctree(). + */ + if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { + file_extent_err(root, leaf, slot, + "invalid compression for file extent, have %u expect range [0, %u]", + btrfs_file_extent_compression(leaf, fi), + BTRFS_COMPRESS_TYPES); + return -EUCLEAN; + } + if (btrfs_file_extent_encryption(leaf, fi)) { + file_extent_err(root, leaf, slot, + "invalid encryption for file extent, have %u expect 0", + btrfs_file_extent_encryption(leaf, fi)); + return -EUCLEAN; + } + if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { + /* Inline extent must have 0 as key offset */ + if (key->offset) { + file_extent_err(root, leaf, slot, + "invalid file_offset for inline file extent, have %llu expect 0", + key->offset); + return -EUCLEAN; + } + + /* Compressed inline extent has no on-disk size, skip it */ + if (btrfs_file_extent_compression(leaf, fi) != + BTRFS_COMPRESS_NONE) + return 0; + + /* Uncompressed inline extent size must match item size */ + if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + + btrfs_file_extent_ram_bytes(leaf, fi)) { + file_extent_err(root, leaf, slot, + "invalid ram_bytes for uncompressed inline extent, have %u expect %llu", + item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START + + btrfs_file_extent_ram_bytes(leaf, fi)); + return -EUCLEAN; + } + return 0; + } + + /* Regular or preallocated extent has fixed item size */ + if (item_size != sizeof(*fi)) { + file_extent_err(root, leaf, slot, + "invalid item size for reg/prealloc file extent, have %u expect %zu", + item_size, sizeof(*fi)); + return -EUCLEAN; + } + if (CHECK_FE_ALIGNED(root, leaf, slot, fi, ram_bytes, sectorsize) || + CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_bytenr, sectorsize) || + CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_num_bytes, sectorsize) || + CHECK_FE_ALIGNED(root, leaf, slot, fi, offset, sectorsize) || + CHECK_FE_ALIGNED(root, leaf, slot, fi, num_bytes, sectorsize)) + return -EUCLEAN; + return 0; +} + +static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + u32 sectorsize = root->fs_info->sectorsize; + u32 csumsize = btrfs_super_csum_size(root->fs_info->super_copy); + + if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) { + generic_err(root, leaf, slot, + "invalid key objectid for csum item, have %llu expect %llu", + key->objectid, BTRFS_EXTENT_CSUM_OBJECTID); + return -EUCLEAN; + } + if (!IS_ALIGNED(key->offset, sectorsize)) { + generic_err(root, leaf, slot, + "unaligned key offset for csum item, have %llu should be aligned to %u", + key->offset, sectorsize); + return -EUCLEAN; + } + if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) { + generic_err(root, leaf, slot, + "unaligned item size for csum item, have %u should be aligned to %u", + btrfs_item_size_nr(leaf, slot), csumsize); + return -EUCLEAN; + } + return 0; +} + +/* + * Common point to switch the item-specific validation. + */ +static int check_leaf_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + int ret = 0; + + switch (key->type) { + case BTRFS_EXTENT_DATA_KEY: + ret = check_extent_data_item(root, leaf, key, slot); + break; + case BTRFS_EXTENT_CSUM_KEY: + ret = check_csum_item(root, leaf, key, slot); + break; + } + return ret; +} + +int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + /* No valid key type is 0, so all key should be larger than this key */ + struct btrfs_key prev_key = {0, 0, 0}; + struct btrfs_key key; + u32 nritems = btrfs_header_nritems(leaf); + int slot; + + /* + * Extent buffers from a relocation tree have a owner field that + * corresponds to the subvolume tree they are based on. So just from an + * extent buffer alone we can not find out what is the id of the + * corresponding subvolume tree, so we can not figure out if the extent + * buffer corresponds to the root of the relocation tree or not. So + * skip this check for relocation trees. + */ + if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { + struct btrfs_root *check_root; + + key.objectid = btrfs_header_owner(leaf); + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + check_root = btrfs_get_fs_root(fs_info, &key, false); + /* + * The only reason we also check NULL here is that during + * open_ctree() some roots has not yet been set up. + */ + if (!IS_ERR_OR_NULL(check_root)) { + struct extent_buffer *eb; + + eb = btrfs_root_node(check_root); + /* if leaf is the root, then it's fine */ + if (leaf != eb) { + generic_err(check_root, leaf, 0, + "invalid nritems, have %u should not be 0 for non-root leaf", + nritems); + free_extent_buffer(eb); + return -EUCLEAN; + } + free_extent_buffer(eb); + } + return 0; + } + + if (nritems == 0) + return 0; + + /* + * Check the following things to make sure this is a good leaf, and + * leaf users won't need to bother with similar sanity checks: + * + * 1) key ordering + * 2) item offset and size + * No overlap, no hole, all inside the leaf. + * 3) item content + * If possible, do comprehensive sanity check. + * NOTE: All checks must only rely on the item data itself. + */ + for (slot = 0; slot < nritems; slot++) { + u32 item_end_expected; + int ret; + + btrfs_item_key_to_cpu(leaf, &key, slot); + + /* Make sure the keys are in the right order */ + if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) { + generic_err(root, leaf, slot, + "bad key order, prev (%llu %u %llu) current (%llu %u %llu)", + prev_key.objectid, prev_key.type, + prev_key.offset, key.objectid, key.type, + key.offset); + return -EUCLEAN; + } + + /* + * Make sure the offset and ends are right, remember that the + * item data starts at the end of the leaf and grows towards the + * front. + */ + if (slot == 0) + item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info); + else + item_end_expected = btrfs_item_offset_nr(leaf, + slot - 1); + if (btrfs_item_end_nr(leaf, slot) != item_end_expected) { + generic_err(root, leaf, slot, + "unexpected item end, have %u expect %u", + btrfs_item_end_nr(leaf, slot), + item_end_expected); + return -EUCLEAN; + } + + /* + * Check to make sure that we don't point outside of the leaf, + * just in case all the items are consistent to each other, but + * all point outside of the leaf. + */ + if (btrfs_item_end_nr(leaf, slot) > + BTRFS_LEAF_DATA_SIZE(fs_info)) { + generic_err(root, leaf, slot, + "slot end outside of leaf, have %u expect range [0, %u]", + btrfs_item_end_nr(leaf, slot), + BTRFS_LEAF_DATA_SIZE(fs_info)); + return -EUCLEAN; + } + + /* Also check if the item pointer overlaps with btrfs item. */ + if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) > + btrfs_item_ptr_offset(leaf, slot)) { + generic_err(root, leaf, slot, + "slot overlaps with its data, item end %lu data start %lu", + btrfs_item_nr_offset(slot) + + sizeof(struct btrfs_item), + btrfs_item_ptr_offset(leaf, slot)); + return -EUCLEAN; + } + + /* Check if the item size and content meet other criteria */ + ret = check_leaf_item(root, leaf, &key, slot); + if (ret < 0) + return ret; + + prev_key.objectid = key.objectid; + prev_key.type = key.type; + prev_key.offset = key.offset; + } + + return 0; +} + +int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node) +{ + unsigned long nr = btrfs_header_nritems(node); + struct btrfs_key key, next_key; + int slot; + u64 bytenr; + int ret = 0; + + if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) { + btrfs_crit(root->fs_info, +"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]", + root->objectid, node->start, + nr == 0 ? "small" : "large", nr, + BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)); + return -EUCLEAN; + } + + for (slot = 0; slot < nr - 1; slot++) { + bytenr = btrfs_node_blockptr(node, slot); + btrfs_node_key_to_cpu(node, &key, slot); + btrfs_node_key_to_cpu(node, &next_key, slot + 1); + + if (!bytenr) { + generic_err(root, node, slot, + "invalid NULL node pointer"); + ret = -EUCLEAN; + goto out; + } + if (!IS_ALIGNED(bytenr, root->fs_info->sectorsize)) { + generic_err(root, node, slot, + "unaligned pointer, have %llu should be aligned to %u", + bytenr, root->fs_info->sectorsize); + ret = -EUCLEAN; + goto out; + } + + if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { + generic_err(root, node, slot, + "bad key order, current (%llu %u %llu) next (%llu %u %llu)", + key.objectid, key.type, key.offset, + next_key.objectid, next_key.type, + next_key.offset); + ret = -EUCLEAN; + goto out; + } + } +out: + return ret; +} diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h new file mode 100644 index 000000000000..96c486e95d70 --- /dev/null +++ b/fs/btrfs/tree-checker.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) Qu Wenruo 2017. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program. + */ + +#ifndef __BTRFS_TREE_CHECKER__ +#define __BTRFS_TREE_CHECKER__ + +#include "ctree.h" +#include "extent_io.h" + +int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf); +int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node); + +#endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c800d067fcbf..aa7c71cff575 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -717,7 +717,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); if (ret == 0) { - ret = btrfs_inc_extent_ref(trans, fs_info, + ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, 0, root->root_key.objectid, key->objectid, offset); @@ -2699,34 +2699,36 @@ static void wait_log_commit(struct btrfs_root *root, int transid) * so we know that if ours is more than 2 older than the * current transaction, we're done */ - do { + for (;;) { prepare_to_wait(&root->log_commit_wait[index], &wait, TASK_UNINTERRUPTIBLE); - mutex_unlock(&root->log_mutex); - if (root->log_transid_committed < transid && - atomic_read(&root->log_commit[index])) - schedule(); + if (!(root->log_transid_committed < transid && + atomic_read(&root->log_commit[index]))) + break; - finish_wait(&root->log_commit_wait[index], &wait); + mutex_unlock(&root->log_mutex); + schedule(); mutex_lock(&root->log_mutex); - } while (root->log_transid_committed < transid && - atomic_read(&root->log_commit[index])); + } + finish_wait(&root->log_commit_wait[index], &wait); } static void wait_for_writer(struct btrfs_root *root) { DEFINE_WAIT(wait); - while (atomic_read(&root->log_writers)) { - prepare_to_wait(&root->log_writer_wait, - &wait, TASK_UNINTERRUPTIBLE); + for (;;) { + prepare_to_wait(&root->log_writer_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (!atomic_read(&root->log_writers)) + break; + mutex_unlock(&root->log_mutex); - if (atomic_read(&root->log_writers)) - schedule(); - finish_wait(&root->log_writer_wait, &wait); + schedule(); mutex_lock(&root->log_mutex); } + finish_wait(&root->log_writer_wait, &wait); } static inline void btrfs_remove_log_ctx(struct btrfs_root *root, @@ -4645,7 +4647,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_key max_key; struct btrfs_root *log = root->log_root; - struct extent_buffer *src = NULL; LIST_HEAD(logged_list); u64 last_extent = 0; int err = 0; @@ -4888,7 +4889,6 @@ again: goto next_slot; } - src = path->nodes[0]; if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { ins_nr++; goto next_slot; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b39737568c22..f1ecb938ba4d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -360,7 +360,6 @@ static noinline void run_scheduled_bios(struct btrfs_device *device) int again = 0; unsigned long num_run; unsigned long batch_run = 0; - unsigned long limit; unsigned long last_waited = 0; int force_reg = 0; int sync_pending = 0; @@ -375,8 +374,6 @@ static noinline void run_scheduled_bios(struct btrfs_device *device) blk_start_plug(&plug); bdi = device->bdev->bd_bdi; - limit = btrfs_async_submit_limit(fs_info); - limit = limit * 2 / 3; loop: spin_lock(&device->io_lock); @@ -443,13 +440,6 @@ loop_lock: pending = pending->bi_next; cur->bi_next = NULL; - /* - * atomic_dec_return implies a barrier for waitqueue_active - */ - if (atomic_dec_return(&fs_info->nr_async_bios) < limit && - waitqueue_active(&fs_info->async_submit_wait)) - wake_up(&fs_info->async_submit_wait); - BUG_ON(atomic_read(&cur->__bi_cnt) == 0); /* @@ -517,12 +507,6 @@ loop_lock: &device->work); goto done; } - /* unplug every 64 requests just for good measure */ - if (batch_run % 64 == 0) { - blk_finish_plug(&plug); - blk_start_plug(&plug); - sync_pending = 0; - } } cond_resched(); @@ -547,7 +531,7 @@ static void pending_bios_fn(struct btrfs_work *work) } -void btrfs_free_stale_device(struct btrfs_device *cur_dev) +static void btrfs_free_stale_device(struct btrfs_device *cur_dev) { struct btrfs_fs_devices *fs_devs; struct btrfs_device *dev; @@ -1068,14 +1052,15 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, return ret; } -void btrfs_release_disk_super(struct page *page) +static void btrfs_release_disk_super(struct page *page) { kunmap(page); put_page(page); } -int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, - struct page **page, struct btrfs_super_block **disk_super) +static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, + struct page **page, + struct btrfs_super_block **disk_super) { void *p; pgoff_t index; @@ -1817,8 +1802,8 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, return 0; } -struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs, - struct btrfs_device *device) +static struct btrfs_device * btrfs_find_next_active_device( + struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) { struct btrfs_device *next_device; @@ -2031,19 +2016,20 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, } btrfs_close_bdev(srcdev); - call_rcu(&srcdev->rcu, free_device); - /* - * unless fs_devices is seed fs, num_devices shouldn't go - * zero - */ - BUG_ON(!fs_devices->num_devices && !fs_devices->seeding); - /* if this is no devs we rather delete the fs_devices */ if (!fs_devices->num_devices) { struct btrfs_fs_devices *tmp_fs_devices; + /* + * On a mounted FS, num_devices can't be zero unless it's a + * seed. In case of a seed device being replaced, the replace + * target added to the sprout FS, so there will be no more + * device left under the seed FS. + */ + ASSERT(fs_devices->seeding); + tmp_fs_devices = fs_info->fs_devices; while (tmp_fs_devices) { if (tmp_fs_devices->seed == fs_devices) { @@ -2323,6 +2309,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path u64 tmp; int seeding_dev = 0; int ret = 0; + bool unlocked = false; if (sb_rdonly(sb) && !fs_info->fs_devices->seeding) return -EROFS; @@ -2399,7 +2386,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { sb->s_flags &= ~MS_RDONLY; ret = btrfs_prepare_sprout(fs_info); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + goto error_trans; + } } device->fs_devices = fs_info->fs_devices; @@ -2445,14 +2435,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path mutex_unlock(&fs_info->chunk_mutex); if (ret) { btrfs_abort_transaction(trans, ret); - goto error_trans; + goto error_sysfs; } } ret = btrfs_add_device(trans, fs_info, device); if (ret) { btrfs_abort_transaction(trans, ret); - goto error_trans; + goto error_sysfs; } if (seeding_dev) { @@ -2461,7 +2451,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path ret = btrfs_finish_sprout(trans, fs_info); if (ret) { btrfs_abort_transaction(trans, ret); - goto error_trans; + goto error_sysfs; } /* Sprouting would change fsid of the mounted root, @@ -2479,6 +2469,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); + unlocked = true; if (ret) /* transaction commit */ return ret; @@ -2491,7 +2482,9 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (IS_ERR(trans)) { if (PTR_ERR(trans) == -ENOENT) return 0; - return PTR_ERR(trans); + ret = PTR_ERR(trans); + trans = NULL; + goto error_sysfs; } ret = btrfs_commit_transaction(trans); } @@ -2500,14 +2493,18 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path update_dev_time(device_path); return ret; +error_sysfs: + btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); error_trans: - btrfs_end_transaction(trans); + if (seeding_dev) + sb->s_flags |= MS_RDONLY; + if (trans) + btrfs_end_transaction(trans); rcu_string_free(device->name); - btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); - if (seeding_dev) { + if (seeding_dev && !unlocked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } @@ -4813,16 +4810,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, em_tree = &info->mapping_tree.map_tree; write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); - if (!ret) { - list_add_tail(&em->list, &trans->transaction->pending_chunks); - refcount_inc(&em->refs); - } - write_unlock(&em_tree->lock); if (ret) { + write_unlock(&em_tree->lock); free_extent_map(em); goto error; } + list_add_tail(&em->list, &trans->transaction->pending_chunks); + refcount_inc(&em->refs); + write_unlock(&em_tree->lock); + ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes); if (ret) goto error_del_extent; @@ -5695,10 +5692,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); - if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS) + if (!need_full_stripe(op)) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) + if (need_full_stripe(op)) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -5711,7 +5708,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) { + if (need_full_stripe(op)) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -5725,7 +5722,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); stripe_index *= map->sub_stripes; - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) + if (need_full_stripe(op)) num_stripes = map->sub_stripes; else if (mirror_num) stripe_index += mirror_num - 1; @@ -5740,9 +5737,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - if (need_raid_map && - (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS || - mirror_num > 1)) { + if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div64_u64(raid56_full_stripe_start, stripe_len * nr_data_stripes(map)); @@ -5769,9 +5764,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, /* We distribute the parity blocks across stripes */ div_u64_rem(stripe_nr + stripe_index, map->num_stripes, &stripe_index); - if ((op != BTRFS_MAP_WRITE && - op != BTRFS_MAP_GET_READ_MIRRORS) && - mirror_num <= 1) + if (!need_full_stripe(op) && mirror_num <= 1) mirror_num = 1; } } else { @@ -6033,7 +6026,7 @@ static void btrfs_end_bio(struct bio *bio) * this bio is actually up to date, we didn't * go over the max number of errors */ - bio->bi_status = 0; + bio->bi_status = BLK_STS_OK; } btrfs_end_bbio(bbio, bio); @@ -6069,13 +6062,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device, return; } - /* - * nr_async_bios allows us to reliably return congestion to the - * higher layers. Otherwise, the async bio makes it appear we have - * made progress against dirty pages when we've really just put it - * on a queue for later - */ - atomic_inc(&fs_info->nr_async_bios); WARN_ON(bio->bi_next); bio->bi_next = NULL; @@ -6144,7 +6130,10 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - bio->bi_status = BLK_STS_IOERR; + if (atomic_read(&bbio->error) > bbio->max_errors) + bio->bi_status = BLK_STS_IOERR; + else + bio->bi_status = BLK_STS_OK; btrfs_end_bbio(bbio, bio); } } @@ -6249,7 +6238,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, device = btrfs_alloc_device(NULL, &devid, dev_uuid); if (IS_ERR(device)) - return NULL; + return device; list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; @@ -6377,6 +6366,17 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, return 0; } +static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, + u64 devid, u8 *uuid, bool error) +{ + if (error) + btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", + devid, uuid); + else + btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", + devid, uuid); +} + static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) @@ -6447,18 +6447,21 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, if (!map->stripes[i].dev && !btrfs_test_opt(fs_info, DEGRADED)) { free_extent_map(em); - btrfs_report_missing_device(fs_info, devid, uuid); - return -EIO; + btrfs_report_missing_device(fs_info, devid, uuid, true); + return -ENOENT; } if (!map->stripes[i].dev) { map->stripes[i].dev = add_missing_dev(fs_info->fs_devices, devid, uuid); - if (!map->stripes[i].dev) { + if (IS_ERR(map->stripes[i].dev)) { free_extent_map(em); - return -EIO; + btrfs_err(fs_info, + "failed to init missing dev %llu: %ld", + devid, PTR_ERR(map->stripes[i].dev)); + return PTR_ERR(map->stripes[i].dev); } - btrfs_report_missing_device(fs_info, devid, uuid); + btrfs_report_missing_device(fs_info, devid, uuid, false); } map->stripes[i].dev->in_fs_metadata = 1; } @@ -6577,19 +6580,28 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); if (!device) { if (!btrfs_test_opt(fs_info, DEGRADED)) { - btrfs_report_missing_device(fs_info, devid, dev_uuid); - return -EIO; + btrfs_report_missing_device(fs_info, devid, + dev_uuid, true); + return -ENOENT; } device = add_missing_dev(fs_devices, devid, dev_uuid); - if (!device) - return -ENOMEM; - btrfs_report_missing_device(fs_info, devid, dev_uuid); + if (IS_ERR(device)) { + btrfs_err(fs_info, + "failed to add missing dev %llu: %ld", + devid, PTR_ERR(device)); + return PTR_ERR(device); + } + btrfs_report_missing_device(fs_info, devid, dev_uuid, false); } else { if (!device->bdev) { - btrfs_report_missing_device(fs_info, devid, dev_uuid); - if (!btrfs_test_opt(fs_info, DEGRADED)) - return -EIO; + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_report_missing_device(fs_info, + devid, dev_uuid, true); + return -ENOENT; + } + btrfs_report_missing_device(fs_info, devid, + dev_uuid, false); } if(!device->bdev && !device->missing) { @@ -6756,12 +6768,6 @@ out_short_read: return -EIO; } -void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid, - u8 *uuid) -{ - btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", devid, uuid); -} - /* * Check if all chunks in the fs are OK for read-write degraded mount * diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6108fdfec67f..ff15208344a7 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -542,7 +542,5 @@ void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info); -void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid, - u8 *uuid); #endif diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index c248f9286366..2b52950dc2c6 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -37,6 +37,7 @@ struct workspace { z_stream strm; char *buf; struct list_head list; + int level; }; static void zlib_free_workspace(struct list_head *ws) @@ -96,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws, *total_out = 0; *total_in = 0; - if (Z_OK != zlib_deflateInit(&workspace->strm, 3)) { + if (Z_OK != zlib_deflateInit(&workspace->strm, workspace->level)) { pr_warn("BTRFS: deflateInit failed\n"); ret = -EIO; goto out; @@ -402,10 +403,22 @@ next: return ret; } +static void zlib_set_level(struct list_head *ws, unsigned int type) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + unsigned level = (type & 0xF0) >> 4; + + if (level > 9) + level = 9; + + workspace->level = level > 0 ? level : 3; +} + const struct btrfs_compress_op btrfs_zlib_compress = { .alloc_workspace = zlib_alloc_workspace, .free_workspace = zlib_free_workspace, .compress_pages = zlib_compress_pages, .decompress_bio = zlib_decompress_bio, .decompress = zlib_decompress, + .set_level = zlib_set_level, }; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 607ce47b483a..17f2dd8fddb8 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -423,10 +423,15 @@ finish: return ret; } +static void zstd_set_level(struct list_head *ws, unsigned int type) +{ +} + const struct btrfs_compress_op btrfs_zstd_compress = { .alloc_workspace = zstd_alloc_workspace, .free_workspace = zstd_free_workspace, .compress_pages = zstd_compress_pages, .decompress_bio = zstd_decompress_bio, .decompress = zstd_decompress, + .set_level = zstd_set_level, }; diff --git a/fs/buffer.c b/fs/buffer.c index 170df856bdb9..49b7e9bdcd1d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1692,7 +1692,8 @@ static struct buffer_head *create_page_buffers(struct page *page, struct inode * BUG_ON(!PageLocked(page)); if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state); + create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits), + b_state); return page_buffers(page); } @@ -1978,8 +1979,8 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, case IOMAP_MAPPED: if (offset >= i_size_read(inode)) set_buffer_new(bh); - bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) + - ((offset - iomap->offset) >> inode->i_blkbits); + bh->b_blocknr = (iomap->addr + offset - iomap->offset) >> + inode->i_blkbits; set_buffer_mapped(bh); break; } diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile index 32cbab0ffce3..891dedda5905 100644 --- a/fs/cachefiles/Makefile +++ b/fs/cachefiles/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for caching in a mounted filesystem # diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 85a4230b9bff..174f5709e508 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for CEPH filesystem. # diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b3e3edc09d80..4d622654bfbc 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/backing-dev.h> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 157fe59fbabe..ff5d32cf9578 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/fs.h> @@ -1991,6 +1992,7 @@ static int try_flush_caps(struct inode *inode, u64 *ptid) retry: spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { + spin_unlock(&ci->i_ceph_lock); dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); goto out; } @@ -2008,8 +2010,10 @@ retry: mutex_lock(&session->s_mutex); goto retry; } - if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) + if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) { + spin_unlock(&ci->i_ceph_lock); goto out; + } flushing = __mark_caps_flushing(inode, session, true, &flush_tid, &oldest_flush_tid); diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c index bdce8b1fbd06..6f67d5b884a0 100644 --- a/fs/ceph/ceph_frag.c +++ b/fs/ceph/ceph_frag.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Ceph 'frag' type */ diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index d635496ea189..644def813754 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/device.h> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 019c2036d36f..8a5266699b67 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/spinlock.h> diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 7df550c13d7f..3c59ad180ef0 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/exportfs.h> diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 65a6fa12c857..5c17125f45c7 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/module.h> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 373dab5173ca..f2550a076edc 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/module.h> diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 4c9c72f26eb9..851aa69ec8f0 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/in.h> diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index c77028afb1e1..51f7f1d39a94 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef FS_CEPH_IOCTL_H #define FS_CEPH_IOCTL_H diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 8cd63e8123d8..e7cce412f2cf 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/file.h> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f23c820daaed..0687ab3c3267 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/fs.h> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 636d6b2ec49c..837ac4b087a0 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FS_CEPH_MDS_CLIENT_H #define _FS_CEPH_MDS_CLIENT_H diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 33ced4c22732..44e53abeb32a 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/bug.h> diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 7fc0b850c352..8a2ca41e4b97 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/sort.h> diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 913dea163d5c..4a79f3632260 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Ceph fs string constants */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 279a2f401cf5..3e27a28aa44a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FS_CEPH_SUPER_H #define _FS_CEPH_SUPER_H diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 3542b2c364cf..e1c4e0b12b4c 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/ceph/pagelist.h> diff --git a/fs/char_dev.c b/fs/char_dev.c index ebcc8fb3fa66..a65e4a56318c 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/char_dev.c * diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index f7243617316c..d5b2e12b5d02 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -5,9 +5,14 @@ config CIFS select CRYPTO select CRYPTO_MD4 select CRYPTO_MD5 + select CRYPTO_SHA256 + select CRYPTO_CMAC select CRYPTO_HMAC select CRYPTO_ARC4 + select CRYPTO_AEAD2 + select CRYPTO_CCM select CRYPTO_ECB + select CRYPTO_AES select CRYPTO_DES help This is the client VFS module for the SMB3 family of NAS protocols, diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 5e853a395b92..7134f182720b 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for Linux CIFS VFS client # diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index de5b2e1fcce5..e185b2853eab 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -661,7 +661,9 @@ struct TCP_Server_Info { #endif unsigned int max_read; unsigned int max_write; - __u8 preauth_hash[512]; +#ifdef CONFIG_CIFS_SMB311 + __u8 preauth_sha_hash[64]; /* save initital negprot hash */ +#endif /* 3.1.1 */ struct delayed_work reconnect; /* reconnect workqueue job */ struct mutex reconnect_mutex; /* prevent simultaneous reconnects */ unsigned long echo_interval; @@ -849,7 +851,9 @@ struct cifs_ses { __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE]; __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; - __u8 preauth_hash[512]; +#ifdef CONFIG_CIFS_SMB311 + __u8 preauth_sha_hash[64]; +#endif /* 3.1.1 */ }; static inline bool diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index e702d48bd023..81ba6e0d88d8 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -204,7 +204,8 @@ check_name(struct dentry *direntry, struct cifs_tcon *tcon) struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); int i; - if (unlikely(direntry->d_name.len > + if (unlikely(tcon->fsAttrInfo.MaxPathNameComponentLength && + direntry->d_name.len > le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength))) return -ENAMETOOLONG; @@ -520,7 +521,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, rc = check_name(direntry, tcon); if (rc) - goto out_free_xid; + goto out; server = tcon->ses->server; diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 7ca9808a0daa..62c88dfed57b 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -214,7 +214,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_DATATYPE_MISALIGNMENT, -EIO, "STATUS_DATATYPE_MISALIGNMENT"}, {STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"}, {STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"}, - {STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"}, + {STATUS_BUFFER_OVERFLOW, -E2BIG, "STATUS_BUFFER_OVERFLOW"}, {STATUS_NO_MORE_FILES, -ENODATA, "STATUS_NO_MORE_FILES"}, {STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"}, {STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"}, diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 0dafdbae1f8c..e06740436b92 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -522,6 +522,7 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; struct smb2_file_full_ea_info *smb2_data; + int ea_buf_size = SMB2_MIN_EA_BUF; utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) @@ -541,14 +542,32 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, return rc; } - smb2_data = kzalloc(SMB2_MAX_EA_BUF, GFP_KERNEL); - if (smb2_data == NULL) { - SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); - return -ENOMEM; + while (1) { + smb2_data = kzalloc(ea_buf_size, GFP_KERNEL); + if (smb2_data == NULL) { + SMB2_close(xid, tcon, fid.persistent_fid, + fid.volatile_fid); + return -ENOMEM; + } + + rc = SMB2_query_eas(xid, tcon, fid.persistent_fid, + fid.volatile_fid, + ea_buf_size, smb2_data); + + if (rc != -E2BIG) + break; + + kfree(smb2_data); + ea_buf_size <<= 1; + + if (ea_buf_size > SMB2_MAX_EA_BUF) { + cifs_dbg(VFS, "EA size is too large\n"); + SMB2_close(xid, tcon, fid.persistent_fid, + fid.volatile_fid); + return -ENOMEM; + } } - rc = SMB2_query_eas(xid, tcon, fid.persistent_fid, fid.volatile_fid, - smb2_data); SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); if (!rc) @@ -2068,22 +2087,6 @@ init_sg(struct smb_rqst *rqst, u8 *sign) return sg; } -struct cifs_crypt_result { - int err; - struct completion completion; -}; - -static void cifs_crypt_complete(struct crypto_async_request *req, int err) -{ - struct cifs_crypt_result *res = req->data; - - if (err == -EINPROGRESS) - return; - - res->err = err; - complete(&res->completion); -} - static int smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) { @@ -2124,12 +2127,10 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) struct aead_request *req; char *iv; unsigned int iv_len; - struct cifs_crypt_result result = {0, }; + DECLARE_CRYPTO_WAIT(wait); struct crypto_aead *tfm; unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); - init_completion(&result.completion); - rc = smb2_get_enc_key(server, tr_hdr->SessionId, enc, key); if (rc) { cifs_dbg(VFS, "%s: Could not get %scryption key\n", __func__, @@ -2189,14 +2190,10 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) aead_request_set_ad(req, assoc_data_len); aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - cifs_crypt_complete, &result); + crypto_req_done, &wait); - rc = enc ? crypto_aead_encrypt(req) : crypto_aead_decrypt(req); - - if (rc == -EINPROGRESS || rc == -EBUSY) { - wait_for_completion(&result.completion); - rc = result.err; - } + rc = crypto_wait_req(enc ? crypto_aead_encrypt(req) + : crypto_aead_decrypt(req), &wait); if (!rc && enc) memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 6f0e6343c15e..5331631386a2 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -648,7 +648,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) { int rc = 0; struct validate_negotiate_info_req vneg_inbuf; - struct validate_negotiate_info_rsp *pneg_rsp; + struct validate_negotiate_info_rsp *pneg_rsp = NULL; u32 rsplen; u32 inbuflen; /* max of 4 dialects */ @@ -727,8 +727,9 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) rsplen); /* relax check since Mac returns max bufsize allowed on ioctl */ - if (rsplen > CIFSMaxBufSize) - return -EIO; + if ((rsplen > CIFSMaxBufSize) + || (rsplen < sizeof(struct validate_negotiate_info_rsp))) + goto err_rsp_free; } /* check validate negotiate info response matches what we got earlier */ @@ -747,10 +748,13 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) /* validate negotiate successful */ cifs_dbg(FYI, "validate negotiate info successful\n"); + kfree(pneg_rsp); return 0; vneg_out: cifs_dbg(VFS, "protocol revalidation - security settings mismatch\n"); +err_rsp_free: + kfree(pneg_rsp); return -EIO; } @@ -1255,7 +1259,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, struct smb2_tree_connect_req *req; struct smb2_tree_connect_rsp *rsp = NULL; struct kvec iov[2]; - struct kvec rsp_iov; + struct kvec rsp_iov = { NULL, 0 }; int rc = 0; int resp_buftype; int unc_path_len; @@ -1372,7 +1376,7 @@ tcon_exit: return rc; tcon_error_exit: - if (rsp->hdr.sync_hdr.Status == STATUS_BAD_NETWORK_NAME) { + if (rsp && rsp->hdr.sync_hdr.Status == STATUS_BAD_NETWORK_NAME) { cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); } goto tcon_exit; @@ -1975,6 +1979,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, } else iov[0].iov_len = get_rfc1002_length(req) + 4; + /* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */ + if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) + req->hdr.sync_hdr.Flags |= SMB2_FLAGS_SIGNED; rc = SendReceive2(xid, ses, iov, n_iov, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); @@ -2191,9 +2198,13 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, req->PersistentFileId = persistent_fid; req->VolatileFileId = volatile_fid; req->AdditionalInformation = cpu_to_le32(additional_info); - /* 4 for rfc1002 length field and 1 for Buffer */ - req->InputBufferOffset = - cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4); + + /* + * We do not use the input buffer (do not send extra byte) + */ + req->InputBufferOffset = 0; + inc_rfc1001_len(req, -1); + req->OutputBufferLength = cpu_to_le32(output_len); iov[0].iov_base = (char *)req; @@ -2233,12 +2244,12 @@ qinf_exit: } int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, - struct smb2_file_full_ea_info *data) + u64 persistent_fid, u64 volatile_fid, + int ea_buf_size, struct smb2_file_full_ea_info *data) { return query_info(xid, tcon, persistent_fid, volatile_fid, FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, 0, - SMB2_MAX_EA_BUF, + ea_buf_size, sizeof(struct smb2_file_full_ea_info), (void **)&data, NULL); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 6c9653a130c8..c2ec934be968 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -832,7 +832,7 @@ struct smb2_flush_rsp { /* Channel field for read and write: exactly one of following flags can be set*/ #define SMB2_CHANNEL_NONE 0x00000000 #define SMB2_CHANNEL_RDMA_V1 0x00000001 /* SMB3 or later */ -#define SMB2_CHANNEL_RDMA_V1_INVALIDATE 0x00000001 /* SMB3.02 or later */ +#define SMB2_CHANNEL_RDMA_V1_INVALIDATE 0x00000002 /* SMB3.02 or later */ /* SMB2 read request without RFC1001 length at the beginning */ struct smb2_read_plain_req { @@ -1178,7 +1178,8 @@ struct smb2_file_link_info { /* encoding of request for level 11 */ char FileName[0]; /* Name to be assigned to new link */ } __packed; /* level 11 Set */ -#define SMB2_MAX_EA_BUF 2048 +#define SMB2_MIN_EA_BUF 2048 +#define SMB2_MAX_EA_BUF 65536 struct smb2_file_full_ea_info { /* encoding of response for level 15 */ __le32 next_entry_offset; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 003217099ef3..e9ab5227e7a8 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -134,6 +134,7 @@ extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); extern int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, + int ea_buf_size, struct smb2_file_full_ea_info *data); extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 67367cf1f8cd..99493946e2f9 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -390,6 +390,7 @@ generate_smb30signingkey(struct cifs_ses *ses) return generate_smb3signingkey(ses, &triplet); } +#ifdef CONFIG_CIFS_SMB311 int generate_smb311signingkey(struct cifs_ses *ses) @@ -398,25 +399,26 @@ generate_smb311signingkey(struct cifs_ses *ses) struct derivation *d; d = &triplet.signing; - d->label.iov_base = "SMB2AESCMAC"; - d->label.iov_len = 12; - d->context.iov_base = "SmbSign"; - d->context.iov_len = 8; + d->label.iov_base = "SMBSigningKey"; + d->label.iov_len = 14; + d->context.iov_base = ses->preauth_sha_hash; + d->context.iov_len = 64; d = &triplet.encryption; - d->label.iov_base = "SMB2AESCCM"; - d->label.iov_len = 11; - d->context.iov_base = "ServerIn "; - d->context.iov_len = 10; + d->label.iov_base = "SMBC2SCipherKey"; + d->label.iov_len = 16; + d->context.iov_base = ses->preauth_sha_hash; + d->context.iov_len = 64; d = &triplet.decryption; - d->label.iov_base = "SMB2AESCCM"; - d->label.iov_len = 11; - d->context.iov_base = "ServerOut"; - d->context.iov_len = 10; + d->label.iov_base = "SMBS2CCipherKey"; + d->label.iov_len = 16; + d->context.iov_base = ses->preauth_sha_hash; + d->context.iov_len = 64; return generate_smb3signingkey(ses, &triplet); } +#endif /* 311 */ int smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 5bb630a769e0..201fc08a8b4f 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Cache operations for Coda. * For Linux 2.1: (C) 1997 Carnegie Mellon University diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index f13e09057c6b..845b5a66952a 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* cnode related routines for the coda kernel code (C) 1996 Peter Braam */ diff --git a/fs/coda/coda_cache.h b/fs/coda/coda_cache.h index c910b5eb1ceb..c9f7a77c013e 100644 --- a/fs/coda/coda_cache.h +++ b/fs/coda/coda_cache.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Coda filesystem -- Linux Minicache * * Copyright (C) 1989 - 1997 Carnegie Mellon University diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h index c64075213218..d702ba1a2bf9 100644 --- a/fs/coda/coda_fs_i.h +++ b/fs/coda/coda_fs_i.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * coda_fs_i.h * diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h index 381c993b1427..bb0b3e0ed6c2 100644 --- a/fs/coda/coda_int.h +++ b/fs/coda/coda_int.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _CODA_INT_ #define _CODA_INT_ diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index f1714cfb589c..ca599df0dcb1 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Inode operations for Coda filesystem * Original version: (C) 1996 P. Braam and M. Callahan diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index d3c361883c28..126155cadfa9 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Coda File System, Linux Kernel module * diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 274ab5586dd0..00876ddadb43 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Directory operations for Coda filesystem diff --git a/fs/coda/file.c b/fs/coda/file.c index 363402fcb3ed..1cbc1f2298ee 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * File operations for Coda. * Original version: (C) 1996 Peter Braam diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 6058df380cc0..6f0a6a4d5faa 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Super block/filesystem wide operations * diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index b0b9cda41928..e0c17b7dccce 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Pioctl operations for Coda. * Original version: (C) 1996 Peter Braam diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c index 03736e20d720..202297d156df 100644 --- a/fs/coda/symlink.c +++ b/fs/coda/symlink.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Symlink inode operations for Coda filesystem * Original version: (C) 1996 P. Braam and M. Callahan diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c index 34218a8a28cd..0301d45000a8 100644 --- a/fs/coda/sysctl.c +++ b/fs/coda/sysctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Sysctl operations for Coda filesystem * Original version: (C) 1996 P. Braam and M. Callahan diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index e82357c89979..a37f003530d7 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Mostly platform independent upcall operations to Venus: * -- upcalls diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index d27b326d96f4..bd5d91e119ca 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ioctl32.c: Conversion between 32bit and 64bit native ioctls. * diff --git a/fs/coredump.c b/fs/coredump.c index 0eec03696707..52c63d6c9143 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c index ec4f1d4fdad0..975d98fc26b5 100644 --- a/fs/cramfs/uncompress.c +++ b/fs/cramfs/uncompress.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * uncompress.c * diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile index 9f6607f17b53..cb496989a6b6 100644 --- a/fs/crypto/Makefile +++ b/fs/crypto/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o -fscrypto-y := crypto.o fname.o policy.o keyinfo.o +fscrypto-y := crypto.o fname.o hooks.o keyinfo.o policy.o fscrypto-$(CONFIG_BLOCK) += bio.o diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 483784d5eb73..0d5e6a569d58 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This contains encryption functions for per-file encryption. * diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index c7835df7e7b8..732a786cce9d 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -126,21 +126,6 @@ struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) } EXPORT_SYMBOL(fscrypt_get_ctx); -/** - * page_crypt_complete() - completion callback for page crypto - * @req: The asynchronous cipher request context - * @res: The result of the cipher operation - */ -static void page_crypt_complete(struct crypto_async_request *req, int res) -{ - struct fscrypt_completion_result *ecr = req->data; - - if (res == -EINPROGRESS) - return; - ecr->res = res; - complete(&ecr->completion); -} - int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, u64 lblk_num, struct page *src_page, struct page *dest_page, unsigned int len, @@ -151,7 +136,7 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, u8 padding[FS_IV_SIZE - sizeof(__le64)]; } iv; struct skcipher_request *req = NULL; - DECLARE_FS_COMPLETION_RESULT(ecr); + DECLARE_CRYPTO_WAIT(wait); struct scatterlist dst, src; struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; @@ -179,7 +164,7 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - page_crypt_complete, &ecr); + crypto_req_done, &wait); sg_init_table(&dst, 1); sg_set_page(&dst, dest_page, len, offs); @@ -187,14 +172,9 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, sg_set_page(&src, src_page, len, offs); skcipher_request_set_crypt(req, &src, &dst, len, &iv); if (rw == FS_DECRYPT) - res = crypto_skcipher_decrypt(req); + res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); else - res = crypto_skcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); - wait_for_completion(&ecr.completion); - res = ecr.res; - } + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res) { printk_ratelimited(KERN_ERR @@ -340,7 +320,7 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; dir = dget_parent(dentry); - if (!d_inode(dir)->i_sb->s_cop->is_encrypted(d_inode(dir))) { + if (!IS_ENCRYPTED(d_inode(dir))) { dput(dir); return 0; } @@ -410,11 +390,8 @@ int fscrypt_initialize(unsigned int cop_flags) { int i, res = -ENOMEM; - /* - * No need to allocate a bounce page pool if there already is one or - * this FS won't use it. - */ - if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool) + /* No need to allocate a bounce page pool if this FS won't use it. */ + if (cop_flags & FS_CFLG_OWN_PAGES) return 0; mutex_lock(&fscrypt_init_mutex); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index ad9f814fdead..305541bcd108 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This contains functions for filename crypto management * @@ -15,21 +16,6 @@ #include "fscrypt_private.h" /** - * fname_crypt_complete() - completion callback for filename crypto - * @req: The asynchronous cipher request context - * @res: The result of the cipher operation - */ -static void fname_crypt_complete(struct crypto_async_request *req, int res) -{ - struct fscrypt_completion_result *ecr = req->data; - - if (res == -EINPROGRESS) - return; - ecr->res = res; - complete(&ecr->completion); -} - -/** * fname_encrypt() - encrypt a filename * * The caller must have allocated sufficient memory for the @oname string. @@ -40,7 +26,7 @@ static int fname_encrypt(struct inode *inode, const struct qstr *iname, struct fscrypt_str *oname) { struct skcipher_request *req = NULL; - DECLARE_FS_COMPLETION_RESULT(ecr); + DECLARE_CRYPTO_WAIT(wait); struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; @@ -76,17 +62,12 @@ static int fname_encrypt(struct inode *inode, } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - fname_crypt_complete, &ecr); + crypto_req_done, &wait); sg_init_one(&sg, oname->name, cryptlen); skcipher_request_set_crypt(req, &sg, &sg, cryptlen, iv); /* Do the encryption */ - res = crypto_skcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - /* Request is being completed asynchronously; wait for it */ - wait_for_completion(&ecr.completion); - res = ecr.res; - } + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res < 0) { printk_ratelimited(KERN_ERR @@ -110,7 +91,7 @@ static int fname_decrypt(struct inode *inode, struct fscrypt_str *oname) { struct skcipher_request *req = NULL; - DECLARE_FS_COMPLETION_RESULT(ecr); + DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; @@ -131,7 +112,7 @@ static int fname_decrypt(struct inode *inode, } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - fname_crypt_complete, &ecr); + crypto_req_done, &wait); /* Initialize IV */ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); @@ -140,11 +121,7 @@ static int fname_decrypt(struct inode *inode, sg_init_one(&src_sg, iname->name, iname->len); sg_init_one(&dst_sg, oname->name, oname->len); skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); - res = crypto_skcipher_decrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - wait_for_completion(&ecr.completion); - res = ecr.res; - } + res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); skcipher_request_free(req); if (res < 0) { printk_ratelimited(KERN_ERR @@ -382,8 +359,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; - if (!dir->i_sb->s_cop->is_encrypted(dir) || - fscrypt_is_dot_dotdot(iname)) { + if (!IS_ENCRYPTED(dir) || fscrypt_is_dot_dotdot(iname)) { fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index a1d5021c31ef..c0b4f5597e1a 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * fscrypt_private.h * @@ -11,7 +12,8 @@ #ifndef _FSCRYPT_PRIVATE_H #define _FSCRYPT_PRIVATE_H -#include <linux/fscrypt_supp.h> +#define __FS_HAS_ENCRYPTION 1 +#include <linux/fscrypt.h> #include <crypto/hash.h> /* Encryption parameters */ @@ -69,16 +71,6 @@ typedef enum { #define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 #define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002 -struct fscrypt_completion_result { - struct completion completion; - int res; -}; - -#define DECLARE_FS_COMPLETION_RESULT(ecr) \ - struct fscrypt_completion_result ecr = { \ - COMPLETION_INITIALIZER_ONSTACK((ecr).completion), 0 } - - /* crypto.c */ extern int fscrypt_initialize(unsigned int cop_flags); extern struct workqueue_struct *fscrypt_read_workqueue; diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c new file mode 100644 index 000000000000..9f5fb2eb9cf7 --- /dev/null +++ b/fs/crypto/hooks.c @@ -0,0 +1,112 @@ +/* + * fs/crypto/hooks.c + * + * Encryption hooks for higher-level filesystem operations. + */ + +#include <linux/ratelimit.h> +#include "fscrypt_private.h" + +/** + * fscrypt_file_open - prepare to open a possibly-encrypted regular file + * @inode: the inode being opened + * @filp: the struct file being set up + * + * Currently, an encrypted regular file can only be opened if its encryption key + * is available; access to the raw encrypted contents is not supported. + * Therefore, we first set up the inode's encryption key (if not already done) + * and return an error if it's unavailable. + * + * We also verify that if the parent directory (from the path via which the file + * is being opened) is encrypted, then the inode being opened uses the same + * encryption policy. This is needed as part of the enforcement that all files + * in an encrypted directory tree use the same encryption policy, as a + * protection against certain types of offline attacks. Note that this check is + * needed even when opening an *unencrypted* file, since it's forbidden to have + * an unencrypted file in an encrypted directory. + * + * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code + */ +int fscrypt_file_open(struct inode *inode, struct file *filp) +{ + int err; + struct dentry *dir; + + err = fscrypt_require_key(inode); + if (err) + return err; + + dir = dget_parent(file_dentry(filp)); + if (IS_ENCRYPTED(d_inode(dir)) && + !fscrypt_has_permitted_context(d_inode(dir), inode)) { + pr_warn_ratelimited("fscrypt: inconsistent encryption contexts: %lu/%lu", + d_inode(dir)->i_ino, inode->i_ino); + err = -EPERM; + } + dput(dir); + return err; +} +EXPORT_SYMBOL_GPL(fscrypt_file_open); + +int __fscrypt_prepare_link(struct inode *inode, struct inode *dir) +{ + int err; + + err = fscrypt_require_key(dir); + if (err) + return err; + + if (!fscrypt_has_permitted_context(dir, inode)) + return -EPERM; + + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_link); + +int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + int err; + + err = fscrypt_require_key(old_dir); + if (err) + return err; + + err = fscrypt_require_key(new_dir); + if (err) + return err; + + if (old_dir != new_dir) { + if (IS_ENCRYPTED(new_dir) && + !fscrypt_has_permitted_context(new_dir, + d_inode(old_dentry))) + return -EPERM; + + if ((flags & RENAME_EXCHANGE) && + IS_ENCRYPTED(old_dir) && + !fscrypt_has_permitted_context(old_dir, + d_inode(new_dentry))) + return -EPERM; + } + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_rename); + +int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry) +{ + int err = fscrypt_get_encryption_info(dir); + + if (err) + return err; + + if (fscrypt_has_encryption_key(dir)) { + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY; + spin_unlock(&dentry->d_lock); + } + + d_set_d_op(dentry, &fscrypt_d_ops); + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 018c588c7ac3..5e6e846f5a24 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * key management facility for FS encryption support. * @@ -17,17 +18,6 @@ static struct crypto_shash *essiv_hash_tfm; -static void derive_crypt_complete(struct crypto_async_request *req, int rc) -{ - struct fscrypt_completion_result *ecr = req->data; - - if (rc == -EINPROGRESS) - return; - - ecr->res = rc; - complete(&ecr->completion); -} - /** * derive_key_aes() - Derive a key using AES-128-ECB * @deriving_key: Encryption key used for derivation. @@ -42,7 +32,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], { int res = 0; struct skcipher_request *req = NULL; - DECLARE_FS_COMPLETION_RESULT(ecr); + DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); @@ -59,7 +49,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - derive_crypt_complete, &ecr); + crypto_req_done, &wait); res = crypto_skcipher_setkey(tfm, deriving_key, FS_AES_128_ECB_KEY_SIZE); if (res < 0) @@ -69,11 +59,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], sg_init_one(&dst_sg, derived_raw_key, source_key->size); skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size, NULL); - res = crypto_skcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - wait_for_completion(&ecr.completion); - res = ecr.res; - } + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); out: skcipher_request_free(req); crypto_free_skcipher(tfm); @@ -109,6 +95,11 @@ static int validate_user_key(struct fscrypt_info *crypt_info, goto out; } ukp = user_key_payload_locked(keyring_key); + if (!ukp) { + /* key was revoked before we acquired its semaphore */ + res = -EKEYREVOKED; + goto out; + } if (ukp->datalen != sizeof(struct fscrypt_key)) { res = -EINVAL; goto out; @@ -268,7 +259,7 @@ int fscrypt_get_encryption_info(struct inode *inode) res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { if (!fscrypt_dummy_context_enabled(inode) || - inode->i_sb->s_cop->is_encrypted(inode)) + IS_ENCRYPTED(inode)) return res; /* Fake up a context for an unencrypted directory */ memset(&ctx, 0, sizeof(ctx)); @@ -368,7 +359,7 @@ void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) struct fscrypt_info *prev; if (ci == NULL) - ci = ACCESS_ONCE(inode->i_crypt_info); + ci = READ_ONCE(inode->i_crypt_info); if (ci == NULL) return; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index ce07a86200f3..c6d431a5cce9 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Encryption policy functions for per-file encryption support. * @@ -109,7 +110,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) struct fscrypt_policy policy; int res; - if (!inode->i_sb->s_cop->is_encrypted(inode)) + if (!IS_ENCRYPTED(inode)) return -ENODATA; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); @@ -166,11 +167,11 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) return 1; /* No restrictions if the parent directory is unencrypted */ - if (!cops->is_encrypted(parent)) + if (!IS_ENCRYPTED(parent)) return 1; /* Encrypted directories must not contain unencrypted files */ - if (!cops->is_encrypted(child)) + if (!IS_ENCRYPTED(child)) return 0; /* @@ -938,7 +938,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range); static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) { - return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); + return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9; } static loff_t diff --git a/fs/dcache.c b/fs/dcache.c index f90141387f01..bcc9f6981569 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -231,7 +231,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c { /* * Be careful about RCU walk racing with rename: - * use 'lockless_dereference' to fetch the name pointer. + * use 'READ_ONCE' to fetch the name pointer. * * NOTE! Even if a rename will mean that the length * was not loaded atomically, we don't care. The @@ -245,7 +245,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c * early because the data cannot match (there can * be no NUL in the ct/tcount data) */ - const unsigned char *cs = lockless_dereference(dentry->d_name.name); + const unsigned char *cs = READ_ONCE(dentry->d_name.name); return dentry_string_cmp(cs, ct, tcount); } @@ -630,7 +630,7 @@ static inline struct dentry *lock_parent(struct dentry *dentry) rcu_read_lock(); spin_unlock(&dentry->d_lock); again: - parent = ACCESS_ONCE(dentry->d_parent); + parent = READ_ONCE(dentry->d_parent); spin_lock(&parent->d_lock); /* * We can't blindly lock dentry until we are sure @@ -721,7 +721,7 @@ static inline bool fast_dput(struct dentry *dentry) * around with a zero refcount. */ smp_rmb(); - d_flags = ACCESS_ONCE(dentry->d_flags); + d_flags = READ_ONCE(dentry->d_flags); d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED; /* Nothing to do? Dropping the reference was all we needed? */ @@ -850,11 +850,11 @@ struct dentry *dget_parent(struct dentry *dentry) * locking. */ rcu_read_lock(); - ret = ACCESS_ONCE(dentry->d_parent); + ret = READ_ONCE(dentry->d_parent); gotref = lockref_get_not_zero(&ret->d_lockref); rcu_read_unlock(); if (likely(gotref)) { - if (likely(ret == ACCESS_ONCE(dentry->d_parent))) + if (likely(ret == READ_ONCE(dentry->d_parent))) return ret; dput(ret); } @@ -3040,7 +3040,7 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen) * @buflen: allocated length of the buffer * @name: name string and length qstr structure * - * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to + * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to * make sure that either the old or the new name pointer and length are * fetched. However, there may be mismatch between length and pointer. * The length cannot be trusted, we need to copy it byte-by-byte until @@ -3054,8 +3054,8 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen) */ static int prepend_name(char **buffer, int *buflen, const struct qstr *name) { - const char *dname = ACCESS_ONCE(name->name); - u32 dlen = ACCESS_ONCE(name->len); + const char *dname = READ_ONCE(name->name); + u32 dlen = READ_ONCE(name->len); char *p; smp_read_barrier_depends(); @@ -3120,7 +3120,7 @@ restart: struct dentry * parent; if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { - struct mount *parent = ACCESS_ONCE(mnt->mnt_parent); + struct mount *parent = READ_ONCE(mnt->mnt_parent); /* Escaped? */ if (dentry != vfsmnt->mnt_root) { bptr = *buffer; @@ -3130,7 +3130,7 @@ restart: } /* Global root? */ if (mnt != parent) { - dentry = ACCESS_ONCE(mnt->mnt_mountpoint); + dentry = READ_ONCE(mnt->mnt_mountpoint); mnt = parent; vfsmnt = &mnt->mnt; continue; diff --git a/fs/direct-io.c b/fs/direct-io.c index 96415c65bbdc..98fe1325da9d 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -45,6 +45,12 @@ #define DIO_PAGES 64 /* + * Flags for dio_complete() + */ +#define DIO_COMPLETE_ASYNC 0x01 /* This is async IO */ +#define DIO_COMPLETE_INVALIDATE 0x02 /* Can invalidate pages */ + +/* * This code generally works in units of "dio_blocks". A dio_block is * somewhere between the hard sector size and the filesystem block size. it * is determined on a per-invocation basis. When talking to the filesystem @@ -225,7 +231,7 @@ static inline struct page *dio_get_page(struct dio *dio, * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) +static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) { loff_t offset = dio->iocb->ki_pos; ssize_t transferred = 0; @@ -259,14 +265,27 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) if (ret == 0) ret = transferred; + if (dio->end_io) { + // XXX: ki_pos?? + err = dio->end_io(dio->iocb, offset, ret, dio->private); + if (err) + ret = err; + } + /* * Try again to invalidate clean pages which might have been cached by * non-direct readahead, or faulted in by get_user_pages() if the source * of the write was an mmap'ed region of the file we're writing. Either * one is a pretty crazy thing to do, so we don't support it 100%. If * this invalidation fails, tough, the write still worked... + * + * And this page cache invalidation has to be after dio->end_io(), as + * some filesystems convert unwritten extents to real allocations in + * end_io() when necessary, otherwise a racing buffer read would cache + * zeros from unwritten extents. */ - if (ret > 0 && dio->op == REQ_OP_WRITE && + if (flags & DIO_COMPLETE_INVALIDATE && + ret > 0 && dio->op == REQ_OP_WRITE && dio->inode->i_mapping->nrpages) { err = invalidate_inode_pages2_range(dio->inode->i_mapping, offset >> PAGE_SHIFT, @@ -274,18 +293,10 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) WARN_ON_ONCE(err); } - if (dio->end_io) { - - // XXX: ki_pos?? - err = dio->end_io(dio->iocb, offset, ret, dio->private); - if (err) - ret = err; - } - if (!(dio->flags & DIO_SKIP_DIO_COUNT)) inode_dio_end(dio->inode); - if (is_async) { + if (flags & DIO_COMPLETE_ASYNC) { /* * generic_write_sync expects ki_pos to have been updated * already, but the submission path only does this for @@ -306,7 +317,7 @@ static void dio_aio_complete_work(struct work_struct *work) { struct dio *dio = container_of(work, struct dio, complete_work); - dio_complete(dio, 0, true); + dio_complete(dio, 0, DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE); } static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio); @@ -348,7 +359,7 @@ static void dio_bio_end_aio(struct bio *bio) queue_work(dio->inode->i_sb->s_dio_done_wq, &dio->complete_work); } else { - dio_complete(dio, 0, true); + dio_complete(dio, 0, DIO_COMPLETE_ASYNC); } } } @@ -1141,7 +1152,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) { - unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); + unsigned i_blkbits = READ_ONCE(inode->i_blkbits); unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; @@ -1360,7 +1371,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio_await_completion(dio); if (drop_refcount(dio) == 0) { - retval = dio_complete(dio, retval, false); + retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE); } else BUG_ON(retval != -EIOCBQUEUED); diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile index ca1c9124c8ce..3545fdafc6fb 100644 --- a/fs/dlm/Makefile +++ b/fs/dlm/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DLM) += dlm.o dlm-y := ast.o \ config.o \ diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 07fed838d8fd..562fa8c3edff 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -181,6 +181,8 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, spin_lock(&dlm_cb_seq_spin); new_seq = ++dlm_cb_seq; + if (!dlm_cb_seq) + new_seq = ++dlm_cb_seq; spin_unlock(&dlm_cb_seq_spin); if (lkb->lkb_flags & DLM_IFL_USER) { diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index d4aaddec1b16..cc91963683de 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1003,7 +1003,6 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len, if (r->res_master_nodeid == our_nodeid) { log_error(ls, "from_master %d our_master", from_nodeid); dlm_dump_rsb(r); - dlm_send_rcom_lookup_dump(r, from_nodeid); goto out_found; } @@ -2465,14 +2464,12 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, if (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) { lkb->lkb_grmode = DLM_LOCK_NL; lkb->lkb_sbflags |= DLM_SBF_DEMOTED; - } else if (!(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { - if (err) - *err = -EDEADLK; - else { - log_print("can_be_granted deadlock %x now %d", - lkb->lkb_id, now); - dlm_dump_rsb(r); - } + } else if (err) { + *err = -EDEADLK; + } else { + log_print("can_be_granted deadlock %x now %d", + lkb->lkb_id, now); + dlm_dump_rsb(r); } goto out; } @@ -2501,13 +2498,6 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, return rv; } -/* FIXME: I don't think that can_be_granted() can/will demote or find deadlock - for locks pending on the convert list. Once verified (watch for these - log_prints), we should be able to just call _can_be_granted() and not - bother with the demote/deadlk cases here (and there's no easy way to deal - with a deadlk here, we'd have to generate something like grant_lock with - the deadlk error.) */ - /* Returns the highest requested mode of all blocked conversions; sets cw if there's a blocked conversion to DLM_LOCK_CW. */ @@ -2545,9 +2535,22 @@ static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw, } if (deadlk) { - log_print("WARN: pending deadlock %x node %d %s", - lkb->lkb_id, lkb->lkb_nodeid, r->res_name); - dlm_dump_rsb(r); + /* + * If DLM_LKB_NODLKWT flag is set and conversion + * deadlock is detected, we request blocking AST and + * down (or cancel) conversion. + */ + if (lkb->lkb_exflags & DLM_LKF_NODLCKWT) { + if (lkb->lkb_highbast < lkb->lkb_rqmode) { + queue_bast(r, lkb, lkb->lkb_rqmode); + lkb->lkb_highbast = lkb->lkb_rqmode; + } + } else { + log_print("WARN: pending deadlock %x node %d %s", + lkb->lkb_id, lkb->lkb_nodeid, + r->res_name); + dlm_dump_rsb(r); + } continue; } @@ -3123,7 +3126,7 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) deadlock, so we leave it on the granted queue and return EDEADLK in the ast for the convert. */ - if (deadlk) { + if (deadlk && !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { /* it's left on the granted queue */ revert_lock(r, lkb); queue_cast(r, lkb, -EDEADLK); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 4813d0e0cd9b..05707850f93a 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -107,11 +107,11 @@ struct connection { unsigned long flags; #define CF_READ_PENDING 1 #define CF_WRITE_PENDING 2 -#define CF_CONNECT_PENDING 3 #define CF_INIT_PENDING 4 #define CF_IS_OTHERCON 5 #define CF_CLOSE 6 #define CF_APP_LIMITED 7 +#define CF_CLOSING 8 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; int (*rx_action) (struct connection *); /* What to do when active */ @@ -124,10 +124,6 @@ struct connection { struct connection *othercon; struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ - void (*orig_error_report)(struct sock *); - void (*orig_data_ready)(struct sock *); - void (*orig_state_change)(struct sock *); - void (*orig_write_space)(struct sock *); }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -150,6 +146,13 @@ struct dlm_node_addr { struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; }; +static struct listen_sock_callbacks { + void (*sk_error_report)(struct sock *); + void (*sk_data_ready)(struct sock *); + void (*sk_state_change)(struct sock *); + void (*sk_write_space)(struct sock *); +} listen_sock; + static LIST_HEAD(dlm_node_addrs); static DEFINE_SPINLOCK(dlm_node_addrs_spin); @@ -408,17 +411,23 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) /* Data available on socket or listen socket received a connect */ static void lowcomms_data_ready(struct sock *sk) { - struct connection *con = sock2con(sk); + struct connection *con; + + read_lock_bh(&sk->sk_callback_lock); + con = sock2con(sk); if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) queue_work(recv_workqueue, &con->rwork); + read_unlock_bh(&sk->sk_callback_lock); } static void lowcomms_write_space(struct sock *sk) { - struct connection *con = sock2con(sk); + struct connection *con; + read_lock_bh(&sk->sk_callback_lock); + con = sock2con(sk); if (!con) - return; + goto out; clear_bit(SOCK_NOSPACE, &con->sock->flags); @@ -427,16 +436,17 @@ static void lowcomms_write_space(struct sock *sk) clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags); } - if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) - queue_work(send_workqueue, &con->swork); + queue_work(send_workqueue, &con->swork); +out: + read_unlock_bh(&sk->sk_callback_lock); } static inline void lowcomms_connect_sock(struct connection *con) { if (test_bit(CF_CLOSE, &con->flags)) return; - if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) - queue_work(send_workqueue, &con->swork); + queue_work(send_workqueue, &con->swork); + cond_resched(); } static void lowcomms_state_change(struct sock *sk) @@ -480,7 +490,7 @@ static void lowcomms_error_report(struct sock *sk) if (con == NULL) goto out; - orig_report = con->orig_error_report; + orig_report = listen_sock.sk_error_report; if (con->sock == NULL || kernel_getpeername(con->sock, (struct sockaddr *)&saddr, &buflen)) { printk_ratelimited(KERN_ERR "dlm: node %d: socket error " @@ -517,27 +527,31 @@ out: } /* Note: sk_callback_lock must be locked before calling this function. */ -static void save_callbacks(struct connection *con, struct sock *sk) +static void save_listen_callbacks(struct socket *sock) { - con->orig_data_ready = sk->sk_data_ready; - con->orig_state_change = sk->sk_state_change; - con->orig_write_space = sk->sk_write_space; - con->orig_error_report = sk->sk_error_report; + struct sock *sk = sock->sk; + + listen_sock.sk_data_ready = sk->sk_data_ready; + listen_sock.sk_state_change = sk->sk_state_change; + listen_sock.sk_write_space = sk->sk_write_space; + listen_sock.sk_error_report = sk->sk_error_report; } -static void restore_callbacks(struct connection *con, struct sock *sk) +static void restore_callbacks(struct socket *sock) { + struct sock *sk = sock->sk; + write_lock_bh(&sk->sk_callback_lock); sk->sk_user_data = NULL; - sk->sk_data_ready = con->orig_data_ready; - sk->sk_state_change = con->orig_state_change; - sk->sk_write_space = con->orig_write_space; - sk->sk_error_report = con->orig_error_report; + sk->sk_data_ready = listen_sock.sk_data_ready; + sk->sk_state_change = listen_sock.sk_state_change; + sk->sk_write_space = listen_sock.sk_write_space; + sk->sk_error_report = listen_sock.sk_error_report; write_unlock_bh(&sk->sk_callback_lock); } /* Make a socket active */ -static void add_sock(struct socket *sock, struct connection *con, bool save_cb) +static void add_sock(struct socket *sock, struct connection *con) { struct sock *sk = sock->sk; @@ -545,8 +559,6 @@ static void add_sock(struct socket *sock, struct connection *con, bool save_cb) con->sock = sock; sk->sk_user_data = con; - if (save_cb) - save_callbacks(con, sk); /* Install a data_ready callback */ sk->sk_data_ready = lowcomms_data_ready; sk->sk_write_space = lowcomms_write_space; @@ -579,17 +591,20 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, static void close_connection(struct connection *con, bool and_other, bool tx, bool rx) { - clear_bit(CF_CONNECT_PENDING, &con->flags); - clear_bit(CF_WRITE_PENDING, &con->flags); - if (tx && cancel_work_sync(&con->swork)) + bool closing = test_and_set_bit(CF_CLOSING, &con->flags); + + if (tx && !closing && cancel_work_sync(&con->swork)) { log_print("canceled swork for node %d", con->nodeid); - if (rx && cancel_work_sync(&con->rwork)) + clear_bit(CF_WRITE_PENDING, &con->flags); + } + if (rx && !closing && cancel_work_sync(&con->rwork)) { log_print("canceled rwork for node %d", con->nodeid); + clear_bit(CF_READ_PENDING, &con->flags); + } mutex_lock(&con->sock_mutex); if (con->sock) { - if (!test_bit(CF_IS_OTHERCON, &con->flags)) - restore_callbacks(con, con->sock->sk); + restore_callbacks(con->sock); sock_release(con->sock); con->sock = NULL; } @@ -604,6 +619,7 @@ static void close_connection(struct connection *con, bool and_other, con->retries = 0; mutex_unlock(&con->sock_mutex); + clear_bit(CF_CLOSING, &con->flags); } /* Data received from remote end */ @@ -700,7 +716,7 @@ out_resched: out_close: mutex_unlock(&con->sock_mutex); if (ret != -EAGAIN) { - close_connection(con, false, true, false); + close_connection(con, true, true, false); /* Reconnect when there is something to send */ } /* Don't return success if we really got EOF */ @@ -728,22 +744,14 @@ static int tcp_accept_from_sock(struct connection *con) } mutex_unlock(&connections_lock); - memset(&peeraddr, 0, sizeof(peeraddr)); - result = sock_create_lite(dlm_local_addr[0]->ss_family, - SOCK_STREAM, IPPROTO_TCP, &newsock); - if (result < 0) - return -ENOMEM; - mutex_lock_nested(&con->sock_mutex, 0); - result = -ENOTCONN; - if (con->sock == NULL) - goto accept_err; - - newsock->type = con->sock->type; - newsock->ops = con->sock->ops; + if (!con->sock) { + mutex_unlock(&con->sock_mutex); + return -ENOTCONN; + } - result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK, true); + result = kernel_accept(con->sock, &newsock, O_NONBLOCK); if (result < 0) goto accept_err; @@ -794,31 +802,33 @@ static int tcp_accept_from_sock(struct connection *con) othercon->nodeid = nodeid; othercon->rx_action = receive_from_sock; mutex_init(&othercon->sock_mutex); + INIT_LIST_HEAD(&othercon->writequeue); + spin_lock_init(&othercon->writequeue_lock); INIT_WORK(&othercon->swork, process_send_sockets); INIT_WORK(&othercon->rwork, process_recv_sockets); set_bit(CF_IS_OTHERCON, &othercon->flags); } + mutex_lock_nested(&othercon->sock_mutex, 2); if (!othercon->sock) { newcon->othercon = othercon; - othercon->sock = newsock; - newsock->sk->sk_user_data = othercon; - add_sock(newsock, othercon, false); + add_sock(newsock, othercon); addcon = othercon; + mutex_unlock(&othercon->sock_mutex); } else { printk("Extra connection from node %d attempted\n", nodeid); result = -EAGAIN; + mutex_unlock(&othercon->sock_mutex); mutex_unlock(&newcon->sock_mutex); goto accept_err; } } else { - newsock->sk->sk_user_data = newcon; newcon->rx_action = receive_from_sock; /* accept copies the sk after we've saved the callbacks, so we don't want to save them a second time or comm errors will result in calling sk_error_report recursively. */ - add_sock(newsock, newcon, false); + add_sock(newsock, newcon); addcon = newcon; } @@ -837,7 +847,8 @@ static int tcp_accept_from_sock(struct connection *con) accept_err: mutex_unlock(&con->sock_mutex); - sock_release(newsock); + if (newsock) + sock_release(newsock); if (result != -EAGAIN) log_print("error accepting connection from node: %d", result); @@ -911,26 +922,28 @@ static int sctp_accept_from_sock(struct connection *con) othercon->nodeid = nodeid; othercon->rx_action = receive_from_sock; mutex_init(&othercon->sock_mutex); + INIT_LIST_HEAD(&othercon->writequeue); + spin_lock_init(&othercon->writequeue_lock); INIT_WORK(&othercon->swork, process_send_sockets); INIT_WORK(&othercon->rwork, process_recv_sockets); set_bit(CF_IS_OTHERCON, &othercon->flags); } + mutex_lock_nested(&othercon->sock_mutex, 2); if (!othercon->sock) { newcon->othercon = othercon; - othercon->sock = newsock; - newsock->sk->sk_user_data = othercon; - add_sock(newsock, othercon, false); + add_sock(newsock, othercon); addcon = othercon; + mutex_unlock(&othercon->sock_mutex); } else { printk("Extra connection from node %d attempted\n", nodeid); ret = -EAGAIN; + mutex_unlock(&othercon->sock_mutex); mutex_unlock(&newcon->sock_mutex); goto accept_err; } } else { - newsock->sk->sk_user_data = newcon; newcon->rx_action = receive_from_sock; - add_sock(newsock, newcon, false); + add_sock(newsock, newcon); addcon = newcon; } @@ -1055,10 +1068,9 @@ static void sctp_connect_to_sock(struct connection *con) if (result < 0) goto socket_err; - sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; con->connect_action = sctp_connect_to_sock; - add_sock(sock, con, true); + add_sock(sock, con); /* Bind to all addresses. */ if (sctp_bind_addrs(con, 0)) @@ -1079,7 +1091,6 @@ static void sctp_connect_to_sock(struct connection *con) if (result == 0) goto out; - bind_err: con->sock = NULL; sock_release(sock); @@ -1098,14 +1109,12 @@ socket_err: con->retries, result); mutex_unlock(&con->sock_mutex); msleep(1000); - clear_bit(CF_CONNECT_PENDING, &con->flags); lowcomms_connect_sock(con); return; } out: mutex_unlock(&con->sock_mutex); - set_bit(CF_WRITE_PENDING, &con->flags); } /* Connect a new socket to its peer */ @@ -1143,10 +1152,9 @@ static void tcp_connect_to_sock(struct connection *con) goto out_err; } - sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; con->connect_action = tcp_connect_to_sock; - add_sock(sock, con, true); + add_sock(sock, con); /* Bind to our cluster-known address connecting to avoid routing problems */ @@ -1194,13 +1202,11 @@ out_err: con->retries, result); mutex_unlock(&con->sock_mutex); msleep(1000); - clear_bit(CF_CONNECT_PENDING, &con->flags); lowcomms_connect_sock(con); return; } out: mutex_unlock(&con->sock_mutex); - set_bit(CF_WRITE_PENDING, &con->flags); return; } @@ -1235,10 +1241,12 @@ static struct socket *tcp_create_listen_sock(struct connection *con, if (result < 0) { log_print("Failed to set SO_REUSEADDR on socket: %d", result); } + write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = con; - + save_listen_callbacks(sock); con->rx_action = tcp_accept_from_sock; con->connect_action = tcp_connect_to_sock; + write_unlock_bh(&sock->sk->sk_callback_lock); /* Bind to our port */ make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len); @@ -1320,6 +1328,7 @@ static int sctp_listen_for_all(void) write_lock_bh(&sock->sk->sk_callback_lock); /* Init con struct */ sock->sk->sk_user_data = con; + save_listen_callbacks(sock); con->sock = sock; con->sock->sk->sk_data_ready = lowcomms_data_ready; con->rx_action = sctp_accept_from_sock; @@ -1366,7 +1375,7 @@ static int tcp_listen_for_all(void) sock = tcp_create_listen_sock(con, dlm_local_addr[0]); if (sock) { - add_sock(sock, con, true); + add_sock(sock, con); result = 0; } else { @@ -1456,9 +1465,7 @@ void dlm_lowcomms_commit_buffer(void *mh) e->len = e->end - e->offset; spin_unlock(&con->writequeue_lock); - if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) { - queue_work(send_workqueue, &con->swork); - } + queue_work(send_workqueue, &con->swork); return; out: @@ -1527,13 +1534,16 @@ out: send_error: mutex_unlock(&con->sock_mutex); - close_connection(con, false, false, true); - lowcomms_connect_sock(con); + close_connection(con, true, false, true); + /* Requeue the send work. When the work daemon runs again, it will try + a new connection, then call this function again. */ + queue_work(send_workqueue, &con->swork); return; out_connect: mutex_unlock(&con->sock_mutex); - lowcomms_connect_sock(con); + queue_work(send_workqueue, &con->swork); + cond_resched(); } static void clean_one_writequeue(struct connection *con) @@ -1593,9 +1603,10 @@ static void process_send_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, swork); - if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) + clear_bit(CF_WRITE_PENDING, &con->flags); + if (con->sock == NULL) /* not mutex protected so check it inside too */ con->connect_action(con); - if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags)) + if (!list_empty(&con->writequeue)) send_to_sock(con); } @@ -1632,11 +1643,25 @@ static int work_start(void) return 0; } -static void stop_conn(struct connection *con) +static void _stop_conn(struct connection *con, bool and_other) { - con->flags |= 0x0F; - if (con->sock && con->sock->sk) + mutex_lock(&con->sock_mutex); + set_bit(CF_CLOSE, &con->flags); + set_bit(CF_READ_PENDING, &con->flags); + set_bit(CF_WRITE_PENDING, &con->flags); + if (con->sock && con->sock->sk) { + write_lock_bh(&con->sock->sk->sk_callback_lock); con->sock->sk->sk_user_data = NULL; + write_unlock_bh(&con->sock->sk->sk_callback_lock); + } + if (con->othercon && and_other) + _stop_conn(con->othercon, false); + mutex_unlock(&con->sock_mutex); +} + +static void stop_conn(struct connection *con) +{ + _stop_conn(con, true); } static void free_conn(struct connection *con) @@ -1648,6 +1673,36 @@ static void free_conn(struct connection *con) kmem_cache_free(con_cache, con); } +static void work_flush(void) +{ + int ok; + int i; + struct hlist_node *n; + struct connection *con; + + flush_workqueue(recv_workqueue); + flush_workqueue(send_workqueue); + do { + ok = 1; + foreach_conn(stop_conn); + flush_workqueue(recv_workqueue); + flush_workqueue(send_workqueue); + for (i = 0; i < CONN_HASH_SIZE && ok; i++) { + hlist_for_each_entry_safe(con, n, + &connection_hash[i], list) { + ok &= test_bit(CF_READ_PENDING, &con->flags); + ok &= test_bit(CF_WRITE_PENDING, &con->flags); + if (con->othercon) { + ok &= test_bit(CF_READ_PENDING, + &con->othercon->flags); + ok &= test_bit(CF_WRITE_PENDING, + &con->othercon->flags); + } + } + } + } while (!ok); +} + void dlm_lowcomms_stop(void) { /* Set all the flags to prevent any @@ -1655,11 +1710,10 @@ void dlm_lowcomms_stop(void) */ mutex_lock(&connections_lock); dlm_allow_conn = 0; - foreach_conn(stop_conn); + mutex_unlock(&connections_lock); + work_flush(); clean_writequeues(); foreach_conn(free_conn); - mutex_unlock(&connections_lock); - work_stop(); kmem_cache_destroy(con_cache); diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index f3f5e72a29ba..70c625999d36 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -155,6 +155,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) goto out; } +retry: error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, sizeof(struct rcom_status), &rc, &mh); if (error) @@ -169,6 +170,8 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); + if (error == -ETIMEDOUT) + goto retry; if (error) goto out; @@ -276,6 +279,7 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) ls->ls_recover_nodeid = nodeid; +retry: error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh); if (error) goto out; @@ -288,6 +292,8 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); + if (error == -ETIMEDOUT) + goto retry; out: return error; } @@ -332,25 +338,6 @@ int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) return error; } -int dlm_send_rcom_lookup_dump(struct dlm_rsb *r, int to_nodeid) -{ - struct dlm_rcom *rc; - struct dlm_mhandle *mh; - struct dlm_ls *ls = r->res_ls; - int error; - - error = create_rcom(ls, to_nodeid, DLM_RCOM_LOOKUP, r->res_length, - &rc, &mh); - if (error) - goto out; - memcpy(rc->rc_buf, r->res_name, r->res_length); - rc->rc_id = 0xFFFFFFFF; - - send_rcom(ls, mh, rc); - out: - return error; -} - static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; @@ -362,6 +349,7 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) if (error) return; + /* Old code would send this special id to trigger a debug dump. */ if (rc_in->rc_id == 0xFFFFFFFF) { log_error(ls, "receive_rcom_lookup dump from %d", nodeid); dlm_dump_rsb_name(ls, rc_in->rc_buf, len); diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h index f8e243463c15..206723ab744d 100644 --- a/fs/dlm/rcom.h +++ b/fs/dlm/rcom.h @@ -17,7 +17,6 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags); int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len); int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid); -int dlm_send_rcom_lookup_dump(struct dlm_rsb *r, int to_nodeid); int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid); int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in); diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index eaea789bf97d..ce2aa54ca2e2 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -52,6 +52,10 @@ int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)) dlm_config.ci_recover_timer * HZ); if (rv) break; + if (test_bit(LSFL_RCOM_WAIT, &ls->ls_flags)) { + log_debug(ls, "dlm_wait_function timed out"); + return -ETIMEDOUT; + } } if (dlm_recovery_stopped(ls)) { diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 6859b4bf971e..6f4e1d42d733 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -287,11 +287,23 @@ static int dlm_recoverd(void *arg) set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags); wake_up(&ls->ls_recover_lock_wait); - while (!kthread_should_stop()) { + while (1) { + /* + * We call kthread_should_stop() after set_current_state(). + * This is because it works correctly if kthread_stop() is + * called just before set_current_state(). + */ set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); + break; + } if (!test_bit(LSFL_RECOVER_WORK, &ls->ls_flags) && - !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) + !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) { + if (kthread_should_stop()) + break; schedule(); + } set_current_state(TASK_RUNNING); if (test_and_clear_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) { diff --git a/fs/drop_caches.c b/fs/drop_caches.c index d72d52b90433..82377017130f 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Implement the manual drop-all-pagecache function */ diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 9c351bf757b2..3fbc0ff79699 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -84,11 +84,16 @@ struct ecryptfs_page_crypt_context { static inline struct ecryptfs_auth_tok * ecryptfs_get_encrypted_key_payload_data(struct key *key) { - if (key->type == &key_type_encrypted) - return (struct ecryptfs_auth_tok *) - (&((struct encrypted_key_payload *)key->payload.data[0])->payload_data); - else + struct encrypted_key_payload *payload; + + if (key->type != &key_type_encrypted) return NULL; + + payload = key->payload.data[0]; + if (!payload) + return ERR_PTR(-EKEYREVOKED); + + return (struct ecryptfs_auth_tok *)payload->payload_data; } static inline struct key *ecryptfs_get_encrypted_key(char *sig) @@ -114,12 +119,17 @@ static inline struct ecryptfs_auth_tok * ecryptfs_get_key_payload_data(struct key *key) { struct ecryptfs_auth_tok *auth_tok; + struct user_key_payload *ukp; auth_tok = ecryptfs_get_encrypted_key_payload_data(key); - if (!auth_tok) - return (struct ecryptfs_auth_tok *)user_key_payload_locked(key)->data; - else + if (auth_tok) return auth_tok; + + ukp = user_key_payload_locked(key); + if (!ukp) + return ERR_PTR(-EKEYREVOKED); + + return (struct ecryptfs_auth_tok *)ukp->data; } #define ECRYPTFS_MAX_KEYSET_SIZE 1024 diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 3cf1546dca82..fa218cd64f74 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -459,7 +459,8 @@ out: * @auth_tok_key: key containing the authentication token * @auth_tok: authentication token * - * Returns zero on valid auth tok; -EINVAL otherwise + * Returns zero on valid auth tok; -EINVAL if the payload is invalid; or + * -EKEYREVOKED if the key was revoked before we acquired its semaphore. */ static int ecryptfs_verify_auth_tok_from_key(struct key *auth_tok_key, @@ -468,6 +469,12 @@ ecryptfs_verify_auth_tok_from_key(struct key *auth_tok_key, int rc = 0; (*auth_tok) = ecryptfs_get_key_payload_data(auth_tok_key); + if (IS_ERR(*auth_tok)) { + rc = PTR_ERR(*auth_tok); + *auth_tok = NULL; + goto out; + } + if (ecryptfs_verify_version((*auth_tok)->version)) { printk(KERN_ERR "Data structure version mismatch. Userspace " "tools must match eCryptfs kernel module with major " diff --git a/fs/efs/dir.c b/fs/efs/dir.c index a7be96e5f1cb..f892ac7c2a35 100644 --- a/fs/efs/dir.c +++ b/fs/efs/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * dir.c * diff --git a/fs/efs/efs.h b/fs/efs/efs.h index 70f5d4f9a945..13a4d9622633 100644 --- a/fs/efs/efs.h +++ b/fs/efs/efs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 1999 Al Smith * diff --git a/fs/efs/file.c b/fs/efs/file.c index a37dcee46866..9e641da6fab2 100644 --- a/fs/efs/file.c +++ b/fs/efs/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * file.c * diff --git a/fs/efs/namei.c b/fs/efs/namei.c index d34a40edcdb2..38961ee1d1af 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * namei.c * diff --git a/fs/efs/super.c b/fs/efs/super.c index 5c42f1e34a2f..65b59009555b 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * super.c * diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c index 4870cc82deb0..923eb91654d5 100644 --- a/fs/efs/symlink.c +++ b/fs/efs/symlink.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * symlink.c * diff --git a/fs/exec.c b/fs/exec.c index 5470d3c1892a..1d6243d9f2b6 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1802,6 +1802,7 @@ static int do_execveat_common(int fd, struct filename *filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; + membarrier_execve(current); acct_update_integrals(current); task_numa_free(current); free_bprm(bprm); @@ -1910,7 +1911,7 @@ void set_dumpable(struct mm_struct *mm, int value) return; do { - old = ACCESS_ONCE(mm->flags); + old = READ_ONCE(mm->flags); new = (old & ~MMF_DUMPABLE_MASK) | value; } while (cmpxchg(&mm->flags, old, new) != old); } diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile index 445b0e996a12..311479d864a7 100644 --- a/fs/ext2/Makefile +++ b/fs/ext2/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the linux ext2-filesystem routines. # diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 51f0aea70cb4..224c04abb2e5 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext2/acl.c * diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 44937f9fcf32..0f01c759daac 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* File: fs/ext2/acl.h diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index d0bdb74f0e15..e1b3724bebf2 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext2/balloc.c * diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index e2709695b177..987647986f47 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext2/dir.c * diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 28de3edd4f4d..032295e1d386 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) diff --git a/fs/ext2/file.c b/fs/ext2/file.c index ff3a3636a5ca..c67b486488fd 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext2/file.c * diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 395fc074c0db..a1fc3dabca41 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext2/ialloc.c * diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 4dca6f348714..9b2ac55ac34f 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext2/inode.c * @@ -820,11 +821,11 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (ret == 0) { iomap->type = IOMAP_HOLE; - iomap->blkno = IOMAP_NULL_BLOCK; + iomap->addr = IOMAP_NULL_ADDR; iomap->length = 1 << blkbits; } else { iomap->type = IOMAP_MAPPED; - iomap->blkno = (sector_t)bno << (blkbits - 9); + iomap->addr = (u64)bno << blkbits; iomap->length = (u64)ret << blkbits; iomap->flags |= IOMAP_F_MERGED; } diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 087f122cca42..0367c0039e68 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext2/ioctl.c * diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 814e405a2da6..e078075dc66f 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext2/namei.c * diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c index eeffb0138a17..d5589ddcc281 100644 --- a/fs/ext2/symlink.c +++ b/fs/ext2/symlink.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext2/symlink.c * diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 1b9b1268d418..62d9a659a8ff 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext2/xattr.c * diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index 6f82ab1b00ca..cee888cdc235 100644 --- a/fs/ext2/xattr.h +++ b/fs/ext2/xattr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* File: linux/ext2_xattr.h diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 7b9e9c1842d5..9a682e440acb 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext2/xattr_security.c * Handler for storing security labels as extended attributes. diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 65049b71af13..49add1107850 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext2/xattr_trusted.c * Handler for trusted extended attributes. diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index fb2f992ae763..c243a3b4d69d 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext2/xattr_user.c * Handler for extended user attributes. diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index e38039fd96ff..73b850f5659c 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -37,6 +37,7 @@ config EXT4_FS select CRC16 select CRYPTO select CRYPTO_CRC32C + select FS_IOMAP help This is the next generation of the ext3 filesystem. diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index d9beca1653c5..8fdfcd3c3e04 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the linux ext4-filesystem routines. # diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 46ff2229ff5e..fb50f9aa6ead 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/acl.c * diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index da2c79577d72..a48fc5ae2701 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* File: fs/ext4/acl.h diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index e04ec868e37e..a943e568292e 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/balloc.c * @@ -600,22 +601,21 @@ int ext4_claim_free_clusters(struct ext4_sb_info *sbi, * ext4_should_retry_alloc() is called when ENOSPC is returned, and if * it is profitable to retry the operation, this function will wait * for the current or committing transaction to complete, and then - * return TRUE. - * - * if the total number of retries exceed three times, return FALSE. + * return TRUE. We will only retry once. */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) || - (*retries)++ > 3 || + (*retries)++ > 1 || !EXT4_SB(sb)->s_journal) return 0; - jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); - smp_mb(); - if (EXT4_SB(sb)->s_mb_free_pending) - jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); + if (EXT4_SB(sb)->s_mb_free_pending == 0) + return 0; + + jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); + jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); return 1; } diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 4a606afb171f..f63e028c638c 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/bitmap.c * diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index fdb19543af1e..bee888e0e2db 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/block_validity.c * diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index b04e882179c6..d5babc9f222b 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/dir.c * diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e2abe01c8c6b..4e091eae38b1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * ext4.h * @@ -33,17 +34,15 @@ #include <linux/percpu_counter.h> #include <linux/ratelimit.h> #include <crypto/hash.h> -#ifdef CONFIG_EXT4_FS_ENCRYPTION -#include <linux/fscrypt_supp.h> -#else -#include <linux/fscrypt_notsupp.h> -#endif #include <linux/falloc.h> #include <linux/percpu-rwsem.h> #ifdef __KERNEL__ #include <linux/compat.h> #endif +#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION) +#include <linux/fscrypt.h> + /* * The fourth extended filesystem constants/structures */ @@ -545,8 +544,8 @@ struct ext4_new_group_data { __u64 inode_table; __u32 blocks_count; __u16 reserved_blocks; - __u16 unused; - __u32 free_blocks_count; + __u16 mdata_blocks; + __u32 free_clusters_count; }; /* Indexes used to index group tables in ext4_new_group_data */ @@ -644,43 +643,6 @@ enum { #define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT #define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY -#ifndef FS_IOC_FSGETXATTR -/* Until the uapi changes get merged for project quota... */ - -#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) -#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) - -/* - * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. - */ -struct fsxattr { - __u32 fsx_xflags; /* xflags field value (get/set) */ - __u32 fsx_extsize; /* extsize field value (get/set)*/ - __u32 fsx_nextents; /* nextents field value (get) */ - __u32 fsx_projid; /* project identifier (get/set) */ - unsigned char fsx_pad[12]; -}; - -/* - * Flags for the fsx_xflags field - */ -#define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ -#define FS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ -#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ -#define FS_XFLAG_APPEND 0x00000010 /* all writes append */ -#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ -#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */ -#define FS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ -#define FS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ -#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ -#define FS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ -#define FS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ -#define FS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ -#define FS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ -#define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ -#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ -#endif /* !defined(FS_IOC_FSGETXATTR) */ - #define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR @@ -1392,8 +1354,6 @@ struct ext4_sb_info { int s_first_ino; unsigned int s_inode_readahead_blks; unsigned int s_inode_goal; - spinlock_t s_next_gen_lock; - u32 s_next_generation; u32 s_hash_seed[4]; int s_def_hash_version; int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ @@ -2515,9 +2475,6 @@ extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); -extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, - unsigned int map_len, - struct extent_status *result); /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, @@ -3048,6 +3005,10 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, extern int ext4_inline_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int *has_inline, __u64 start, __u64 len); + +struct iomap; +extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap); + extern int ext4_try_to_evict_inline_data(handle_t *handle, struct inode *inode, int needed); diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 5b342ac67d2e..2d593201cf7a 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Interface between ext4 and JBD */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 97f0fd06728d..07bca11749d4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4794,7 +4794,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, } if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { + (offset + len > i_size_read(inode) || + offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); if (ret) @@ -4965,7 +4966,8 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) } if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { + (offset + len > i_size_read(inode) || + offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); if (ret) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index e7f12a204cbc..763ef185dd17 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/extents_status.c * diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index f7aa24f4642d..ca90fc96f47e 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/ext4/extents_status.h * diff --git a/fs/ext4/file.c b/fs/ext4/file.c index b1da660ac3bc..ad204d2724ac 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/file.c * @@ -20,6 +21,7 @@ #include <linux/time.h> #include <linux/fs.h> +#include <linux/iomap.h> #include <linux/mount.h> #include <linux/path.h> #include <linux/dax.h> @@ -364,7 +366,6 @@ static int ext4_file_open(struct inode * inode, struct file * filp) struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct vfsmount *mnt = filp->f_path.mnt; - struct dentry *dir; struct path path; char buf[64], *cp; int ret; @@ -404,25 +405,11 @@ static int ext4_file_open(struct inode * inode, struct file * filp) ext4_journal_stop(handle); } } - if (ext4_encrypted_inode(inode)) { - ret = fscrypt_get_encryption_info(inode); - if (ret) - return -EACCES; - if (!fscrypt_has_encryption_key(inode)) - return -ENOKEY; - } - dir = dget_parent(file_dentry(filp)); - if (ext4_encrypted_inode(d_inode(dir)) && - !fscrypt_has_permitted_context(d_inode(dir), inode)) { - ext4_warning(inode->i_sb, - "Inconsistent encryption contexts: %lu/%lu", - (unsigned long) d_inode(dir)->i_ino, - (unsigned long) inode->i_ino); - dput(dir); - return -EPERM; - } - dput(dir); + ret = fscrypt_file_open(inode, filp); + if (ret) + return ret; + /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present @@ -438,248 +425,6 @@ static int ext4_file_open(struct inode * inode, struct file * filp) } /* - * Here we use ext4_map_blocks() to get a block mapping for a extent-based - * file rather than ext4_ext_walk_space() because we can introduce - * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same - * function. When extent status tree has been fully implemented, it will - * track all extent status for a file and we can directly use it to - * retrieve the offset for SEEK_DATA/SEEK_HOLE. - */ - -/* - * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to - * lookup page cache to check whether or not there has some data between - * [startoff, endoff] because, if this range contains an unwritten extent, - * we determine this extent as a data or a hole according to whether the - * page cache has data or not. - */ -static int ext4_find_unwritten_pgoff(struct inode *inode, - int whence, - ext4_lblk_t end_blk, - loff_t *offset) -{ - struct pagevec pvec; - unsigned int blkbits; - pgoff_t index; - pgoff_t end; - loff_t endoff; - loff_t startoff; - loff_t lastoff; - int found = 0; - - blkbits = inode->i_sb->s_blocksize_bits; - startoff = *offset; - lastoff = startoff; - endoff = (loff_t)end_blk << blkbits; - - index = startoff >> PAGE_SHIFT; - end = (endoff - 1) >> PAGE_SHIFT; - - pagevec_init(&pvec, 0); - do { - int i; - unsigned long nr_pages; - - nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, - &index, end); - if (nr_pages == 0) - break; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct buffer_head *bh, *head; - - /* - * If current offset is smaller than the page offset, - * there is a hole at this offset. - */ - if (whence == SEEK_HOLE && lastoff < endoff && - lastoff < page_offset(pvec.pages[i])) { - found = 1; - *offset = lastoff; - goto out; - } - - lock_page(page); - - if (unlikely(page->mapping != inode->i_mapping)) { - unlock_page(page); - continue; - } - - if (!page_has_buffers(page)) { - unlock_page(page); - continue; - } - - if (page_has_buffers(page)) { - lastoff = page_offset(page); - bh = head = page_buffers(page); - do { - if (lastoff + bh->b_size <= startoff) - goto next; - if (buffer_uptodate(bh) || - buffer_unwritten(bh)) { - if (whence == SEEK_DATA) - found = 1; - } else { - if (whence == SEEK_HOLE) - found = 1; - } - if (found) { - *offset = max_t(loff_t, - startoff, lastoff); - unlock_page(page); - goto out; - } -next: - lastoff += bh->b_size; - bh = bh->b_this_page; - } while (bh != head); - } - - lastoff = page_offset(page) + PAGE_SIZE; - unlock_page(page); - } - - pagevec_release(&pvec); - } while (index <= end); - - /* There are no pages upto endoff - that would be a hole in there. */ - if (whence == SEEK_HOLE && lastoff < endoff) { - found = 1; - *offset = lastoff; - } -out: - pagevec_release(&pvec); - return found; -} - -/* - * ext4_seek_data() retrieves the offset for SEEK_DATA. - */ -static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) -{ - struct inode *inode = file->f_mapping->host; - struct extent_status es; - ext4_lblk_t start, last, end; - loff_t dataoff, isize; - int blkbits; - int ret; - - inode_lock(inode); - - isize = i_size_read(inode); - if (offset < 0 || offset >= isize) { - inode_unlock(inode); - return -ENXIO; - } - - blkbits = inode->i_sb->s_blocksize_bits; - start = offset >> blkbits; - last = start; - end = isize >> blkbits; - dataoff = offset; - - do { - ret = ext4_get_next_extent(inode, last, end - last + 1, &es); - if (ret <= 0) { - /* No extent found -> no data */ - if (ret == 0) - ret = -ENXIO; - inode_unlock(inode); - return ret; - } - - last = es.es_lblk; - if (last != start) - dataoff = (loff_t)last << blkbits; - if (!ext4_es_is_unwritten(&es)) - break; - - /* - * If there is a unwritten extent at this offset, - * it will be as a data or a hole according to page - * cache that has data or not. - */ - if (ext4_find_unwritten_pgoff(inode, SEEK_DATA, - es.es_lblk + es.es_len, &dataoff)) - break; - last += es.es_len; - dataoff = (loff_t)last << blkbits; - cond_resched(); - } while (last <= end); - - inode_unlock(inode); - - if (dataoff > isize) - return -ENXIO; - - return vfs_setpos(file, dataoff, maxsize); -} - -/* - * ext4_seek_hole() retrieves the offset for SEEK_HOLE. - */ -static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) -{ - struct inode *inode = file->f_mapping->host; - struct extent_status es; - ext4_lblk_t start, last, end; - loff_t holeoff, isize; - int blkbits; - int ret; - - inode_lock(inode); - - isize = i_size_read(inode); - if (offset < 0 || offset >= isize) { - inode_unlock(inode); - return -ENXIO; - } - - blkbits = inode->i_sb->s_blocksize_bits; - start = offset >> blkbits; - last = start; - end = isize >> blkbits; - holeoff = offset; - - do { - ret = ext4_get_next_extent(inode, last, end - last + 1, &es); - if (ret < 0) { - inode_unlock(inode); - return ret; - } - /* Found a hole? */ - if (ret == 0 || es.es_lblk > last) { - if (last != start) - holeoff = (loff_t)last << blkbits; - break; - } - /* - * If there is a unwritten extent at this offset, - * it will be as a data or a hole according to page - * cache that has data or not. - */ - if (ext4_es_is_unwritten(&es) && - ext4_find_unwritten_pgoff(inode, SEEK_HOLE, - last + es.es_len, &holeoff)) - break; - - last += es.es_len; - holeoff = (loff_t)last << blkbits; - cond_resched(); - } while (last <= end); - - inode_unlock(inode); - - if (holeoff > isize) - holeoff = isize; - - return vfs_setpos(file, holeoff, maxsize); -} - -/* * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values * by calling generic_file_llseek_size() with the appropriate maxbytes * value for each. @@ -695,18 +440,24 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) maxbytes = inode->i_sb->s_maxbytes; switch (whence) { - case SEEK_SET: - case SEEK_CUR: - case SEEK_END: + default: return generic_file_llseek_size(file, offset, whence, maxbytes, i_size_read(inode)); - case SEEK_DATA: - return ext4_seek_data(file, offset, maxbytes); case SEEK_HOLE: - return ext4_seek_hole(file, offset, maxbytes); + inode_lock_shared(inode); + offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops); + inode_unlock_shared(inode); + break; + case SEEK_DATA: + inode_lock_shared(inode); + offset = iomap_seek_data(inode, offset, &ext4_iomap_ops); + inode_unlock_shared(inode); + break; } - return -EINVAL; + if (offset < 0) + return offset; + return vfs_setpos(file, offset, maxbytes); } const struct file_operations ext4_file_operations = { diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index f9230580a84b..26a7fe5c4fd3 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/fsync.c * diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index ee823022aa34..b4267d72f249 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/ialloc.c * @@ -1138,9 +1139,7 @@ got: inode->i_ino); goto out; } - spin_lock(&sbi->s_next_gen_lock); - inode->i_generation = sbi->s_next_generation++; - spin_unlock(&sbi->s_next_gen_lock); + inode->i_generation = prandom_u32(); /* Precompute checksum seed for inode metadata */ if (ext4_has_metadata_csum(sb)) { diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 7ffa290cbb8e..c32802c956d5 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/indirect.c * diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 28c5c3abddb3..1367553c43bb 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -12,6 +12,7 @@ * GNU General Public License for more details. */ +#include <linux/iomap.h> #include <linux/fiemap.h> #include "ext4_jbd2.h" @@ -302,11 +303,6 @@ static int ext4_create_inline_data(handle_t *handle, EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); - /* - * Propagate changes to inode->i_flags as well - e.g. S_DAX may - * get cleared - */ - ext4_set_inode_flags(inode); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); @@ -451,11 +447,6 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, } } ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); - /* - * Propagate changes to inode->i_flags as well - e.g. S_DAX may - * get set. - */ - ext4_set_inode_flags(inode); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); @@ -1827,6 +1818,38 @@ int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) return ret; } +int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap) +{ + __u64 addr; + int error = -EAGAIN; + struct ext4_iloc iloc; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) + goto out; + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + goto out; + + addr = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; + addr += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; + addr += offsetof(struct ext4_inode, i_block); + + brelse(iloc.bh); + + iomap->addr = addr; + iomap->offset = 0; + iomap->length = min_t(loff_t, ext4_get_inline_size(inode), + i_size_read(inode)); + iomap->type = 0; + iomap->flags = IOMAP_F_DATA_INLINE; + +out: + up_read(&EXT4_I(inode)->xattr_sem); + return error; +} + int ext4_inline_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int *has_inline, __u64 start, __u64 len) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 31db875bc7a1..2633150e41b9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/inode.c * @@ -3393,7 +3394,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } -#ifdef CONFIG_FS_DAX static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { @@ -3402,17 +3402,54 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned long first_block = offset >> blkbits; unsigned long last_block = (offset + length - 1) >> blkbits; struct ext4_map_blocks map; + bool delalloc = false; int ret; - if (WARN_ON_ONCE(ext4_has_inline_data(inode))) - return -ERANGE; + + if (flags & IOMAP_REPORT) { + if (ext4_has_inline_data(inode)) { + ret = ext4_inline_data_iomap(inode, iomap); + if (ret != -EAGAIN) { + if (ret == 0 && offset >= iomap->length) + ret = -ENOENT; + return ret; + } + } + } else { + if (WARN_ON_ONCE(ext4_has_inline_data(inode))) + return -ERANGE; + } map.m_lblk = first_block; map.m_len = last_block - first_block + 1; - if (!(flags & IOMAP_WRITE)) { + if (flags & IOMAP_REPORT) { ret = ext4_map_blocks(NULL, inode, &map, 0); - } else { + if (ret < 0) + return ret; + + if (ret == 0) { + ext4_lblk_t end = map.m_lblk + map.m_len - 1; + struct extent_status es; + + ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es); + + if (!es.es_len || es.es_lblk > end) { + /* entire range is a hole */ + } else if (es.es_lblk > map.m_lblk) { + /* range starts with a hole */ + map.m_len = es.es_lblk - map.m_lblk; + } else { + ext4_lblk_t offs = 0; + + if (es.es_lblk < map.m_lblk) + offs = map.m_lblk - es.es_lblk; + map.m_lblk = es.es_lblk + offs; + map.m_len = es.es_len - offs; + delalloc = true; + } + } + } else if (flags & IOMAP_WRITE) { int dio_credits; handle_t *handle; int retries = 0; @@ -3463,17 +3500,21 @@ retry: } } ext4_journal_stop(handle); + } else { + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; } iomap->flags = 0; iomap->bdev = inode->i_sb->s_bdev; iomap->dax_dev = sbi->s_daxdev; iomap->offset = first_block << blkbits; + iomap->length = (u64)map.m_len << blkbits; if (ret == 0) { - iomap->type = IOMAP_HOLE; - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->length = (u64)map.m_len << blkbits; + iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; } else { if (map.m_flags & EXT4_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; @@ -3483,12 +3524,12 @@ retry: WARN_ON_ONCE(1); return -EIO; } - iomap->blkno = (sector_t)map.m_pblk << (blkbits - 9); - iomap->length = (u64)map.m_len << blkbits; + iomap->addr = (u64)map.m_pblk << blkbits; } if (map.m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; + return 0; } @@ -3549,8 +3590,6 @@ const struct iomap_ops ext4_iomap_ops = { .iomap_end = ext4_iomap_end, }; -#endif - static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private) { @@ -4572,6 +4611,21 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); } +static bool ext4_should_use_dax(struct inode *inode) +{ + if (!test_opt(inode->i_sb, DAX)) + return false; + if (!S_ISREG(inode->i_mode)) + return false; + if (ext4_should_journal_data(inode)) + return false; + if (ext4_has_inline_data(inode)) + return false; + if (ext4_encrypted_inode(inode)) + return false; + return true; +} + void ext4_set_inode_flags(struct inode *inode) { unsigned int flags = EXT4_I(inode)->i_flags; @@ -4587,12 +4641,13 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode) && - !ext4_should_journal_data(inode) && !ext4_has_inline_data(inode) && - !ext4_encrypted_inode(inode)) + if (ext4_should_use_dax(inode)) new_fl |= S_DAX; + if (flags & EXT4_ENCRYPT_FL) + new_fl |= S_ENCRYPTED; inode_set_flags(inode, new_fl, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| + S_ENCRYPTED); } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, @@ -5308,6 +5363,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + error = fscrypt_prepare_setattr(dentry, attr); + if (error) + return error; + if (is_quota_modification(inode, attr)) { error = dquot_initialize(inode); if (error) @@ -5353,14 +5412,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) loff_t oldsize = inode->i_size; int shrink = (attr->ia_size <= inode->i_size); - if (ext4_encrypted_inode(inode)) { - error = fscrypt_get_encryption_info(inode); - if (error) - return error; - if (!fscrypt_has_encryption_key(inode)) - return -ENOKEY; - } - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -5966,11 +6017,6 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); } ext4_set_aops(inode); - /* - * Update inode->i_flags after EXT4_INODE_JOURNAL_DATA was updated. - * E.g. S_DAX may get cleared / set. - */ - ext4_set_inode_flags(inode); jbd2_journal_unlock_updates(journal); percpu_up_write(&sbi->s_journal_flag_rwsem); @@ -6106,70 +6152,3 @@ int ext4_filemap_fault(struct vm_fault *vmf) return err; } - -/* - * Find the first extent at or after @lblk in an inode that is not a hole. - * Search for @map_len blocks at most. The extent is returned in @result. - * - * The function returns 1 if we found an extent. The function returns 0 in - * case there is no extent at or after @lblk and in that case also sets - * @result->es_len to 0. In case of error, the error code is returned. - */ -int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, - unsigned int map_len, struct extent_status *result) -{ - struct ext4_map_blocks map; - struct extent_status es = {}; - int ret; - - map.m_lblk = lblk; - map.m_len = map_len; - - /* - * For non-extent based files this loop may iterate several times since - * we do not determine full hole size. - */ - while (map.m_len > 0) { - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) - return ret; - /* There's extent covering m_lblk? Just return it. */ - if (ret > 0) { - int status; - - ext4_es_store_pblock(result, map.m_pblk); - result->es_lblk = map.m_lblk; - result->es_len = map.m_len; - if (map.m_flags & EXT4_MAP_UNWRITTEN) - status = EXTENT_STATUS_UNWRITTEN; - else - status = EXTENT_STATUS_WRITTEN; - ext4_es_store_status(result, status); - return 1; - } - ext4_es_find_delayed_extent_range(inode, map.m_lblk, - map.m_lblk + map.m_len - 1, - &es); - /* Is delalloc data before next block in extent tree? */ - if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) { - ext4_lblk_t offset = 0; - - if (es.es_lblk < lblk) - offset = lblk - es.es_lblk; - result->es_lblk = es.es_lblk + offset; - ext4_es_store_pblock(result, - ext4_es_pblock(&es) + offset); - result->es_len = es.es_len - offset; - ext4_es_store_status(result, ext4_es_status(&es)); - - return 1; - } - /* There's a hole at m_lblk, advance us after it */ - map.m_lblk += map.m_len; - map_len -= map.m_len; - map.m_len = map_len; - cond_resched(); - } - result->es_len = 0; - return 0; -} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index afb66d4ab5cf..b7558f292420 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/ioctl.c * @@ -14,6 +15,7 @@ #include <linux/mount.h> #include <linux/file.h> #include <linux/quotaops.h> +#include <linux/random.h> #include <linux/uuid.h> #include <linux/uaccess.h> #include <linux/delay.h> @@ -98,7 +100,6 @@ static long swap_inode_boot_loader(struct super_block *sb, int err; struct inode *inode_bl; struct ext4_inode_info *ei_bl; - struct ext4_sb_info *sbi = EXT4_SB(sb); if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) return -EINVAL; @@ -157,10 +158,8 @@ static long swap_inode_boot_loader(struct super_block *sb, inode->i_ctime = inode_bl->i_ctime = current_time(inode); - spin_lock(&sbi->s_next_gen_lock); - inode->i_generation = sbi->s_next_generation++; - inode_bl->i_generation = sbi->s_next_generation++; - spin_unlock(&sbi->s_next_gen_lock); + inode->i_generation = prandom_u32(); + inode_bl->i_generation = prandom_u32(); ext4_discard_preallocations(inode); @@ -290,10 +289,20 @@ flags_err: if (err) goto flags_out; - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + /* + * Changes to the journaling mode can cause unsafe changes to + * S_DAX if we are using the DAX mount option. + */ + if (test_opt(inode->i_sb, DAX)) { + err = -EBUSY; + goto flags_out; + } + err = ext4_change_inode_journal_flag(inode, jflag); - if (err) - goto flags_out; + if (err) + goto flags_out; + } if (migrate) { if (flags & EXT4_EXTENTS_FL) err = ext4_ext_migrate(inode); @@ -861,12 +870,6 @@ group_add_out: int err = 0, err2 = 0; ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; - if (ext4_has_feature_bigalloc(sb)) { - ext4_msg(sb, KERN_ERR, - "Online resizing not (yet) supported with bigalloc"); - return -EOPNOTSUPP; - } - if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, sizeof(__u64))) { return -EFAULT; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 701085620cd8..d9f8b90a93ed 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4994,8 +4994,11 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; - int err = 0, ret, blk_free_count; - ext4_grpblk_t blocks_freed; + int err = 0, ret, free_clusters_count; + ext4_grpblk_t clusters_freed; + ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block); + ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1); + unsigned long cluster_count = last_cluster - first_cluster + 1; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); @@ -5007,8 +5010,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, * Check to see if we are freeing blocks across a group * boundary. */ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - ext4_warning(sb, "too much blocks added to group %u", + if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) { + ext4_warning(sb, "too many blocks added to group %u", block_group); err = -EINVAL; goto error_return; @@ -5054,14 +5057,14 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, if (err) goto error_return; - for (i = 0, blocks_freed = 0; i < count; i++) { + for (i = 0, clusters_freed = 0; i < cluster_count; i++) { BUFFER_TRACE(bitmap_bh, "clear bit"); if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { ext4_error(sb, "bit already cleared for block %llu", (ext4_fsblk_t)(block + i)); BUFFER_TRACE(bitmap_bh, "bit already cleared"); } else { - blocks_freed++; + clusters_freed++; } } @@ -5075,19 +5078,20 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, * them with group lock_held */ ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count); - mb_free_blocks(NULL, &e4b, bit, count); - blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc); - ext4_free_group_clusters_set(sb, desc, blk_free_count); + mb_clear_bits(bitmap_bh->b_data, bit, cluster_count); + mb_free_blocks(NULL, &e4b, bit, cluster_count); + free_clusters_count = clusters_freed + + ext4_free_group_clusters(sb, desc); + ext4_free_group_clusters_set(sb, desc, free_clusters_count); ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh); ext4_group_desc_csum_set(sb, block_group, desc); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, - EXT4_NUM_B2C(sbi, blocks_freed)); + clusters_freed); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed), + atomic64_add(clusters_freed, &sbi->s_flex_groups[flex_group].free_clusters); } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 009300ee1561..dcf52540f379 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/ext4/mballoc.h * diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 84c54f15f1dd..27b9a76a0dfa 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/random.h> #include <linux/buffer_head.h> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index c1cf020d1889..798b3ac680db 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/namei.c * @@ -1538,24 +1539,14 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi struct inode *inode; struct ext4_dir_entry_2 *de; struct buffer_head *bh; + int err; - if (ext4_encrypted_inode(dir)) { - int res = fscrypt_get_encryption_info(dir); - - /* - * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is - * created while the directory was encrypted and we - * have access to the key. - */ - if (fscrypt_has_encryption_key(dir)) - fscrypt_set_encrypted_dentry(dentry); - fscrypt_set_d_op(dentry); - if (res && res != -ENOKEY) - return ERR_PTR(res); - } + err = fscrypt_prepare_lookup(dir, dentry, flags); + if (err) + return ERR_PTR(err); - if (dentry->d_name.len > EXT4_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); + if (dentry->d_name.len > EXT4_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (IS_ERR(bh)) @@ -3221,9 +3212,10 @@ static int ext4_link(struct dentry *old_dentry, if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; - if (ext4_encrypted_inode(dir) && - !fscrypt_has_permitted_context(dir, inode)) - return -EPERM; + + err = fscrypt_prepare_link(old_dentry, dir, dentry); + if (err) + return err; if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && (!projid_eq(EXT4_I(dir)->i_projid, @@ -3515,12 +3507,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, EXT4_I(old_dentry->d_inode)->i_projid))) return -EXDEV; - if ((ext4_encrypted_inode(old_dir) && - !fscrypt_has_encryption_key(old_dir)) || - (ext4_encrypted_inode(new_dir) && - !fscrypt_has_encryption_key(new_dir))) - return -ENOKEY; - retval = dquot_initialize(old.dir); if (retval) return retval; @@ -3549,13 +3535,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) goto end_rename; - if ((old.dir != new.dir) && - ext4_encrypted_inode(new.dir) && - !fscrypt_has_permitted_context(new.dir, old.inode)) { - retval = -EPERM; - goto end_rename; - } - new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); if (IS_ERR(new.bh)) { @@ -3721,19 +3700,6 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int retval; struct timespec ctime; - if ((ext4_encrypted_inode(old_dir) && - !fscrypt_has_encryption_key(old_dir)) || - (ext4_encrypted_inode(new_dir) && - !fscrypt_has_encryption_key(new_dir))) - return -ENOKEY; - - if ((ext4_encrypted_inode(old_dir) || - ext4_encrypted_inode(new_dir)) && - (old_dir != new_dir) && - (!fscrypt_has_permitted_context(new_dir, old.inode) || - !fscrypt_has_permitted_context(old_dir, new.inode))) - return -EPERM; - if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && !projid_eq(EXT4_I(new_dir)->i_projid, EXT4_I(old_dentry->d_inode)->i_projid)) || @@ -3860,12 +3826,19 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { + int err; + if (unlikely(ext4_forced_shutdown(EXT4_SB(old_dir->i_sb)))) return -EIO; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; + err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); + if (err) + return err; + if (flags & RENAME_EXCHANGE) { return ext4_cross_rename(old_dir, old_dentry, new_dir, new_dentry); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 55ad7dd149d0..db7590178dfc 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/page-io.c * diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 04c90643af7a..9ffa6fad18db 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/readpage.c * diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 035cd3f4785e..50443bda8e98 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/resize.c * @@ -106,7 +107,7 @@ static int verify_group_input(struct super_block *sb, overhead = ext4_group_overhead_blocks(sb, group); metaend = start + overhead; - input->free_blocks_count = free_blocks_count = + input->free_clusters_count = free_blocks_count = input->blocks_count - 2 - overhead - sbi->s_itb_per_group; if (test_opt(sb, DEBUG)) @@ -257,6 +258,7 @@ static int ext4_alloc_group_tables(struct super_block *sb, ext4_group_t last_group; unsigned overhead; __u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0; + int i; BUG_ON(flex_gd->count == 0 || group_data == NULL); @@ -293,7 +295,7 @@ next_group: group_data[bb_index].block_bitmap = start_blk++; group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; - group_data[group].free_blocks_count--; + group_data[group].mdata_blocks++; flex_gd->bg_flags[group] &= uninit_mask; } @@ -304,7 +306,7 @@ next_group: group_data[ib_index].inode_bitmap = start_blk++; group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; - group_data[group].free_blocks_count--; + group_data[group].mdata_blocks++; flex_gd->bg_flags[group] &= uninit_mask; } @@ -323,15 +325,22 @@ next_group: if (start_blk + itb > next_group_start) { flex_gd->bg_flags[group + 1] &= uninit_mask; overhead = start_blk + itb - next_group_start; - group_data[group + 1].free_blocks_count -= overhead; + group_data[group + 1].mdata_blocks += overhead; itb -= overhead; } - group_data[group].free_blocks_count -= itb; + group_data[group].mdata_blocks += itb; flex_gd->bg_flags[group] &= uninit_mask; start_blk += EXT4_SB(sb)->s_itb_per_group; } + /* Update free clusters count to exclude metadata blocks */ + for (i = 0; i < flex_gd->count; i++) { + group_data[i].free_clusters_count -= + EXT4_NUM_B2C(EXT4_SB(sb), + group_data[i].mdata_blocks); + } + if (test_opt(sb, DEBUG)) { int i; group = group_data[0].group; @@ -341,12 +350,13 @@ next_group: flexbg_size); for (i = 0; i < flex_gd->count; i++) { - printk(KERN_DEBUG "adding %s group %u: %u " - "blocks (%d free)\n", + ext4_debug( + "adding %s group %u: %u blocks (%d free, %d mdata blocks)\n", ext4_bg_has_super(sb, group + i) ? "normal" : "no-super", group + i, group_data[i].blocks_count, - group_data[i].free_blocks_count); + group_data[i].free_clusters_count, + group_data[i].mdata_blocks); } } return 0; @@ -398,7 +408,7 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh) } /* - * set_flexbg_block_bitmap() mark @count blocks starting from @block used. + * set_flexbg_block_bitmap() mark clusters [@first_cluster, @last_cluster] used. * * Helper function for ext4_setup_new_group_blocks() which set . * @@ -408,22 +418,26 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh) */ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, struct ext4_new_flex_group_data *flex_gd, - ext4_fsblk_t block, ext4_group_t count) + ext4_fsblk_t first_cluster, ext4_fsblk_t last_cluster) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t count = last_cluster - first_cluster + 1; ext4_group_t count2; - ext4_debug("mark blocks [%llu/%u] used\n", block, count); - for (count2 = count; count > 0; count -= count2, block += count2) { + ext4_debug("mark clusters [%llu-%llu] used\n", first_cluster, + last_cluster); + for (count2 = count; count > 0; + count -= count2, first_cluster += count2) { ext4_fsblk_t start; struct buffer_head *bh; ext4_group_t group; int err; - group = ext4_get_group_number(sb, block); - start = ext4_group_first_block_no(sb, group); + group = ext4_get_group_number(sb, EXT4_C2B(sbi, first_cluster)); + start = EXT4_B2C(sbi, ext4_group_first_block_no(sb, group)); group -= flex_gd->groups[0].group; - count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start); + count2 = EXT4_CLUSTERS_PER_GROUP(sb) - (first_cluster - start); if (count2 > count) count2 = count; @@ -444,9 +458,9 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, err = ext4_journal_get_write_access(handle, bh); if (err) return err; - ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block, - block - start, count2); - ext4_set_bits(bh->b_data, block - start, count2); + ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", + first_cluster, first_cluster - start, count2); + ext4_set_bits(bh->b_data, first_cluster - start, count2); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (unlikely(err)) @@ -595,9 +609,10 @@ handle_bb: if (overhead != 0) { ext4_debug("mark backup superblock %#04llx (+0)\n", start); - ext4_set_bits(bh->b_data, 0, overhead); + ext4_set_bits(bh->b_data, 0, + EXT4_NUM_B2C(sbi, overhead)); } - ext4_mark_bitmap_end(group_data[i].blocks_count, + ext4_mark_bitmap_end(EXT4_B2C(sbi, group_data[i].blocks_count), sb->s_blocksize * 8, bh->b_data); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) @@ -642,7 +657,11 @@ handle_ib: continue; } err = set_flexbg_block_bitmap(sb, handle, - flex_gd, start, count); + flex_gd, + EXT4_B2C(sbi, start), + EXT4_B2C(sbi, + start + count + - 1)); if (err) goto out; count = group_table_count[j]; @@ -652,7 +671,11 @@ handle_ib: if (count) { err = set_flexbg_block_bitmap(sb, handle, - flex_gd, start, count); + flex_gd, + EXT4_B2C(sbi, start), + EXT4_B2C(sbi, + start + count + - 1)); if (err) goto out; } @@ -840,7 +863,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, ext4_std_error(sb, err); goto exit_inode; } - inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; + inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> + (9 - EXT4_SB(sb)->s_cluster_bits); ext4_mark_iloc_dirty(handle, inode, &iloc); memset(gdb_bh->b_data, 0, sb->s_blocksize); err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); @@ -935,6 +959,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, { struct super_block *sb = inode->i_sb; int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); + int cluster_bits = EXT4_SB(sb)->s_cluster_bits; struct buffer_head **primary; struct buffer_head *dind; struct ext4_iloc iloc; @@ -1010,7 +1035,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, if (!err) err = err2; } - inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9; + + inode->i_blocks += reserved_gdb * sb->s_blocksize >> (9 - cluster_bits); ext4_mark_iloc_dirty(handle, inode, &iloc); exit_bh: @@ -1244,7 +1270,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, ext4_group_t group; __u16 *bg_flags = flex_gd->bg_flags; int i, gdb_off, gdb_num, err = 0; - + for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) { group = group_data->group; @@ -1271,7 +1297,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, ext4_inode_table_set(sb, gdp, group_data->inode_table); ext4_free_group_clusters_set(sb, gdp, - EXT4_NUM_B2C(sbi, group_data->free_blocks_count)); + group_data->free_clusters_count); ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); if (ext4_has_group_desc_csum(sb)) ext4_itable_unused_set(sb, gdp, @@ -1327,7 +1353,7 @@ static void ext4_update_super(struct super_block *sb, */ for (i = 0; i < flex_gd->count; i++) { blocks_count += group_data[i].blocks_count; - free_blocks += group_data[i].free_blocks_count; + free_blocks += EXT4_C2B(sbi, group_data[i].free_clusters_count); } reserved_blocks = ext4_r_blocks_count(es) * 100; @@ -1499,17 +1525,18 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, ext4_fsblk_t n_blocks_count, unsigned long flexbg_size) { - struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; struct ext4_new_group_data *group_data = flex_gd->groups; ext4_fsblk_t o_blocks_count; ext4_group_t n_group; ext4_group_t group; ext4_group_t last_group; ext4_grpblk_t last; - ext4_grpblk_t blocks_per_group; + ext4_grpblk_t clusters_per_group; unsigned long i; - blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb); + clusters_per_group = EXT4_CLUSTERS_PER_GROUP(sb); o_blocks_count = ext4_blocks_count(es); @@ -1530,9 +1557,10 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, int overhead; group_data[i].group = group + i; - group_data[i].blocks_count = blocks_per_group; + group_data[i].blocks_count = EXT4_BLOCKS_PER_GROUP(sb); overhead = ext4_group_overhead_blocks(sb, group + i); - group_data[i].free_blocks_count = blocks_per_group - overhead; + group_data[i].mdata_blocks = overhead; + group_data[i].free_clusters_count = EXT4_CLUSTERS_PER_GROUP(sb); if (ext4_has_group_desc_csum(sb)) { flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | EXT4_BG_INODE_UNINIT; @@ -1546,10 +1574,10 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, /* We need to initialize block bitmap of last group. */ flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT; - if ((last_group == n_group) && (last != blocks_per_group - 1)) { - group_data[i - 1].blocks_count = last + 1; - group_data[i - 1].free_blocks_count -= blocks_per_group- - last - 1; + if ((last_group == n_group) && (last != clusters_per_group - 1)) { + group_data[i - 1].blocks_count = EXT4_C2B(sbi, last + 1); + group_data[i - 1].free_clusters_count -= clusters_per_group - + last - 1; } return 1; @@ -1796,7 +1824,8 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) } /* Do a quick sanity check of the resize inode */ - if (inode->i_blocks != 1 << (inode->i_blkbits - 9)) + if (inode->i_blocks != 1 << (inode->i_blkbits - + (9 - sbi->s_cluster_bits))) goto invalid_resize_inode; for (i = 0; i < EXT4_N_BLOCKS; i++) { if (i == EXT4_DIND_BLOCK) { @@ -1959,7 +1988,7 @@ retry: if (n_group == o_group) add = n_blocks_count - o_blocks_count; else - add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1); + add = EXT4_C2B(sbi, EXT4_CLUSTERS_PER_GROUP(sb) - (offset + 1)); if (add > 0) { err = ext4_group_extend_no_check(sb, o_blocks_count, add); if (err) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b104096fce9e..0556cd036b69 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1159,6 +1159,9 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, if (inode->i_ino == EXT4_ROOT_INO) return -EPERM; + if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode))) + return -EINVAL; + res = ext4_convert_inline_data(inode); if (res) return res; @@ -1181,7 +1184,8 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); /* - * Update inode->i_flags - e.g. S_DAX may get disabled + * Update inode->i_flags - S_ENCRYPTED will be enabled, + * S_DAX may be disabled */ ext4_set_inode_flags(inode); } @@ -1206,7 +1210,10 @@ retry: ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); - /* Update inode->i_flags - e.g. S_DAX may get disabled */ + /* + * Update inode->i_flags - S_ENCRYPTED will be enabled, + * S_DAX may be disabled + */ ext4_set_inode_flags(inode); res = ext4_mark_inode_dirty(handle, inode); if (res) @@ -1237,14 +1244,9 @@ static const struct fscrypt_operations ext4_cryptops = { .get_context = ext4_get_context, .set_context = ext4_set_context, .dummy_context = ext4_dummy_context, - .is_encrypted = ext4_encrypted_inode, .empty_dir = ext4_empty_dir, .max_namelen = ext4_max_namelen, }; -#else -static const struct fscrypt_operations ext4_cryptops = { - .is_encrypted = ext4_encrypted_inode, -}; #endif #ifdef CONFIG_QUOTA @@ -1677,7 +1679,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; return 1; case Opt_i_version: - sb->s_flags |= MS_I_VERSION; + sb->s_flags |= SB_I_VERSION; return 1; case Opt_lazytime: sb->s_flags |= MS_LAZYTIME; @@ -2060,7 +2062,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); - if (sb->s_flags & MS_I_VERSION) + if (sb->s_flags & SB_I_VERSION) SEQ_OPTS_PUTS("i_version"); if (nodefs || sbi->s_stripe) SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); @@ -2791,14 +2793,11 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) * This function is called once a day if we have errors logged * on the file system */ -static void print_daily_error_info(unsigned long arg) +static void print_daily_error_info(struct timer_list *t) { - struct super_block *sb = (struct super_block *) arg; - struct ext4_sb_info *sbi; - struct ext4_super_block *es; - - sbi = EXT4_SB(sb); - es = sbi->s_es; + struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report); + struct super_block *sb = sbi->s_sb; + struct ext4_super_block *es = sbi->s_es; if (es->s_error_count) /* fsck newer than v1.41.13 is needed to clean this condition. */ @@ -3708,6 +3707,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { + if (ext4_has_feature_inline_data(sb)) { + ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" + " that may contain inline data"); + goto failed_mount; + } err = bdev_dax_supported(sb, blocksize); if (err) goto failed_mount; @@ -3977,11 +3981,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sbi->s_gdb_count = db_count; - get_random_bytes(&sbi->s_next_generation, sizeof(u32)); - spin_lock_init(&sbi->s_next_gen_lock); - setup_timer(&sbi->s_err_report, print_daily_error_info, - (unsigned long) sb); + timer_setup(&sbi->s_err_report, print_daily_error_info, 0); /* Register extent status tree shrinker */ if (ext4_es_register_shrinker(sbi)) @@ -3996,7 +3997,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &ext4_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; +#ifdef CONFIG_EXT4_FS_ENCRYPTION sb->s_cop = &ext4_cryptops; +#endif #ifdef CONFIG_QUOTA sb->dq_op = &ext4_quota_operations; if (ext4_has_feature_quota(sb)) @@ -4612,7 +4615,8 @@ static int ext4_load_journal(struct super_block *sb, "required on readonly filesystem"); if (really_read_only) { ext4_msg(sb, KERN_ERR, "write access " - "unavailable, cannot proceed"); + "unavailable, cannot proceed " + "(try mounting with noload)"); return -EROFS; } ext4_msg(sb, KERN_INFO, "write access will " diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 5c8fc53cb0e5..a2006c9af1d9 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/symlink.c * diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 48c7a7d55ed3..e21afd52e7d7 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/sysfs.c * diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h index c70d06a383e2..b64a9fa0ff41 100644 --- a/fs/ext4/truncate.h +++ b/fs/ext4/truncate.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/ext4/truncate.h * diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 3b69330a4250..218a7ba57819 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/xattr.c * diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 0d2dde1fa87a..f8cc07588ac9 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* File: fs/ext4/xattr.h diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index a8921112030d..629001b28632 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/xattr_security.c * Handler for storing security labels as extended attributes. diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index c7765c735714..e9389e5d75c3 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/xattr_trusted.c * Handler for trusted extended attributes. diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index ca20e423034b..d4546184b34b 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/xattr_user.c * Handler for extended user attributes. diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index a0dc559b1b47..776c4b936504 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_F2FS_FS) += f2fs.o f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4b4a72f392be..115204fdefcc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -23,13 +23,11 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/quotaops.h> -#ifdef CONFIG_F2FS_FS_ENCRYPTION -#include <linux/fscrypt_supp.h> -#else -#include <linux/fscrypt_notsupp.h> -#endif #include <crypto/hash.h> +#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_F2FS_FS_ENCRYPTION) +#include <linux/fscrypt.h> + #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) #else @@ -2949,6 +2947,7 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION file_set_encrypt(inode); + inode->i_flags |= S_ENCRYPTED; #endif } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 50c88e37ed66..53fb08810ee9 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -43,8 +43,11 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & FS_DIRSYNC_FL) new_fl |= S_DIRSYNC; + if (f2fs_encrypted_inode(inode)) + new_fl |= S_ENCRYPTED; inode_set_flags(inode, new_fl, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC| + S_ENCRYPTED); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 933c3d529e65..97e03c637e90 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1594,14 +1594,9 @@ static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, - .is_encrypted = f2fs_encrypted_inode, .empty_dir = f2fs_empty_dir, .max_namelen = f2fs_max_namelen, }; -#else -static const struct fscrypt_operations f2fs_cryptops = { - .is_encrypted = f2fs_encrypted_inode, -}; #endif static struct inode *f2fs_nfs_get_inode(struct super_block *sb, @@ -2320,7 +2315,9 @@ try_onemore: #endif sb->s_op = &f2fs_sops; +#ifdef CONFIG_F2FS_FS_ENCRYPTION sb->s_cop = &f2fs_cryptops; +#endif sb->s_xattr = f2fs_xattr_handlers; sb->s_export_op = &f2fs_export_ops; sb->s_magic = F2FS_SUPER_MAGIC; diff --git a/fs/fat/Makefile b/fs/fat/Makefile index 964b634f6667..70645ce2f7fc 100644 --- a/fs/fat/Makefile +++ b/fs/fat/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the Linux fat filesystem support. # diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 5d384921524d..e9bed49df6b7 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/fat/cache.c * diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 051dac1ce3be..8fc1093da47d 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FAT_H #define _FAT_H diff --git a/fs/fcntl.c b/fs/fcntl.c index 448a1119f0be..30f47d0f74a0 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/fcntl.c * @@ -724,7 +725,7 @@ static void send_sigio_to_task(struct task_struct *p, * F_SETSIG can change ->signum lockless in parallel, make * sure we read it once and use the same value throughout. */ - int signum = ACCESS_ONCE(fown->signum); + int signum = READ_ONCE(fown->signum); if (!sigio_perm(p, fown, signum)) return; diff --git a/fs/fhandle.c b/fs/fhandle.c index 58a61f55e0d0..474adc8d2a3a 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/fs.h> diff --git a/fs/file.c b/fs/file.c index 1fc7fbbb4510..4eecbf4244a5 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/file.c * diff --git a/fs/file_table.c b/fs/file_table.c index 61517f57f8ef..49e1f2f1a4cb 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -201,11 +201,11 @@ static void __fput(struct file *file) eventpoll_release(file); locks_remove_file(file); + ima_file_free(file); if (unlikely(file->f_flags & FASYNC)) { if (file->f_op->fasync) file->f_op->fasync(-1, file, 0); } - ima_file_free(file); if (file->f_op->release) file->f_op->release(inode, file); security_file_free(file); diff --git a/fs/filesystems.c b/fs/filesystems.c index a920ad2629ac..f2728a4a03a1 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/filesystems.c * diff --git a/fs/fs_pin.c b/fs/fs_pin.c index e747b3d720ee..a6497cf8ae53 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/sched.h> #include <linux/slab.h> @@ -78,7 +79,7 @@ void mnt_pin_kill(struct mount *m) while (1) { struct hlist_node *p; rcu_read_lock(); - p = ACCESS_ONCE(m->mnt_pins.first); + p = READ_ONCE(m->mnt_pins.first); if (!p) { rcu_read_unlock(); break; @@ -92,7 +93,7 @@ void group_pin_kill(struct hlist_head *p) while (1) { struct hlist_node *q; rcu_read_lock(); - q = ACCESS_ONCE(p->first); + q = READ_ONCE(p->first); if (!q) { rcu_read_unlock(); break; diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile index 6d561531cb36..79e08e05ef84 100644 --- a/fs/fscache/Makefile +++ b/fs/fscache/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for general filesystem caching code # diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index b5ab06fabc60..0438d4cd91ef 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -331,6 +331,13 @@ static void fscache_objlist_config(struct fscache_objlist_data *data) rcu_read_lock(); confkey = user_key_payload_rcu(key); + if (!confkey) { + /* key was revoked */ + rcu_read_unlock(); + key_put(key); + goto no_config; + } + buf = confkey->data; for (len = confkey->datalen - 1; len >= 0; len--) { diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 13c65dd2d37d..a42d89371748 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -33,7 +33,7 @@ static struct fuse_dev *fuse_get_dev(struct file *file) * Lockless access is OK, because file->private data is set * once during mount and is valid until the file is released. */ - return ACCESS_ONCE(file->private_data); + return READ_ONCE(file->private_data); } static void fuse_request_init(struct fuse_req *req, struct page **pages, diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 622081b97426..24967382a7b1 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1308,7 +1308,8 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, */ over = !dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino, dirent->type); - ctx->pos = dirent->off; + if (!over) + ctx->pos = dirent->off; } buf += reclen; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 65c88379a3a1..94a745acaef8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1059,7 +1059,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_flags & MS_MANDLOCK) goto err; - sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION); + sb->s_flags &= ~(MS_NOSEC | SB_I_VERSION); if (!parse_fuse_opt(data, &d, is_bdev)) goto err; diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 90c6a8faaecb..43c827a7cce5 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -4,6 +4,7 @@ config GFS2_FS select FS_POSIX_ACL select CRC32 select QUOTACTL + select FS_IOMAP help A cluster filesystem. diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index 86128202384f..41b2aa4bc3bf 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 ccflags-y := -I$(src) obj-$(CONFIG_GFS2_FS) += gfs2.o gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \ diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 9d5eecb123de..776717f1eeea 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -141,6 +141,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) ret = __gfs2_set_acl(inode, acl, type); if (!ret && mode != inode->i_mode) { + inode->i_ctime = current_time(inode); inode->i_mode = mode; mark_inode_dirty(inode); } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 3dd0cceefa43..d5f0d96169c5 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -13,6 +13,7 @@ #include <linux/blkdev.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> +#include <linux/iomap.h> #include "gfs2.h" #include "incore.h" @@ -36,6 +37,8 @@ struct metapath { struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT]; __u16 mp_list[GFS2_MAX_META_HEIGHT]; + int mp_fheight; /* find_metapath height */ + int mp_aheight; /* actual height (lookup height) */ }; /** @@ -235,9 +238,9 @@ static void find_metapath(const struct gfs2_sbd *sdp, u64 block, { unsigned int i; + mp->mp_fheight = height; for (i = height; i--;) mp->mp_list[i] = do_div(block, sdp->sd_inptrs); - } static inline unsigned int metapath_branch_start(const struct metapath *mp) @@ -248,7 +251,7 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp) } /** - * metaptr1 - Return the first possible metadata pointer in a metaath buffer + * metaptr1 - Return the first possible metadata pointer in a metapath buffer * @height: The metadata height (0 = dinode) * @mp: The metapath */ @@ -345,10 +348,13 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) for (x = 0; x < end_of_metadata; x++) { ret = lookup_mp_height(ip, mp, x); if (ret) - return ret; + goto out; } - return ip->i_height; + ret = ip->i_height; +out: + mp->mp_aheight = ret; + return ret; } /** @@ -480,10 +486,11 @@ static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt) * @inode: The GFS2 inode * @lblock: The logical starting block of the extent * @bh_map: This is used to return the mapping details - * @mp: The metapath - * @sheight: The starting height (i.e. whats already mapped) - * @height: The height to build to + * @zero_new: True if newly allocated blocks should be zeroed + * @mp: The metapath, with proper height information calculated * @maxlen: The max number of data blocks to alloc + * @dblock: Pointer to return the resulting new block + * @dblks: Pointer to return the number of blocks allocated * * In this routine we may have to alloc: * i) Indirect blocks to grow the metadata tree height @@ -499,63 +506,63 @@ static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt) * Returns: errno on error */ -static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, - struct buffer_head *bh_map, struct metapath *mp, - const unsigned int sheight, - const unsigned int height, - const size_t maxlen) +static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, + unsigned flags, struct metapath *mp) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct super_block *sb = sdp->sd_vfs; struct buffer_head *dibh = mp->mp_bh[0]; - u64 bn, dblock = 0; + u64 bn; unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; unsigned dblks = 0; unsigned ptrs_per_blk; - const unsigned end_of_metadata = height - 1; + const unsigned end_of_metadata = mp->mp_fheight - 1; int ret; - int eob = 0; enum alloc_state state; __be64 *ptr; __be64 zero_bn = 0; + size_t maxlen = iomap->length >> inode->i_blkbits; - BUG_ON(sheight < 1); + BUG_ON(mp->mp_aheight < 1); BUG_ON(dibh == NULL); gfs2_trans_add_meta(ip->i_gl, dibh); - if (height == sheight) { + if (mp->mp_fheight == mp->mp_aheight) { struct buffer_head *bh; + int eob; + /* Bottom indirect block exists, find unalloced extent size */ ptr = metapointer(end_of_metadata, mp); bh = mp->mp_bh[end_of_metadata]; - dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, - &eob); + dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, + maxlen, &eob); BUG_ON(dblks < 1); state = ALLOC_DATA; } else { /* Need to allocate indirect blocks */ - ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs; + ptrs_per_blk = mp->mp_fheight > 1 ? sdp->sd_inptrs : + sdp->sd_diptrs; dblks = min(maxlen, (size_t)(ptrs_per_blk - mp->mp_list[end_of_metadata])); - if (height == ip->i_height) { + if (mp->mp_fheight == ip->i_height) { /* Writing into existing tree, extend tree down */ - iblks = height - sheight; + iblks = mp->mp_fheight - mp->mp_aheight; state = ALLOC_GROW_DEPTH; } else { /* Building up tree height */ state = ALLOC_GROW_HEIGHT; - iblks = height - ip->i_height; + iblks = mp->mp_fheight - ip->i_height; branch_start = metapath_branch_start(mp); - iblks += (height - branch_start); + iblks += (mp->mp_fheight - branch_start); } } /* start of the second part of the function (state machine) */ blks = dblks + iblks; - i = sheight; + i = mp->mp_aheight; do { int error; n = blks - alloced; @@ -573,9 +580,10 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, sizeof(struct gfs2_dinode)); zero_bn = *ptr; } - for (; i - 1 < height - ip->i_height && n > 0; i++, n--) + for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0; + i++, n--) gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++); - if (i - 1 == height - ip->i_height) { + if (i - 1 == mp->mp_fheight - ip->i_height) { i--; gfs2_buffer_copy_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header), @@ -587,7 +595,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, sizeof(struct gfs2_meta_header)); *ptr = zero_bn; state = ALLOC_GROW_DEPTH; - for(i = branch_start; i < height; i++) { + for(i = branch_start; i < mp->mp_fheight; i++) { if (mp->mp_bh[i] == NULL) break; brelse(mp->mp_bh[i]); @@ -599,12 +607,12 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, break; /* Branching from existing tree */ case ALLOC_GROW_DEPTH: - if (i > 1 && i < height) + if (i > 1 && i < mp->mp_fheight) gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]); - for (; i < height && n > 0; i++, n--) + for (; i < mp->mp_fheight && n > 0; i++, n--) gfs2_indirect_init(mp, ip->i_gl, i, mp->mp_list[i-1], bn++); - if (i == height) + if (i == mp->mp_fheight) state = ALLOC_DATA; if (n == 0) break; @@ -615,119 +623,269 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]); dblks = n; ptr = metapointer(end_of_metadata, mp); - dblock = bn; + iomap->addr = bn << inode->i_blkbits; + iomap->flags |= IOMAP_F_NEW; while (n-- > 0) *ptr++ = cpu_to_be64(bn++); - if (buffer_zeronew(bh_map)) { - ret = sb_issue_zeroout(sb, dblock, dblks, - GFP_NOFS); + if (flags & IOMAP_ZERO) { + ret = sb_issue_zeroout(sb, iomap->addr >> inode->i_blkbits, + dblks, GFP_NOFS); if (ret) { fs_err(sdp, "Failed to zero data buffers\n"); - clear_buffer_zeronew(bh_map); + flags &= ~IOMAP_ZERO; } } break; } - } while ((state != ALLOC_DATA) || !dblock); + } while (iomap->addr == IOMAP_NULL_ADDR); - ip->i_height = height; + iomap->length = (u64)dblks << inode->i_blkbits; + ip->i_height = mp->mp_fheight; gfs2_add_inode_blocks(&ip->i_inode, alloced); gfs2_dinode_out(ip, mp->mp_bh[0]->b_data); - map_bh(bh_map, inode->i_sb, dblock); - bh_map->b_size = dblks << inode->i_blkbits; - set_buffer_new(bh_map); return 0; } /** - * gfs2_block_map - Map a block from an inode to a disk block + * hole_size - figure out the size of a hole * @inode: The inode - * @lblock: The logical block number - * @bh_map: The bh to be mapped - * @create: True if its ok to alloc blocks to satify the request + * @lblock: The logical starting block number + * @mp: The metapath * - * Sets buffer_mapped() if successful, sets buffer_boundary() if a - * read of metadata will be required before the next block can be - * mapped. Sets buffer_new() if new blocks were allocated. + * Returns: The hole size in bytes * - * Returns: errno */ +static u64 hole_size(struct inode *inode, sector_t lblock, struct metapath *mp) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct metapath mp_eof; + u64 factor = 1; + int hgt; + u64 holesz = 0; + const __be64 *first, *end, *ptr; + const struct buffer_head *bh; + u64 lblock_stop = (i_size_read(inode) - 1) >> inode->i_blkbits; + int zeroptrs; + bool done = false; + + /* Get another metapath, to the very last byte */ + find_metapath(sdp, lblock_stop, &mp_eof, ip->i_height); + for (hgt = ip->i_height - 1; hgt >= 0 && !done; hgt--) { + bh = mp->mp_bh[hgt]; + if (bh) { + zeroptrs = 0; + first = metapointer(hgt, mp); + end = (const __be64 *)(bh->b_data + bh->b_size); + + for (ptr = first; ptr < end; ptr++) { + if (*ptr) { + done = true; + break; + } else { + zeroptrs++; + } + } + } else { + zeroptrs = sdp->sd_inptrs; + } + if (factor * zeroptrs >= lblock_stop - lblock + 1) { + holesz = lblock_stop - lblock + 1; + break; + } + holesz += factor * zeroptrs; -int gfs2_block_map(struct inode *inode, sector_t lblock, - struct buffer_head *bh_map, int create) + factor *= sdp->sd_inptrs; + if (hgt && (mp->mp_list[hgt - 1] < mp_eof.mp_list[hgt - 1])) + (mp->mp_list[hgt - 1])++; + } + return holesz << inode->i_blkbits; +} + +static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + iomap->addr = (ip->i_no_addr << inode->i_blkbits) + + sizeof(struct gfs2_dinode); + iomap->offset = 0; + iomap->length = i_size_read(inode); + iomap->type = IOMAP_MAPPED; + iomap->flags = IOMAP_F_DATA_INLINE; +} + +/** + * gfs2_iomap_begin - Map blocks from an inode to disk blocks + * @inode: The inode + * @pos: Starting position in bytes + * @length: Length to map, in bytes + * @flags: iomap flags + * @iomap: The iomap structure + * + * Returns: errno + */ +int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap *iomap) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - unsigned int bsize = sdp->sd_sb.sb_bsize; - const size_t maxlen = bh_map->b_size >> inode->i_blkbits; + struct metapath mp = { .mp_aheight = 1, }; + unsigned int factor = sdp->sd_sb.sb_bsize; const u64 *arr = sdp->sd_heightsize; __be64 *ptr; - u64 size; - struct metapath mp; + sector_t lblock; + sector_t lend; int ret; int eob; unsigned int len; struct buffer_head *bh; u8 height; - BUG_ON(maxlen == 0); + trace_gfs2_iomap_start(ip, pos, length, flags); + if (!length) { + ret = -EINVAL; + goto out; + } - memset(&mp, 0, sizeof(mp)); - bmap_lock(ip, create); - clear_buffer_mapped(bh_map); - clear_buffer_new(bh_map); - clear_buffer_boundary(bh_map); - trace_gfs2_bmap(ip, bh_map, lblock, create, 1); + if ((flags & IOMAP_REPORT) && gfs2_is_stuffed(ip)) { + gfs2_stuffed_iomap(inode, iomap); + if (pos >= iomap->length) + return -ENOENT; + ret = 0; + goto out; + } + + lblock = pos >> inode->i_blkbits; + lend = (pos + length + sdp->sd_sb.sb_bsize - 1) >> inode->i_blkbits; + + iomap->offset = lblock << inode->i_blkbits; + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + iomap->length = (u64)(lend - lblock) << inode->i_blkbits; + iomap->flags = IOMAP_F_MERGED; + bmap_lock(ip, 0); + + /* + * Directory data blocks have a struct gfs2_meta_header header, so the + * remaining size is smaller than the filesystem block size. Logical + * block numbers for directories are in units of this remaining size! + */ if (gfs2_is_dir(ip)) { - bsize = sdp->sd_jbsize; + factor = sdp->sd_jbsize; arr = sdp->sd_jheightsize; } ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]); if (ret) - goto out; + goto out_release; height = ip->i_height; - size = (lblock + 1) * bsize; - while (size > arr[height]) + while ((lblock + 1) * factor > arr[height]) height++; find_metapath(sdp, lblock, &mp, height); - ret = 1; if (height > ip->i_height || gfs2_is_stuffed(ip)) goto do_alloc; + ret = lookup_metapath(ip, &mp); if (ret < 0) - goto out; - if (ret != ip->i_height) + goto out_release; + + if (mp.mp_aheight != ip->i_height) goto do_alloc; + ptr = metapointer(ip->i_height - 1, &mp); if (*ptr == 0) goto do_alloc; - map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr)); + + iomap->type = IOMAP_MAPPED; + iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits; + bh = mp.mp_bh[ip->i_height - 1]; - len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob); - bh_map->b_size = (len << inode->i_blkbits); + len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, lend - lblock, &eob); if (eob) - set_buffer_boundary(bh_map); + iomap->flags |= IOMAP_F_BOUNDARY; + iomap->length = (u64)len << inode->i_blkbits; + ret = 0; -out: + +out_release: release_metapath(&mp); - trace_gfs2_bmap(ip, bh_map, lblock, create, ret); - bmap_unlock(ip, create); + bmap_unlock(ip, 0); +out: + trace_gfs2_iomap_end(ip, iomap, ret); return ret; do_alloc: - /* All allocations are done here, firstly check create flag */ - if (!create) { - BUG_ON(gfs2_is_stuffed(ip)); + if (!(flags & IOMAP_WRITE)) { + if (pos >= i_size_read(inode)) { + ret = -ENOENT; + goto out_release; + } ret = 0; + iomap->length = hole_size(inode, lblock, &mp); + goto out_release; + } + + ret = gfs2_iomap_alloc(inode, iomap, flags, &mp); + goto out_release; +} + +/** + * gfs2_block_map - Map a block from an inode to a disk block + * @inode: The inode + * @lblock: The logical block number + * @bh_map: The bh to be mapped + * @create: True if its ok to alloc blocks to satify the request + * + * Sets buffer_mapped() if successful, sets buffer_boundary() if a + * read of metadata will be required before the next block can be + * mapped. Sets buffer_new() if new blocks were allocated. + * + * Returns: errno + */ + +int gfs2_block_map(struct inode *inode, sector_t lblock, + struct buffer_head *bh_map, int create) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct iomap iomap; + int ret, flags = 0; + + clear_buffer_mapped(bh_map); + clear_buffer_new(bh_map); + clear_buffer_boundary(bh_map); + trace_gfs2_bmap(ip, bh_map, lblock, create, 1); + + if (create) + flags |= IOMAP_WRITE; + if (buffer_zeronew(bh_map)) + flags |= IOMAP_ZERO; + ret = gfs2_iomap_begin(inode, (loff_t)lblock << inode->i_blkbits, + bh_map->b_size, flags, &iomap); + if (ret) { + if (!create && ret == -ENOENT) { + /* Return unmapped buffer beyond the end of file. */ + ret = 0; + } goto out; } - /* At this point ret is the tree depth of already allocated blocks */ - ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen); - goto out; + if (iomap.length > bh_map->b_size) { + iomap.length = bh_map->b_size; + iomap.flags &= ~IOMAP_F_BOUNDARY; + } + if (iomap.addr != IOMAP_NULL_ADDR) + map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits); + bh_map->b_size = iomap.length; + if (iomap.flags & IOMAP_F_BOUNDARY) + set_buffer_boundary(bh_map); + if (iomap.flags & IOMAP_F_NEW) + set_buffer_new(bh_map); + +out: + trace_gfs2_bmap(ip, bh_map, lblock, create, ret); + return ret; } /* diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index 81ded5e2aaa2..443cc182cf18 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -10,6 +10,8 @@ #ifndef __BMAP_DOT_H__ #define __BMAP_DOT_H__ +#include <linux/iomap.h> + #include "inode.h" struct inode; @@ -47,6 +49,8 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); extern int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); +extern int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap *iomap); extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); extern int gfs2_setattr_size(struct inode *inode, u64 size); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 33a0cb5701a3..58705ef8643a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -60,9 +60,7 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence) loff_t error; switch (whence) { - case SEEK_END: /* These reference inode->i_size */ - case SEEK_DATA: - case SEEK_HOLE: + case SEEK_END: error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (!error) { @@ -70,8 +68,21 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence) gfs2_glock_dq_uninit(&i_gh); } break; + + case SEEK_DATA: + error = gfs2_seek_data(file, offset); + break; + + case SEEK_HOLE: + error = gfs2_seek_hole(file, offset); + break; + case SEEK_CUR: case SEEK_SET: + /* + * These don't reference inode->i_size and don't depend on the + * block mapping, so we don't need the glock. + */ error = generic_file_llseek(file, offset, whence); break; default: @@ -108,45 +119,22 @@ static int gfs2_readdir(struct file *file, struct dir_context *ctx) } /** - * fsflags_cvt - * @table: A table of 32 u32 flags - * @val: a 32 bit value to convert - * - * This function can be used to convert between fsflags values and - * GFS2's own flags values. + * fsflag_gfs2flag * - * Returns: the converted flags + * The FS_JOURNAL_DATA_FL flag maps to GFS2_DIF_INHERIT_JDATA for directories, + * and to GFS2_DIF_JDATA for non-directories. */ -static u32 fsflags_cvt(const u32 *table, u32 val) -{ - u32 res = 0; - while(val) { - if (val & 1) - res |= *table; - table++; - val >>= 1; - } - return res; -} - -static const u32 fsflags_to_gfs2[32] = { - [3] = GFS2_DIF_SYNC, - [4] = GFS2_DIF_IMMUTABLE, - [5] = GFS2_DIF_APPENDONLY, - [7] = GFS2_DIF_NOATIME, - [12] = GFS2_DIF_EXHASH, - [14] = GFS2_DIF_INHERIT_JDATA, - [17] = GFS2_DIF_TOPDIR, -}; - -static const u32 gfs2_to_fsflags[32] = { - [gfs2fl_Sync] = FS_SYNC_FL, - [gfs2fl_Immutable] = FS_IMMUTABLE_FL, - [gfs2fl_AppendOnly] = FS_APPEND_FL, - [gfs2fl_NoAtime] = FS_NOATIME_FL, - [gfs2fl_ExHash] = FS_INDEX_FL, - [gfs2fl_TopLevel] = FS_TOPDIR_FL, - [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, +static struct { + u32 fsflag; + u32 gfsflag; +} fsflag_gfs2flag[] = { + {FS_SYNC_FL, GFS2_DIF_SYNC}, + {FS_IMMUTABLE_FL, GFS2_DIF_IMMUTABLE}, + {FS_APPEND_FL, GFS2_DIF_APPENDONLY}, + {FS_NOATIME_FL, GFS2_DIF_NOATIME}, + {FS_INDEX_FL, GFS2_DIF_EXHASH}, + {FS_TOPDIR_FL, GFS2_DIF_TOPDIR}, + {FS_JOURNAL_DATA_FL, GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA}, }; static int gfs2_get_flags(struct file *filp, u32 __user *ptr) @@ -154,17 +142,23 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr) struct inode *inode = file_inode(filp); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; - int error; - u32 fsflags; + int i, error; + u32 gfsflags, fsflags = 0; gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); error = gfs2_glock_nq(&gh); if (error) goto out_uninit; - fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags); - if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA) - fsflags |= FS_JOURNAL_DATA_FL; + gfsflags = ip->i_diskflags; + if (S_ISDIR(inode->i_mode)) + gfsflags &= ~GFS2_DIF_JDATA; + else + gfsflags &= ~GFS2_DIF_INHERIT_JDATA; + for (i = 0; i < ARRAY_SIZE(fsflag_gfs2flag); i++) + if (gfsflags & fsflag_gfs2flag[i].gfsflag) + fsflags |= fsflag_gfs2flag[i].fsflag; + if (put_user(fsflags, ptr)) error = -EFAULT; @@ -199,7 +193,6 @@ void gfs2_set_inode_flags(struct inode *inode) GFS2_DIF_APPENDONLY| \ GFS2_DIF_NOATIME| \ GFS2_DIF_SYNC| \ - GFS2_DIF_SYSTEM| \ GFS2_DIF_TOPDIR| \ GFS2_DIF_INHERIT_JDATA) @@ -238,10 +231,6 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) if ((new_flags ^ flags) == 0) goto out; - error = -EINVAL; - if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET) - goto out; - error = -EPERM; if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE)) goto out; @@ -256,7 +245,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) goto out; } if ((flags ^ new_flags) & GFS2_DIF_JDATA) { - if (flags & GFS2_DIF_JDATA) + if (new_flags & GFS2_DIF_JDATA) gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH); error = filemap_fdatawrite(inode->i_mapping); if (error) @@ -264,6 +253,8 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) error = filemap_fdatawait(inode->i_mapping); if (error) goto out; + if (new_flags & GFS2_DIF_JDATA) + gfs2_ordered_del_inode(ip); } error = gfs2_trans_begin(sdp, RES_DINODE, 0); if (error) @@ -271,6 +262,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) error = gfs2_meta_inode_buffer(ip, &bh); if (error) goto out_trans_end; + inode->i_ctime = current_time(inode); gfs2_trans_add_meta(ip->i_gl, bh); ip->i_diskflags = new_flags; gfs2_dinode_out(ip, bh->b_data); @@ -289,19 +281,33 @@ out_drop_write: static int gfs2_set_flags(struct file *filp, u32 __user *ptr) { struct inode *inode = file_inode(filp); - u32 fsflags, gfsflags; + u32 fsflags, gfsflags = 0; + u32 mask; + int i; if (get_user(fsflags, ptr)) return -EFAULT; - gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); - if (!S_ISDIR(inode->i_mode)) { - gfsflags &= ~GFS2_DIF_TOPDIR; - if (gfsflags & GFS2_DIF_INHERIT_JDATA) - gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); - return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_SYSTEM); + for (i = 0; i < ARRAY_SIZE(fsflag_gfs2flag); i++) { + if (fsflags & fsflag_gfs2flag[i].fsflag) { + fsflags &= ~fsflag_gfs2flag[i].fsflag; + gfsflags |= fsflag_gfs2flag[i].gfsflag; + } + } + if (fsflags || gfsflags & ~GFS2_FLAGS_USER_SET) + return -EINVAL; + + mask = GFS2_FLAGS_USER_SET; + if (S_ISDIR(inode->i_mode)) { + mask &= ~GFS2_DIF_JDATA; + } else { + /* The GFS2_DIF_TOPDIR flag is only valid for directories. */ + if (gfsflags & GFS2_DIF_TOPDIR) + return -EINVAL; + mask &= ~(GFS2_DIF_TOPDIR | GFS2_DIF_INHERIT_JDATA); } - return do_gfs2_set_flags(filp, gfsflags, ~(GFS2_DIF_SYSTEM | GFS2_DIF_JDATA)); + + return do_gfs2_set_flags(filp, gfsflags, mask); } static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 863749e29bf9..4e971b1c7f92 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -18,7 +18,7 @@ #include <linux/posix_acl.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> -#include <linux/fiemap.h> +#include <linux/iomap.h> #include <linux/security.h> #include <linux/uaccess.h> @@ -189,7 +189,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, gfs2_set_iop(inode); - inode->i_atime.tv_sec = 0; + /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */ + inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1); inode->i_atime.tv_nsec = 0; unlock_new_inode(inode); @@ -1986,6 +1987,7 @@ static int gfs2_getattr(const struct path *path, struct kstat *stat, struct inode *inode = d_inode(path->dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; + u32 gfsflags; int error; gfs2_holder_mark_uninitialized(&gh); @@ -1995,13 +1997,30 @@ static int gfs2_getattr(const struct path *path, struct kstat *stat, return error; } + gfsflags = ip->i_diskflags; + if (gfsflags & GFS2_DIF_APPENDONLY) + stat->attributes |= STATX_ATTR_APPEND; + if (gfsflags & GFS2_DIF_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_COMPRESSED | + STATX_ATTR_ENCRYPTED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); + generic_fillattr(inode, stat); + if (gfs2_holder_initialized(&gh)) gfs2_glock_dq_uninit(&gh); return 0; } +const struct iomap_ops gfs2_iomap_ops = { + .iomap_begin = gfs2_iomap_begin, +}; + static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -2009,41 +2028,59 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct gfs2_holder gh; int ret; - ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); - if (ret) - return ret; - - inode_lock(inode); + inode_lock_shared(inode); ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); if (ret) goto out; - if (gfs2_is_stuffed(ip)) { - u64 phys = ip->i_no_addr << inode->i_blkbits; - u64 size = i_size_read(inode); - u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED| - FIEMAP_EXTENT_DATA_INLINE; - phys += sizeof(struct gfs2_dinode); - phys += start; - if (start + len > size) - len = size - start; - if (start < size) - ret = fiemap_fill_next_extent(fieinfo, start, phys, - len, flags); - if (ret == 1) - ret = 0; - } else { - ret = __generic_block_fiemap(inode, fieinfo, start, len, - gfs2_block_map); - } + ret = iomap_fiemap(inode, fieinfo, start, len, &gfs2_iomap_ops); gfs2_glock_dq_uninit(&gh); + out: - inode_unlock(inode); + inode_unlock_shared(inode); return ret; } +loff_t gfs2_seek_data(struct file *file, loff_t offset) +{ + struct inode *inode = file->f_mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + loff_t ret; + + inode_lock_shared(inode); + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (!ret) + ret = iomap_seek_data(inode, offset, &gfs2_iomap_ops); + gfs2_glock_dq_uninit(&gh); + inode_unlock_shared(inode); + + if (ret < 0) + return ret; + return vfs_setpos(file, ret, inode->i_sb->s_maxbytes); +} + +loff_t gfs2_seek_hole(struct file *file, loff_t offset) +{ + struct inode *inode = file->f_mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + loff_t ret; + + inode_lock_shared(inode); + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (!ret) + ret = iomap_seek_hole(inode, offset, &gfs2_iomap_ops); + gfs2_glock_dq_uninit(&gh); + inode_unlock_shared(inode); + + if (ret < 0) + return ret; + return vfs_setpos(file, ret, inode->i_sb->s_maxbytes); +} + const struct inode_operations gfs2_file_iops = { .permission = gfs2_permission, .setattr = gfs2_setattr, diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index aace8ce34a18..b5b6341a4f5c 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -109,6 +109,8 @@ extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); extern int gfs2_open_common(struct inode *inode, struct file *file); +extern loff_t gfs2_seek_data(struct file *file, loff_t offset); +extern loff_t gfs2_seek_hole(struct file *file, loff_t offset); extern const struct inode_operations gfs2_file_iops; extern const struct inode_operations gfs2_dir_iops; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 8e54f2e3a304..9cb5c9a97d69 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -754,14 +754,15 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); struct backing_dev_info *bdi = inode_to_bdi(metamapping->host); int ret = 0; + bool flush_all = (wbc->sync_mode == WB_SYNC_ALL || gfs2_is_jdata(ip)); - if (wbc->sync_mode == WB_SYNC_ALL) + if (flush_all) gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH); if (bdi->wb.dirty_exceeded) gfs2_ail1_flush(sdp, wbc); else filemap_fdatawrite(metamapping); - if (wbc->sync_mode == WB_SYNC_ALL) + if (flush_all) ret = filemap_fdatawait(metamapping); if (ret) mark_inode_dirty_sync(inode); diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 49ac55da4e33..f67a709589d3 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM gfs2 @@ -12,6 +13,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/writeback.h> #include <linux/ktime.h> +#include <linux/iomap.h> #include "incore.h" #include "glock.h" #include "rgrp.h" @@ -469,6 +471,70 @@ TRACE_EVENT(gfs2_bmap, __entry->errno) ); +TRACE_EVENT(gfs2_iomap_start, + + TP_PROTO(const struct gfs2_inode *ip, loff_t pos, ssize_t length, + u16 flags), + + TP_ARGS(ip, pos, length, flags), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, inum ) + __field( loff_t, pos ) + __field( ssize_t, length ) + __field( u16, flags ) + ), + + TP_fast_assign( + __entry->dev = ip->i_gl->gl_name.ln_sbd->sd_vfs->s_dev; + __entry->inum = ip->i_no_addr; + __entry->pos = pos; + __entry->length = length; + __entry->flags = flags; + ), + + TP_printk("%u,%u bmap %llu iomap start %llu/%lu flags:%08x", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->pos, + (unsigned long)__entry->length, (u16)__entry->flags) +); + +TRACE_EVENT(gfs2_iomap_end, + + TP_PROTO(const struct gfs2_inode *ip, struct iomap *iomap, int ret), + + TP_ARGS(ip, iomap, ret), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, inum ) + __field( loff_t, offset ) + __field( ssize_t, length ) + __field( u16, flags ) + __field( u16, type ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->dev = ip->i_gl->gl_name.ln_sbd->sd_vfs->s_dev; + __entry->inum = ip->i_no_addr; + __entry->offset = iomap->offset; + __entry->length = iomap->length; + __entry->flags = iomap->flags; + __entry->type = iomap->type; + __entry->ret = ret; + ), + + TP_printk("%u,%u bmap %llu iomap end %llu/%lu ty:%d flags:%08x rc:%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->offset, + (unsigned long)__entry->length, (u16)__entry->type, + (u16)__entry->flags, __entry->ret) +); + /* Keep track of blocks as they are allocated/freed */ TRACE_EVENT(gfs2_block_alloc, diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index affef3c066e0..a85ca8b2c9ba 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -145,7 +145,7 @@ static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, * * This is used in two distinct cases: * i) In ordered write mode - * We put the data buffer on a list so that we can ensure that its + * We put the data buffer on a list so that we can ensure that it's * synced to disk at the right time * ii) In journaled data mode * We need to journal the data block in the same way as metadata in diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index ea09e41dbb49..05de20954659 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -231,7 +231,6 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; struct gfs2_holder rg_gh; - struct buffer_head *dibh; __be64 *dataptrs; u64 bn = 0; u64 bstart = 0; @@ -308,13 +307,8 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, ea->ea_num_ptrs = 0; } - error = gfs2_meta_inode_buffer(ip, &dibh); - if (!error) { - ip->i_inode.i_ctime = current_time(&ip->i_inode); - gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - } + ip->i_inode.i_ctime = current_time(&ip->i_inode); + __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); gfs2_trans_end(sdp); @@ -616,7 +610,6 @@ static int gfs2_xattr_get(const struct xattr_handler *handler, { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; - bool need_unlock = false; int ret; /* During lookup, SELinux calls this function with the glock locked. */ @@ -625,10 +618,11 @@ static int gfs2_xattr_get(const struct xattr_handler *handler, ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); if (ret) return ret; - need_unlock = true; + } else { + gfs2_holder_mark_uninitialized(&gh); } ret = __gfs2_xattr_get(inode, name, buffer, size, handler->flags); - if (need_unlock) + if (gfs2_holder_initialized(&gh)) gfs2_glock_dq_uninit(&gh); return ret; } @@ -749,7 +743,6 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, ea_skeleton_call_t skeleton_call, void *private) { struct gfs2_alloc_parms ap = { .target = blks }; - struct buffer_head *dibh; int error; error = gfs2_rindex_update(GFS2_SB(&ip->i_inode)); @@ -774,13 +767,8 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (error) goto out_end_trans; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (!error) { - ip->i_inode.i_ctime = current_time(&ip->i_inode); - gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - } + ip->i_inode.i_ctime = current_time(&ip->i_inode); + __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); out_end_trans: gfs2_trans_end(GFS2_SB(&ip->i_inode)); @@ -891,7 +879,6 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct ea_set *es) { struct gfs2_ea_request *er = es->es_er; - struct buffer_head *dibh; int error; error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0); @@ -908,14 +895,9 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, if (es->es_el) ea_set_remove_stuffed(ip, es->es_el); - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto out; ip->i_inode.i_ctime = current_time(&ip->i_inode); - gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); -out: + __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); + gfs2_trans_end(GFS2_SB(&ip->i_inode)); return error; } @@ -1111,7 +1093,6 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) { struct gfs2_ea_header *ea = el->el_ea; struct gfs2_ea_header *prev = el->el_prev; - struct buffer_head *dibh; int error; error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0); @@ -1132,13 +1113,8 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) ea->ea_type = GFS2_EATYPE_UNUSED; } - error = gfs2_meta_inode_buffer(ip, &dibh); - if (!error) { - ip->i_inode.i_ctime = current_time(&ip->i_inode); - gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - } + ip->i_inode.i_ctime = current_time(&ip->i_inode); + __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); gfs2_trans_end(GFS2_SB(&ip->i_inode)); @@ -1268,11 +1244,20 @@ static int gfs2_xattr_set(const struct xattr_handler *handler, if (ret) return ret; - ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (ret) - return ret; + /* May be called from gfs_setattr with the glock locked. */ + + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) + return ret; + } else { + if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE)) + return -EIO; + gfs2_holder_mark_uninitialized(&gh); + } ret = __gfs2_xattr_set(inode, name, value, size, flags, handler->flags); - gfs2_glock_dq_uninit(&gh); + if (gfs2_holder_initialized(&gh)) + gfs2_glock_dq_uninit(&gh); return ret; } diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c index 0933600e11c8..74fa62643136 100644 --- a/fs/hfs/attr.c +++ b/fs/hfs/attr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfs/attr.c * diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c index de69d8a24f6d..4af318fbda77 100644 --- a/fs/hfs/bfind.c +++ b/fs/hfs/bfind.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfs/bfind.c * diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index d77d844b668b..8aec5e732abf 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfs/bnode.c * diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index 6fc766df0461..ad04a5741016 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfs/brec.c * diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 37cdd955eceb..374b5688e29e 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfs/btree.c * diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h index f6bd266d70b5..c8b252dbb26c 100644 --- a/fs/hfs/btree.h +++ b/fs/hfs/btree.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/hfs/btree.h * diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile index 683fca2e5e65..f6a56542f8d7 100644 --- a/fs/hfsplus/Makefile +++ b/fs/hfsplus/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # ## Makefile for the linux hfsplus filesystem routines. # diff --git a/fs/hfsplus/acl.h b/fs/hfsplus/acl.h index 95c8ed9ec17f..488c2b75cf41 100644 --- a/fs/hfsplus/acl.h +++ b/fs/hfsplus/acl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/hfsplus/acl.h * diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c index e5b221de7de6..2bab6b3cdba4 100644 --- a/fs/hfsplus/attributes.c +++ b/fs/hfsplus/attributes.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/attributes.c * diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index 528e38b5af7f..ca2ba8c9f82e 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/bfind.c * diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index c0ae274c0a22..cebce0cfe340 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/bitmap.c * diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index ce014ceb89ef..d77015c3f22c 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/bnode.c * diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 754fdf8c6356..808f4d8c859c 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/brec.c * diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index d9d1a36ba826..de14b2b6881b 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/btree.c * diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index a5e00f7a4c14..a196369ba779 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/catalog.c * diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 31d5e3f1fe17..e8120a282435 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/dir.c * diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index a3eb640b4f8f..e8770935ce6d 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/extents.c * diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index a3f03b247463..a015044daa05 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/include/linux/hfsplus_fs.h * diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h index 8298d0985f81..456e87aec7fd 100644 --- a/fs/hfsplus/hfsplus_raw.h +++ b/fs/hfsplus/hfsplus_raw.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/include/linux/hfsplus_raw.h * diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 4f26b6877130..190c60efbc99 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/inode.c * diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 0a156d84e67d..5e6502ef7415 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/ioctl.c * diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index bb806e58c977..047e05c57560 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/options.c * diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c index 6bb5d7c42888..066114dcc3a2 100644 --- a/fs/hfsplus/posix_acl.c +++ b/fs/hfsplus/posix_acl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/posix_acl.c * diff --git a/fs/hfsplus/tables.c b/fs/hfsplus/tables.c index 1b911730a0c1..a5fb8ee7d019 100644 --- a/fs/hfsplus/tables.c +++ b/fs/hfsplus/tables.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/tables.c * diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index e563939882f3..dfa90c21948f 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/unicode.c * diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 10032b919a85..08c1580bdf7a 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/wrapper.c * diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index d37bb88dc746..e538b758c448 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/xattr.c * diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index 68f6b539371f..a4e611d69710 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/hfsplus/xattr.h * diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index 37b3efa733ef..f5550b006e88 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/xattr_trusted.c * diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c index 94519d6c627d..fbad91e1dada 100644 --- a/fs/hfsplus/xattr_trusted.c +++ b/fs/hfsplus/xattr_trusted.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/xattr_trusted.c * diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c index fae6c0ea0030..74d19faf255e 100644 --- a/fs/hfsplus/xattr_user.c +++ b/fs/hfsplus/xattr_user.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/xattr_user.c * diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 91e19f9dffe5..ffaec2e7526c 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UM_FS_HOSTFS #define __UM_FS_HOSTFS diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c index 098bf0f4f386..66617b1557c6 100644 --- a/fs/hpfs/alloc.c +++ b/fs/hpfs/alloc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hpfs/alloc.c * diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c index 2d5b254ad9e2..c14c9a035ee0 100644 --- a/fs/hpfs/anode.c +++ b/fs/hpfs/anode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hpfs/anode.c * diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c index f626114449e4..e285d6b3bba4 100644 --- a/fs/hpfs/buffer.c +++ b/fs/hpfs/buffer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hpfs/buffer.c * diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c index bb87d65f0d97..89a36fdc68cb 100644 --- a/fs/hpfs/dentry.c +++ b/fs/hpfs/dentry.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hpfs/dentry.c * diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index fa6bbb4f509f..8d6b7e35faf9 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hpfs/dir.c * diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c index 86ab7e790b4e..3b834563b1f1 100644 --- a/fs/hpfs/dnode.c +++ b/fs/hpfs/dnode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hpfs/dnode.c * diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c index ce3f98ba993a..102ba18e561f 100644 --- a/fs/hpfs/ea.c +++ b/fs/hpfs/ea.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hpfs/ea.c * diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index f26138425b16..1ecec124e76f 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hpfs/file.c * diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h index cce025aff1b1..823a328791c0 100644 --- a/fs/hpfs/hpfs.h +++ b/fs/hpfs/hpfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/hpfs/hpfs.h * diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index d352f3a6af7f..2577ef1034ef 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/hpfs/hpfs_fn.h * diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index b9c724ed1e7e..eb8b4baf0f2e 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hpfs/inode.c * diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c index a136929189f0..e0e60b148400 100644 --- a/fs/hpfs/map.c +++ b/fs/hpfs/map.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hpfs/map.c * diff --git a/fs/hpfs/name.c b/fs/hpfs/name.c index b00d396d22c6..ef7ba77f36b8 100644 --- a/fs/hpfs/name.c +++ b/fs/hpfs/name.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hpfs/name.c * diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index f30c14414518..a3615e4c730d 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hpfs/namei.c * diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 59073e9f01a4..ed113ea17aff 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -842,9 +842,12 @@ static int hugetlbfs_error_remove_page(struct address_space *mapping, struct page *page) { struct inode *inode = mapping->host; + pgoff_t index = page->index; remove_huge_page(page); - hugetlb_fix_reserve_counts(inode); + if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1))) + hugetlb_fix_reserve_counts(inode); + return 0; } diff --git a/fs/inode.c b/fs/inode.c index d1e35b53bb23..fd401028a309 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2090,7 +2090,7 @@ void inode_set_flags(struct inode *inode, unsigned int flags, WARN_ON_ONCE(flags & ~mask); do { - old_flags = ACCESS_ONCE(inode->i_flags); + old_flags = READ_ONCE(inode->i_flags); new_flags = (old_flags & ~mask) | flags; } while (unlikely(cmpxchg(&inode->i_flags, old_flags, new_flags) != old_flags)); diff --git a/fs/ioctl.c b/fs/ioctl.c index 569db68d02b3..5ace7efb0d04 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ioctl.c * diff --git a/fs/iomap.c b/fs/iomap.c index be61cf742b5e..5011a964a550 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -350,8 +350,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, struct iomap *iomap) { - sector_t sector = iomap->blkno + - (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9); + sector_t sector = (iomap->addr + + (pos & PAGE_MASK) - iomap->offset) >> 9; return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector, offset, bytes); @@ -510,11 +510,12 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, flags |= FIEMAP_EXTENT_MERGED; if (iomap->flags & IOMAP_F_SHARED) flags |= FIEMAP_EXTENT_SHARED; + if (iomap->flags & IOMAP_F_DATA_INLINE) + flags |= FIEMAP_EXTENT_DATA_INLINE; return fiemap_fill_next_extent(fi, iomap->offset, - iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0, + iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0, iomap->length, flags); - } static loff_t @@ -714,23 +715,9 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) { struct kiocb *iocb = dio->iocb; struct inode *inode = file_inode(iocb->ki_filp); + loff_t offset = iocb->ki_pos; ssize_t ret; - /* - * Try again to invalidate clean pages which might have been cached by - * non-direct readahead, or faulted in by get_user_pages() if the source - * of the write was an mmap'ed region of the file we're writing. Either - * one is a pretty crazy thing to do, so we don't support it 100%. If - * this invalidation fails, tough, the write still worked... - */ - if (!dio->error && - (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { - ret = invalidate_inode_pages2_range(inode->i_mapping, - iocb->ki_pos >> PAGE_SHIFT, - (iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - } - if (dio->end_io) { ret = dio->end_io(iocb, dio->error ? dio->error : dio->size, @@ -742,12 +729,33 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) if (likely(!ret)) { ret = dio->size; /* check for short read */ - if (iocb->ki_pos + ret > dio->i_size && + if (offset + ret > dio->i_size && !(dio->flags & IOMAP_DIO_WRITE)) - ret = dio->i_size - iocb->ki_pos; + ret = dio->i_size - offset; iocb->ki_pos += ret; } + /* + * Try again to invalidate clean pages which might have been cached by + * non-direct readahead, or faulted in by get_user_pages() if the source + * of the write was an mmap'ed region of the file we're writing. Either + * one is a pretty crazy thing to do, so we don't support it 100%. If + * this invalidation fails, tough, the write still worked... + * + * And this page cache invalidation has to be after dio->end_io(), as + * some filesystems convert unwritten extents to real allocations in + * end_io() when necessary, otherwise a racing buffer read would cache + * zeros from unwritten extents. + */ + if (!dio->error && + (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { + int err; + err = invalidate_inode_pages2_range(inode->i_mapping, + offset >> PAGE_SHIFT, + (offset + dio->size - 1) >> PAGE_SHIFT); + WARN_ON_ONCE(err); + } + inode_dio_end(file_inode(iocb->ki_filp)); kfree(dio); @@ -823,7 +831,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, bio = bio_alloc(GFP_KERNEL, 1); bio_set_dev(bio, iomap->bdev); bio->bi_iter.bi_sector = - iomap->blkno + ((pos - iomap->offset) >> 9); + (iomap->addr + pos - iomap->offset) >> 9; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; @@ -902,7 +910,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, bio = bio_alloc(GFP_KERNEL, nr_pages); bio_set_dev(bio, iomap->bdev); bio->bi_iter.bi_sector = - iomap->blkno + ((pos - iomap->offset) >> 9); + (iomap->addr + pos - iomap->offset) >> 9; bio->bi_write_hint = dio->iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; diff --git a/fs/isofs/Makefile b/fs/isofs/Makefile index bf162f0942d5..6498fd2b0f60 100644 --- a/fs/isofs/Makefile +++ b/fs/isofs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the Linux isofs filesystem routines. # diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index e7599615e4e0..947ce22f5b3c 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/isofs/dir.c * diff --git a/fs/isofs/export.c b/fs/isofs/export.c index 0c5f721b4e91..85a9093769a9 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/isofs/export.c * diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index 133a456b0425..57d4c3e2e94a 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c index a048de81c093..be8b6a9d0b92 100644 --- a/fs/isofs/joliet.c +++ b/fs/isofs/joliet.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/isofs/joliet.c * diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index aee592767f1d..cac468f04820 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/isofs/namei.c * diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 0ec137310320..94ef92fe806c 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/isofs/rock.c * diff --git a/fs/isofs/rock.h b/fs/isofs/rock.h index ed09e2b08637..ef03625431bb 100644 --- a/fs/isofs/rock.h +++ b/fs/isofs/rock.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * These structs are used by the system-use-sharing protocol, in which the * Rock Ridge extensions are embedded. It is quite possible that other diff --git a/fs/isofs/util.c b/fs/isofs/util.c index 005a15cfd30a..42544bf0e222 100644 --- a/fs/isofs/util.c +++ b/fs/isofs/util.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/isofs/util.c */ diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 7d5ef3bf3f3e..d2a85c9720e9 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -165,11 +165,11 @@ static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) * Helper function used to manage commit timeouts */ -static void commit_timeout(unsigned long __data) +static void commit_timeout(struct timer_list *t) { - struct task_struct * p = (struct task_struct *) __data; + journal_t *journal = from_timer(journal, t, j_commit_timer); - wake_up_process(p); + wake_up_process(journal->j_task); } /* @@ -197,8 +197,7 @@ static int kjournald2(void *arg) * Set up an interval timer which can be used to trigger a commit wakeup * after the commit interval expires */ - setup_timer(&journal->j_commit_timer, commit_timeout, - (unsigned long)current); + timer_setup(&journal->j_commit_timer, commit_timeout, 0); set_freezable(); diff --git a/fs/jffs2/Makefile b/fs/jffs2/Makefile index 60e5d49ca03e..5294969d5bf9 100644 --- a/fs/jffs2/Makefile +++ b/fs/jffs2/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the Linux Journalling Flash File System v2 (JFFS2) # diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile index d20d4737b3ef..285ec189ed5c 100644 --- a/fs/jfs/Makefile +++ b/fs/jfs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the Linux JFS filesystem routines. # diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 5c5ac5b3aec3..ba34dae8bd9f 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/jfs/ioctl.c * diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 1c4b9ad4d7ab..1a3b0cc22ad3 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -663,6 +663,8 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, } else { INCREMENT(mpStat.pagealloc); mp = alloc_metapage(GFP_NOFS); + if (!mp) + goto unlock; mp->page = page; mp->sb = inode->i_sb; mp->flag = 0; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 2f14677169c3..2f7b3af5b8b7 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -853,7 +853,6 @@ out: } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); - inode->i_version++; inode->i_mtime = inode->i_ctime = current_time(inode); mark_inode_dirty(inode); inode_unlock(inode); diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile index 9b320cc2a8cf..6d5e83ed4476 100644 --- a/fs/lockd/Makefile +++ b/fs/lockd/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the linux lock manager stuff # diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index c349fc0f9b80..00d5ef5f99f7 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/lockd/clnt4xdr.c * diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 3b4724a6c4ee..2c6176387143 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/lockd/clntxdr.c * diff --git a/fs/lockd/host.c b/fs/lockd/host.c index d716c9993a26..0d4e590e0549 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/lockd/host.c * diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 9d8166c39c54..9fbbd11f9ecb 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/lockd/mon.c * diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index fb8cac88251a..5bec78c8e431 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LOCKD_NETNS_H__ #define __LOCKD_NETNS_H__ diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c index 8f72cb237ef3..ca9228a56d65 100644 --- a/fs/lockd/procfs.c +++ b/fs/lockd/procfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Procfs support for lockd * diff --git a/fs/lockd/procfs.h b/fs/lockd/procfs.h index 184a15edd18d..ba9a82f4ce28 100644 --- a/fs/lockd/procfs.h +++ b/fs/lockd/procfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Procfs support for lockd * diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 82925f17ec45..1bddf70d9656 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/lockd/svc4proc.c * diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 3507c80d1d4b..3701bccab478 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/lockd/svclock.c * diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 07915162581d..0d670c5c378f 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/lockd/svcproc.c * diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c index b0ae07008700..ade4931b2da2 100644 --- a/fs/lockd/svcshare.c +++ b/fs/lockd/svcshare.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/lockd/svcshare.c * diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 442bbd0b0b29..7147e4aebecc 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/lockd/xdr.c * diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 2a0cd5679c49..7ed9edf9aed4 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/lockd/xdr4.c * diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index c2c3fd3277b5..f4e5e5181a14 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/minix/bitmap.c * diff --git a/fs/minix/dir.c b/fs/minix/dir.c index baa9721f1299..dcfe5b25378b 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/minix/dir.c * diff --git a/fs/minix/file.c b/fs/minix/file.c index a6a4797aa0d4..c50b0a20fcd9 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/minix/file.c * diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c index 2d1ca08870f7..043c3fdbc8e7 100644 --- a/fs/minix/itree_common.c +++ b/fs/minix/itree_common.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Generic part */ typedef struct { diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c index 46ca39d6c735..046cc96ee7ad 100644 --- a/fs/minix/itree_v1.c +++ b/fs/minix/itree_v1.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/buffer_head.h> #include <linux/slab.h> #include "minix.h" diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c index 1ee101352586..f7fc7ecccccc 100644 --- a/fs/minix/itree_v2.c +++ b/fs/minix/itree_v2.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/buffer_head.h> #include "minix.h" diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 663d66138d06..df081e8afcc3 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef FS_MINIX_H #define FS_MINIX_H diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 1e0f11f5dac9..ccf0f00030bf 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/minix/namei.c * diff --git a/fs/mount.h b/fs/mount.h index 6790767d1883..f39bc9da4d73 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/poll.h> diff --git a/fs/mpage.c b/fs/mpage.c index 37bb77c1302c..b7e7f570733a 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/mpage.c * @@ -468,6 +469,16 @@ static void clean_buffers(struct page *page, unsigned first_unmapped) try_to_free_buffers(page); } +/* + * For situations where we want to clean all buffers attached to a page. + * We don't need to calculate how many buffers are attached to the page, + * we just need to specify a number larger than the maximum number of buffers. + */ +void clean_page_buffers(struct page *page) +{ + clean_buffers(page, ~0U); +} + static int __mpage_writepage(struct page *page, struct writeback_control *wbc, void *data) { @@ -605,10 +616,8 @@ alloc_new: if (bio == NULL) { if (first_unmapped == blocks_per_page) { if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9), - page, wbc)) { - clean_buffers(page, first_unmapped); + page, wbc)) goto out; - } } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH); diff --git a/fs/namei.c b/fs/namei.c index c75ea03ca147..5424b10cfdc4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/namei.c * @@ -1209,7 +1210,7 @@ static int follow_managed(struct path *path, struct nameidata *nd) /* Given that we're not holding a lock here, we retain the value in a * local variable for each dentry as we look at it so that we don't see * the components of that value change under us */ - while (managed = ACCESS_ONCE(path->dentry->d_flags), + while (managed = READ_ONCE(path->dentry->d_flags), managed &= DCACHE_MANAGED_DENTRY, unlikely(managed != 0)) { /* Allow the filesystem to manage the transit without i_mutex @@ -1394,7 +1395,7 @@ int follow_down(struct path *path) unsigned managed; int ret; - while (managed = ACCESS_ONCE(path->dentry->d_flags), + while (managed = READ_ONCE(path->dentry->d_flags), unlikely(managed & DCACHE_MANAGED_DENTRY)) { /* Allow the filesystem to manage the transit without i_mutex * being held. diff --git a/fs/namespace.c b/fs/namespace.c index 3b601f115b6c..e158ec6b527b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -353,7 +353,7 @@ int __mnt_want_write(struct vfsmount *m) * incremented count after it has set MNT_WRITE_HOLD. */ smp_mb(); - while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) + while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) cpu_relax(); /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will @@ -2825,7 +2825,8 @@ long do_mount(const char *dev_name, const char __user *dir_name, SB_MANDLOCK | SB_DIRSYNC | SB_SILENT | - SB_POSIXACL); + SB_POSIXACL | + SB_I_VERSION); if (flags & MS_REMOUNT) retval = do_remount(&path, flags, sb_flags, mnt_flags, diff --git a/fs/ncpfs/Makefile b/fs/ncpfs/Makefile index c66af563f2ce..66fe5f878817 100644 --- a/fs/ncpfs/Makefile +++ b/fs/ncpfs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the linux ncp filesystem routines. # diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 088f52484d6e..0c57c5c5d40a 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * dir.c * @@ -119,10 +120,6 @@ static inline int ncp_case_sensitive(const struct inode *i) /* * Note: leave the hash unchanged if the directory * is case-sensitive. - * - * Accessing the parent inode can be racy under RCU pathwalking. - * Use ACCESS_ONCE() to make sure we use _one_ particular inode, - * the callers will handle races. */ static int ncp_hash_dentry(const struct dentry *dentry, struct qstr *this) @@ -147,11 +144,6 @@ ncp_hash_dentry(const struct dentry *dentry, struct qstr *this) return 0; } -/* - * Accessing the parent inode can be racy under RCU pathwalking. - * Use ACCESS_ONCE() to make sure we use _one_ particular inode, - * the callers will handle races. - */ static int ncp_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index a06c07619ee6..8f8cc0334ddd 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * file.c * diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c index 344889cd120e..5c941bef14c4 100644 --- a/fs/ncpfs/getopt.c +++ b/fs/ncpfs/getopt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * getopt.c */ diff --git a/fs/ncpfs/getopt.h b/fs/ncpfs/getopt.h index cccc007dcaf9..30f0da317670 100644 --- a/fs/ncpfs/getopt.h +++ b/fs/ncpfs/getopt.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_GETOPT_H #define _LINUX_GETOPT_H diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 6d0f14c86099..129f1937fa2c 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -618,7 +618,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) server->tx.creq = NULL; server->rcv.creq = NULL; - init_timer(&server->timeout_tm); + timer_setup(&server->timeout_tm, ncpdgram_timeout_call, 0); #undef NCP_PACKET_SIZE #define NCP_PACKET_SIZE 131072 error = -ENOMEM; @@ -650,8 +650,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) } else { INIT_WORK(&server->rcv.tq, ncpdgram_rcv_proc); INIT_WORK(&server->timeout_tq, ncpdgram_timeout_proc); - server->timeout_tm.data = (unsigned long)server; - server->timeout_tm.function = ncpdgram_timeout_call; } release_sock(sock->sk); diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 12550c2320cc..d378b98cd7b6 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ioctl.c * diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 6719c0be674d..a5c5cf2ff007 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * mmap.c * diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h index b9f69e1b1f43..bdd262b6c198 100644 --- a/fs/ncpfs/ncp_fs.h +++ b/fs/ncpfs/ncp_fs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/ncp_fs.h> #include "ncp_fs_i.h" #include "ncp_fs_sb.h" diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h index c4794504f843..3432bafb53a5 100644 --- a/fs/ncpfs/ncp_fs_i.h +++ b/fs/ncpfs/ncp_fs_i.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * ncp_fs_i.h * diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h index 366fd63cc506..f06cde4adf71 100644 --- a/fs/ncpfs/ncp_fs_sb.h +++ b/fs/ncpfs/ncp_fs_sb.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * ncp_fs_sb.h * @@ -149,7 +150,7 @@ extern void ncp_tcp_rcv_proc(struct work_struct *work); extern void ncp_tcp_tx_proc(struct work_struct *work); extern void ncpdgram_rcv_proc(struct work_struct *work); extern void ncpdgram_timeout_proc(struct work_struct *work); -extern void ncpdgram_timeout_call(unsigned long server); +extern void ncpdgram_timeout_call(struct timer_list *t); extern void ncp_tcp_data_ready(struct sock* sk); extern void ncp_tcp_write_space(struct sock* sk); extern void ncp_tcp_error_report(struct sock* sk); diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c index 88dbbc9fcf4d..804adfebba2f 100644 --- a/fs/ncpfs/ncplib_kernel.c +++ b/fs/ncpfs/ncplib_kernel.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ncplib_kernel.c * diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index b4c87cfcee95..aaae8aa9bf7d 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * ncplib_kernel.h * diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c index 08907599dcd2..8085b1a3ba47 100644 --- a/fs/ncpfs/ncpsign_kernel.c +++ b/fs/ncpfs/ncpsign_kernel.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ncpsign_kernel.c * diff --git a/fs/ncpfs/ncpsign_kernel.h b/fs/ncpfs/ncpsign_kernel.h index d9a1438bb1f6..57ff0a0650b8 100644 --- a/fs/ncpfs/ncpsign_kernel.h +++ b/fs/ncpfs/ncpsign_kernel.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * ncpsign_kernel.h * diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c index 98b6db0ed63e..efb176b1751a 100644 --- a/fs/ncpfs/sock.c +++ b/fs/ncpfs/sock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ncpfs/sock.c * @@ -116,10 +117,10 @@ void ncp_tcp_write_space(struct sock *sk) schedule_work(&server->tx.tq); } -void ncpdgram_timeout_call(unsigned long v) +void ncpdgram_timeout_call(struct timer_list *t) { - struct ncp_server *server = (void*)v; - + struct ncp_server *server = from_timer(server, t, timeout_tm); + schedule_work(&server->timeout_tq); } diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c index a6d26b46fc05..b6e16da4837a 100644 --- a/fs/ncpfs/symlink.c +++ b/fs/ncpfs/symlink.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ncpfs/symlink.c * diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 1fb118902d57..c587e3c4c6a6 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the Linux nfs filesystem routines. # diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index a69ef4e9c24c..95f74bd2c067 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014-2016 Christoph Hellwig. */ diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index c85fbfd2d0d9..7a57ff2528af 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014-2016 Christoph Hellwig. */ diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c index 2ae676f93e6b..b60627bcfc62 100644 --- a/fs/nfs/cache_lib.c +++ b/fs/nfs/cache_lib.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/cache_lib.c * diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h index 4116d2c3f52f..4e6236a86cf7 100644 --- a/fs/nfs/cache_lib.h +++ b/fs/nfs/cache_lib.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Helper routines for the NFS client caches * diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 2cddf7f437e6..cd9d992feb2e 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/callback.c * diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 3dc54d7cb19c..a20a0bce40a4 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/nfs/callback.h * diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 14358de173fb..19151f6c0e97 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/callback_proc.c * diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 681dd642f119..123c069429a7 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/callback_xdr.c * diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index e9d555796873..ddaf2644cf13 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/nfs/delegation.h * diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5ceaeb1f6fb6..f439f1c45008 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1081,7 +1081,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) int error; if (flags & LOOKUP_RCU) { - parent = ACCESS_ONCE(dentry->d_parent); + parent = READ_ONCE(dentry->d_parent); dir = d_inode_rcu(parent); if (!dir) return -ECHILD; @@ -1168,7 +1168,7 @@ out_set_verifier: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_valid: if (flags & LOOKUP_RCU) { - if (parent != ACCESS_ONCE(dentry->d_parent)) + if (parent != READ_ONCE(dentry->d_parent)) return -ECHILD; } else dput(parent); @@ -1582,7 +1582,7 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) struct inode *dir; if (flags & LOOKUP_RCU) { - parent = ACCESS_ONCE(dentry->d_parent); + parent = READ_ONCE(dentry->d_parent); dir = d_inode_rcu(parent); if (!dir) return -ECHILD; @@ -1596,7 +1596,7 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) ret = -ECHILD; if (!(flags & LOOKUP_RCU)) dput(parent); - else if (parent != ACCESS_ONCE(dentry->d_parent)) + else if (parent != READ_ONCE(dentry->d_parent)) return -ECHILD; goto out; } diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index d25f10fb4926..060c658eab66 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/dns_resolve.c * diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h index 2e4f596d2923..576ff4b54c82 100644 --- a/fs/nfs/dns_resolve.h +++ b/fs/nfs/dns_resolve.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Resolve DNS hostnames into valid ip addresses */ diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 249cb96cc5b5..83fd09fc8f77 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2015, Primary Data, Inc. All rights reserved. * diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 98b34c9b0564..679cb087ef3f 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * NFSv4 flexfile layout driver data structures. * diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index f32c58bbe556..d62279d3fc5d 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Device operations for the pnfs nfs4 file layout driver. * diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5bdf952f414b..f9a4a5524bd5 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * NFS internal definitions */ diff --git a/fs/nfs/io.c b/fs/nfs/io.c index 1fc5d1ce327e..20fef85d2bb1 100644 --- a/fs/nfs/io.c +++ b/fs/nfs/io.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2016 Trond Myklebust * diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index 0cb806fbd4c4..2ddaab1ac653 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/nfs/iostat.h * diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 60bad882c123..d979ff4fee7e 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * In-kernel MOUNT protocol client * diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index 5fbd2bde91ba..fc9978c58265 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * NFS-private data for each "struct net". Accessed with net_generic(). */ diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h index 43679df56cd0..5ba00610aede 100644 --- a/fs/nfs/nfs.h +++ b/fs/nfs/nfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2012 Netapp, Inc. All rights reserved. * diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index fe68dabfbde6..85e4b4a233f9 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/nfs2xdr.c * diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h index e134d6548ab7..f82e11c4cb56 100644 --- a/fs/nfs/nfs3_fs.h +++ b/fs/nfs/nfs3_fs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2014 Anna Schumaker. * diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 720d92f5abfb..7173a4ee862c 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/gfp.h> #include <linux/nfs.h> diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index d1e87ec0df84..bc673fb47fb3 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/nfs3proc.c * diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index e82c9e553224..6cd33bd5da87 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/nfs3xdr.c * diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index b6cd15314bab..19ec38f85ce0 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com> */ diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6c2db51e67a7..9c374441f660 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com> */ diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 5ee1b0f0d904..5966e1e7b1f5 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com> */ diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index ac4f10b7f6c1..dcfcf7fd7438 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/nfs/nfs4_fs.h * diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 0efba77789b9..626d1382002e 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/file.c * diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c index ac8406018962..1a69479a3a59 100644 --- a/fs/nfs/nfs4getroot.c +++ b/fs/nfs/nfs4getroot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 7d531da1bae3..8c3f327d858d 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/nfs4namespace.c * diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index dfae4880eacb..3c550f297561 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/nfs/nfs4session.h * diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 8693d77c45ea..0d91d84e5822 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/nfs4sysctl.c * diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c index 2850bce19244..e9fb3e50a999 100644 --- a/fs/nfs/nfs4trace.c +++ b/fs/nfs/nfs4trace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> */ diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index be1da19c65d6..e7c6275519b0 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> */ diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 89a15dbe5efc..effaa4247b91 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1995, 1996 Gero Kuhlmann <gero@gkminix.han.de> * diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c index c74f7af23d77..b60d5fbd7727 100644 --- a/fs/nfs/nfstrace.c +++ b/fs/nfs/nfstrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> */ diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 551711042ba4..093290c42d7c 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> */ diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 7962e49097c3..f7fd9192d4bc 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/proc.c * diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 5a1d0ded8979..06eb44b47885 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/symlink.c * diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index bb6ed810fa6f..7aea195ddb35 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/sysctl.c * diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index e3949d93085c..630b4a3c1a93 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/unlink.c * diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 5f5d3a76980c..2bfb58eefad1 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the Linux nfs server # diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 62469c60be23..697f8ae7792d 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ #include <linux/sched.h> diff --git a/fs/nfsd/auth.h b/fs/nfsd/auth.h index 53325a12ba62..dbd66424f600 100644 --- a/fs/nfsd/auth.h +++ b/fs/nfsd/auth.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * nfsd-specific authentication stuff. * diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index c862c2489df0..70b8bf781fce 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014-2016 Christoph Hellwig. */ @@ -65,7 +66,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, bex->es = PNFS_BLOCK_READ_DATA; else bex->es = PNFS_BLOCK_READWRITE_DATA; - bex->soff = (iomap.blkno << 9); + bex->soff = iomap.addr; break; case IOMAP_UNWRITTEN: if (seg->iomode & IOMODE_RW) { @@ -78,7 +79,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, } bex->es = PNFS_BLOCK_INVALID_DATA; - bex->soff = (iomap.blkno << 9); + bex->soff = iomap.addr; break; } /*FALLTHRU*/ diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index ac6f54546fdd..442543304930 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014-2016 Christoph Hellwig. */ diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h index 397bc7563a49..bc5166bfe46b 100644 --- a/fs/nfsd/blocklayoutxdr.h +++ b/fs/nfsd/blocklayoutxdr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NFSD_BLOCKLAYOUTXDR_H #define _NFSD_BLOCKLAYOUTXDR_H 1 diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index dd96a3830004..046b3f048757 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Request reply cache. This was heavily inspired by the * implementation in 4.3BSD/4.4BSD. diff --git a/fs/nfsd/current_stateid.h b/fs/nfsd/current_stateid.h index 34075cee573a..c28540d86742 100644 --- a/fs/nfsd/current_stateid.h +++ b/fs/nfsd/current_stateid.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NFSD4_CURRENT_STATE_H #define _NFSD4_CURRENT_STATE_H diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 3bc08c394a3f..46b48dbbdd32 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NFS exporting and validation. * diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index 730f15eeb7ed..c8b74126ddaa 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> */ diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index 34c1c449fddf..6dfede6d172a 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com> * diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c index b67287383010..db7ef07ae50c 100644 --- a/fs/nfsd/flexfilelayout.c +++ b/fs/nfsd/flexfilelayout.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com> * diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c index 5e3fd7fc1a9f..e81d2a5cf381 100644 --- a/fs/nfsd/flexfilelayoutxdr.c +++ b/fs/nfsd/flexfilelayoutxdr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com> */ diff --git a/fs/nfsd/flexfilelayoutxdr.h b/fs/nfsd/flexfilelayoutxdr.h index 467defd4e563..8e195aeca023 100644 --- a/fs/nfsd/flexfilelayoutxdr.h +++ b/fs/nfsd/flexfilelayoutxdr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com> */ diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 1a03bc3059e8..3f5b3d7b62b7 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This file contains all the stubs needed when communicating with lockd. * This level of indirection is necessary so we can run nfsd+lockd without diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 6276ec8608b0..cbab1d2d8a75 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Process version 2 NFSACL requests. * diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 01976529f042..13bca4a2f89d 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Process version 3 NFSACL requests. * diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 2cb56a0d6625..1d0ce3c57d93 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Process version 3 NFS requests. * diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index bf444b664011..f38acd905441 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * XDR support for nfsd/protocol version 3. * diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index e122da696f1b..ea45d954e8d7 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014 Christoph Hellwig. */ diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 96fd15979cbd..334f2ad60704 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Request reply cache. This is currently a global cache, but this may * change in the future and be a per-client cache. diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index b9c538ab7a59..3fce905d0365 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Hodge-podge collection of knfsd-related stuff. * I will sort this out later. diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index cfe7500d5847..8aa011820c4a 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NFS server file handle treatment. * diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index e47cf6c2ac28..43f31cf49bae 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> * diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 5076ae2b8258..43c0419b8ddb 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Process version 2 NFS requests. * diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 7e3af3ef0917..e02bd2783124 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Central processing for nfsd. * diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index e4da2717982d..644a0342f0e0 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * XDR support for nfsd * diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index d27a5aa60022..4f4282d4eeca 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FS_NFSD_PNFS_H #define _FS_NFSD_PNFS_H 1 diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index d97338bb6a39..9bce3b913189 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * procfs-based user access to knfsd statistics * diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index a5c944b771c6..b23fdac69820 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Statistics for NFS server. * diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 3287041905da..8b2f1d92c579 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2014 Christoph Hellwig. */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index bc69d40c4e8b..a3c9bfa77def 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * File operations used by nfsd. Some of these have been ripped from * other parts of the kernel because they weren't exported, others diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 1bbdccecbf3d..be6d8e00453f 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> */ diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index 457ce45e5084..2f4f22e6b8cb 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* XDR types for nfsd. This is mainly a typing exercise. */ #ifndef LINUX_NFSD_H diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 80d7da620e91..056bf8a7364e 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * XDR types for NFSv3 in nfsd. * diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index 49b719dfef95..517239af0302 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #define NFS4_MAXTAGLEN 20 #define NFS4_enc_cb_null_sz 0 diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile index fc603e0431bb..43b60b8a4d07 100644 --- a/fs/nilfs2/Makefile +++ b/fs/nilfs2/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_NILFS2_FS) += nilfs2.o nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \ btnode.o bmap.o btree.o direct.o dat.o recovery.o \ diff --git a/fs/nilfs2/export.h b/fs/nilfs2/export.h index 00107fdb9343..d29fd837c42c 100644 --- a/fs/nilfs2/export.h +++ b/fs/nilfs2/export.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef NILFS_EXPORT_H #define NILFS_EXPORT_H diff --git a/fs/nls/Makefile b/fs/nls/Makefile index 8ae37c1b5249..ac54db297128 100644 --- a/fs/nls/Makefile +++ b/fs/nls/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for native language support # diff --git a/fs/notify/Makefile b/fs/notify/Makefile index 3e969ae91b60..63a4b8828df4 100644 --- a/fs/notify/Makefile +++ b/fs/notify/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o mark.o \ fdinfo.o diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 63f56b007280..54cf2d21b547 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fanotify.h> #include <linux/fdtable.h> #include <linux/fsnotify_backend.h> diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index dc219cf07a6a..256d9d1ddea9 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/fsnotify_backend.h> #include <linux/path.h> #include <linux/slab.h> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index a434de023c49..55499f5a1c96 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fanotify.h> #include <linux/fcntl.h> #include <linux/file.h> diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index dd63aa9a6f9a..517f88c1dbe5 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/file.h> #include <linux/fs.h> #include <linux/fsnotify_backend.h> diff --git a/fs/notify/fdinfo.h b/fs/notify/fdinfo.h index 9664c4904d6b..5c9937e02e21 100644 --- a/fs/notify/fdinfo.h +++ b/fs/notify/fdinfo.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __FSNOTIFY_FDINFO_H__ #define __FSNOTIFY_FDINFO_H__ diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index bf012e8ecd14..60f365dc1408 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __FS_NOTIFY_FSNOTIFY_H_ #define __FS_NOTIFY_FSNOTIFY_H_ diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 9ff67b61da8a..c00d2caca894 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/fsnotify_backend.h> #include <linux/inotify.h> #include <linux/slab.h> /* struct kmem_cache */ diff --git a/fs/nsfs.c b/fs/nsfs.c index 08127a2b8559..ef243e14b6eb 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/mount.h> #include <linux/file.h> #include <linux/fs.h> diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile index 2ff263e6d363..3e736572ed00 100644 --- a/fs/ntfs/Makefile +++ b/fs/ntfs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # Rules for making the NTFS driver. obj-$(CONFIG_NTFS_FS) += ntfs.o diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 4342c7ee7d20..99ee093182cb 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 ccflags-y := -Ifs/ocfs2 obj-$(CONFIG_OCFS2_FS) += \ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index a177eae3aa1a..addd7c5f2d3e 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7304,13 +7304,24 @@ out: static int ocfs2_trim_extent(struct super_block *sb, struct ocfs2_group_desc *gd, - u32 start, u32 count) + u64 group, u32 start, u32 count) { u64 discard, bcount; + struct ocfs2_super *osb = OCFS2_SB(sb); bcount = ocfs2_clusters_to_blocks(sb, count); - discard = le64_to_cpu(gd->bg_blkno) + - ocfs2_clusters_to_blocks(sb, start); + discard = ocfs2_clusters_to_blocks(sb, start); + + /* + * For the first cluster group, the gd->bg_blkno is not at the start + * of the group, but at an offset from the start. If we add it while + * calculating discard for first group, we will wrongly start fstrim a + * few blocks after the desried start block and the range can cross + * over into the next cluster group. So, add it only if this is not + * the first cluster group. + */ + if (group != osb->first_cluster_group_blkno) + discard += le64_to_cpu(gd->bg_blkno); trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount); @@ -7318,7 +7329,7 @@ static int ocfs2_trim_extent(struct super_block *sb, } static int ocfs2_trim_group(struct super_block *sb, - struct ocfs2_group_desc *gd, + struct ocfs2_group_desc *gd, u64 group, u32 start, u32 max, u32 minbits) { int ret = 0, count = 0, next; @@ -7337,7 +7348,7 @@ static int ocfs2_trim_group(struct super_block *sb, next = ocfs2_find_next_bit(bitmap, max, start); if ((next - start) >= minbits) { - ret = ocfs2_trim_extent(sb, gd, + ret = ocfs2_trim_extent(sb, gd, group, start, next - start); if (ret < 0) { mlog_errno(ret); @@ -7435,7 +7446,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) } gd = (struct ocfs2_group_desc *)gd_bh->b_data; - cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen); + cnt = ocfs2_trim_group(sb, gd, group, + first_bit, last_bit, minlen); brelse(gd_bh); gd_bh = NULL; if (cnt < 0) { diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 4506ec5ec2ea..ab30c005cc4b 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ocfs2/ioctl.c * diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h index 0cd5323bd3f0..9f5e4d95e37f 100644 --- a/fs/ocfs2/ioctl.h +++ b/fs/ocfs2/ioctl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * ioctl.h * diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h index 1274ee0f1fe2..1051507cc684 100644 --- a/fs/ocfs2/mmap.h +++ b/fs/ocfs2/mmap.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef OCFS2_MMAP_H #define OCFS2_MMAP_H diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 0b58abcf1c6d..a0b5d00ef0a9 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM ocfs2 diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index d153e6e31529..ebb5c99f490e 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * quota.h for OCFS2 * diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index c94b6baaa551..b39d14cbfa34 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Implementation of operations over global quota file */ diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index aa700fd10610..16c42ed0dca8 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Implementation of operations over local quota file */ diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c index 83f4e76511c2..7147ba6a6afc 100644 --- a/fs/omfs/bitmap.c +++ b/fs/omfs/bitmap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/fs.h> #include <linux/buffer_head.h> diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h index f0f8bc75e609..4008be73de54 100644 --- a/fs/omfs/omfs.h +++ b/fs/omfs/omfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _OMFS_H #define _OMFS_H diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h index 83a98330ed66..caecb3d5a344 100644 --- a/fs/omfs/omfs_fs.h +++ b/fs/omfs/omfs_fs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _OMFS_FS_H #define _OMFS_FS_H diff --git a/fs/orangefs/Makefile b/fs/orangefs/Makefile index a9d6a968fe6d..9b6c50bb173b 100644 --- a/fs/orangefs/Makefile +++ b/fs/orangefs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the ORANGEFS filesystem. # diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 9108ef433e6d..c2d8233b1e82 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c index 5355efba4bc8..ae782df5c063 100644 --- a/fs/orangefs/dcache.c +++ b/fs/orangefs/dcache.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index 2826859bdc2c..ded456f17de6 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index d327cbd17756..a8cc588d6224 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright 2017 Omnibond Systems, L.L.C. */ diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h index 163001c95501..ea2332e16af9 100644 --- a/fs/orangefs/downcall.h +++ b/fs/orangefs/downcall.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * (C) 2001 Clemson University and The University of Chicago * diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 336ecbf8c268..e4a8e6a7eb17 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 9428ea0aac16..28825a5b6d09 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 478e88bd7f9d..7e9e5d0ea3bc 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 7ef473f3d642..59f444dced9b 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * diff --git a/fs/orangefs/orangefs-bufmap.h b/fs/orangefs/orangefs-bufmap.h index 71f64f4057b5..c2c3c5a0eeab 100644 --- a/fs/orangefs/orangefs-bufmap.h +++ b/fs/orangefs/orangefs-bufmap.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * (C) 2001 Clemson University and The University of Chicago * diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c index aa3830b741c7..3b6982bf6bcf 100644 --- a/fs/orangefs/orangefs-cache.c +++ b/fs/orangefs/orangefs-cache.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * diff --git a/fs/orangefs/orangefs-debug.h b/fs/orangefs/orangefs-debug.h index 387db17cde2b..b6001bb28f5a 100644 --- a/fs/orangefs/orangefs-debug.h +++ b/fs/orangefs/orangefs-debug.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * (C) 2001 Clemson University and The University of Chicago * diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 5f59917fd631..1c59dff530de 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * What: /sys/kernel/debug/orangefs/debug-help * Date: June 2015 diff --git a/fs/orangefs/orangefs-debugfs.h b/fs/orangefs/orangefs-debugfs.h index 803517269ba6..b5fd9cd4960f 100644 --- a/fs/orangefs/orangefs-debugfs.h +++ b/fs/orangefs/orangefs-debugfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ int orangefs_debugfs_init(int); void orangefs_debugfs_cleanup(void); int orangefs_client_debug_init(void); diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h index efe08c763e56..dc6609824965 100644 --- a/fs/orangefs/orangefs-dev-proto.h +++ b/fs/orangefs/orangefs-dev-proto.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * (C) 2001 Clemson University and The University of Chicago * diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index ea0ce507a6ab..004af348fb80 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * (C) 2001 Clemson University and The University of Chicago * diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c index afd2f523b283..079a465796f3 100644 --- a/fs/orangefs/orangefs-sysfs.c +++ b/fs/orangefs/orangefs-sysfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Documentation/ABI/stable/orangefs-sysfs: * diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index aab6f1842963..f82336496311 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h index 48bcc1bbe415..e0bf5e4dce0d 100644 --- a/fs/orangefs/protocol.h +++ b/fs/orangefs/protocol.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/kernel.h> #include <linux/types.h> #include <linux/spinlock_types.h> diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 47f3fb9cbec4..47ebd9bfd1a1 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c index 02b1bbdbcc42..d856cdf91763 100644 --- a/fs/orangefs/symlink.c +++ b/fs/orangefs/symlink.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * diff --git a/fs/orangefs/upcall.h b/fs/orangefs/upcall.h index b8249f8fdd80..16118452aa12 100644 --- a/fs/orangefs/upcall.h +++ b/fs/orangefs/upcall.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * (C) 2001 Clemson University and The University of Chicago * diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c index 61e2ca7fec55..835c6e148afc 100644 --- a/fs/orangefs/waitqueue.c +++ b/fs/orangefs/waitqueue.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * (C) 2011 Omnibond Systems diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 81ac88bb91ff..03bcb871544d 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index a619addecafc..321511ed8c42 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -598,18 +598,30 @@ static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry, return true; } -struct inode *ovl_get_inode(struct dentry *dentry, struct dentry *upperdentry) +struct inode *ovl_get_inode(struct dentry *dentry, struct dentry *upperdentry, + struct dentry *index) { struct dentry *lowerdentry = ovl_dentry_lower(dentry); struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL; struct inode *inode; + /* Already indexed or could be indexed on copy up? */ + bool indexed = (index || (ovl_indexdir(dentry->d_sb) && !upperdentry)); + + if (WARN_ON(upperdentry && indexed && !lowerdentry)) + return ERR_PTR(-EIO); if (!realinode) realinode = d_inode(lowerdentry); - if (!S_ISDIR(realinode->i_mode) && - (upperdentry || (lowerdentry && ovl_indexdir(dentry->d_sb)))) { - struct inode *key = d_inode(lowerdentry ?: upperdentry); + /* + * Copy up origin (lower) may exist for non-indexed upper, but we must + * not use lower as hash key in that case. + * Hash inodes that are or could be indexed by origin inode and + * non-indexed upper inodes that could be hard linked by upper inode. + */ + if (!S_ISDIR(realinode->i_mode) && (upperdentry || indexed)) { + struct inode *key = d_inode(indexed ? lowerdentry : + upperdentry); unsigned int nlink; inode = iget5_locked(dentry->d_sb, (unsigned long) key, diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 654bea1a5ac9..a12dc10bf726 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -405,14 +405,13 @@ int ovl_verify_index(struct dentry *index, struct path *lowerstack, * be treated as stale (i.e. after unlink of the overlay inode). * We don't know the verification rules for directory and whiteout * index entries, because they have not been implemented yet, so return - * EROFS if those entries are found to avoid corrupting an index that - * was created by a newer kernel. + * EINVAL if those entries are found to abort the mount to avoid + * corrupting an index that was created by a newer kernel. */ - err = -EROFS; + err = -EINVAL; if (d_is_dir(index) || ovl_is_whiteout(index)) goto fail; - err = -EINVAL; if (index->d_name.len < sizeof(struct ovl_fh)*2) goto fail; @@ -507,6 +506,10 @@ static struct dentry *ovl_lookup_index(struct dentry *dentry, index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len); if (IS_ERR(index)) { err = PTR_ERR(index); + if (err == -ENOENT) { + index = NULL; + goto out; + } pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%*s, err=%i);\n" "overlayfs: mount with '-o index=off' to disable inodes index.\n", d_inode(origin)->i_ino, name.len, name.name, @@ -516,18 +519,9 @@ static struct dentry *ovl_lookup_index(struct dentry *dentry, inode = d_inode(index); if (d_is_negative(index)) { - if (upper && d_inode(origin)->i_nlink > 1) { - pr_warn_ratelimited("overlayfs: hard link with origin but no index (ino=%lu).\n", - d_inode(origin)->i_ino); - goto fail; - } - - dput(index); - index = NULL; + goto out_dput; } else if (upper && d_inode(upper) != inode) { - pr_warn_ratelimited("overlayfs: wrong index found (index=%pd2, ino=%lu, upper ino=%lu).\n", - index, inode->i_ino, d_inode(upper)->i_ino); - goto fail; + goto out_dput; } else if (ovl_dentry_weird(index) || ovl_is_whiteout(index) || ((inode->i_mode ^ d_inode(origin)->i_mode) & S_IFMT)) { /* @@ -547,6 +541,11 @@ out: kfree(name.name); return index; +out_dput: + dput(index); + index = NULL; + goto out; + fail: dput(index); index = ERR_PTR(-EIO); @@ -635,6 +634,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, } if (d.redirect) { + err = -ENOMEM; upperredirect = kstrdup(d.redirect, GFP_KERNEL); if (!upperredirect) goto out_put_upper; @@ -709,7 +709,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, upperdentry = dget(index); if (upperdentry || ctr) { - inode = ovl_get_inode(dentry, upperdentry); + inode = ovl_get_inode(dentry, upperdentry, index); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_free_oe; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index c706a6f99928..d9a0edd4e57e 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -286,7 +286,8 @@ int ovl_update_time(struct inode *inode, struct timespec *ts, int flags); bool ovl_is_private_xattr(const char *name); struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev); -struct inode *ovl_get_inode(struct dentry *dentry, struct dentry *upperdentry); +struct inode *ovl_get_inode(struct dentry *dentry, struct dentry *upperdentry, + struct dentry *index); static inline void ovl_copyattr(struct inode *from, struct inode *to) { to->i_uid = from->i_uid; diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 25d9b5adcd42..36b49bd09264 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -77,5 +77,5 @@ static inline struct ovl_inode *OVL_I(struct inode *inode) static inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi) { - return lockless_dereference(oi->__upperdentry); + return READ_ONCE(oi->__upperdentry); } diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 0f85ee9c3268..c310e3ff7f3f 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -754,7 +754,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) { struct inode *inode = file_inode(file); - realfile = lockless_dereference(od->upperfile); + realfile = READ_ONCE(od->upperfile); if (!realfile) { struct path upperpath; @@ -1021,13 +1021,12 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, break; } err = ovl_verify_index(index, lowerstack, numlower); - if (err) { - if (err == -EROFS) - break; + /* Cleanup stale and orphan index entries */ + if (err && (err == -ESTALE || err == -ENOENT)) err = ovl_cleanup(dir, index); - if (err) - break; - } + if (err) + break; + dput(index); index = NULL; } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 092d150643c1..f5738e96a052 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -174,6 +174,9 @@ static struct inode *ovl_alloc_inode(struct super_block *sb) { struct ovl_inode *oi = kmem_cache_alloc(ovl_inode_cachep, GFP_KERNEL); + if (!oi) + return NULL; + oi->cache = NULL; oi->redirect = NULL; oi->version = 0; diff --git a/fs/pipe.c b/fs/pipe.c index 97e5be897753..349c9d56d4b3 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/pipe.c * diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 12c6922c913c..f7456c4e7d0f 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the Linux proc filesystem routines. # diff --git a/fs/proc/array.c b/fs/proc/array.c index 77a8eacbe032..6f6fc1672ad1 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/array.c * @@ -137,7 +138,7 @@ static const char * const task_state_array[] = { static inline const char *get_task_state(struct task_struct *tsk) { BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array)); - return task_state_array[__get_task_state(tsk)]; + return task_state_array[task_state_index(tsk)]; } static inline int get_task_umask(struct task_struct *tsk) @@ -453,7 +454,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, cutime = sig->cutime; cstime = sig->cstime; cgtime = sig->cgtime; - rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); + rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); /* add up live thread stats at the group level */ if (whole) { diff --git a/fs/proc/base.c b/fs/proc/base.c index ad3b0762cc3e..9d357b2ea6cb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/base.c * diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index cbd82dff7e81..403cbb12a6e9 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/init.h> #include <linux/proc_fs.h> diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index 06f4d31e0396..e0f867cd8553 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/init.h> #include <linux/proc_fs.h> diff --git a/fs/proc/devices.c b/fs/proc/devices.c index e5709343feb7..2c7f22b14489 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/init.h> #include <linux/proc_fs.h> diff --git a/fs/proc/fd.c b/fs/proc/fd.c index c330495c3115..96fc70225e54 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/sched/signal.h> #include <linux/errno.h> #include <linux/dcache.h> diff --git a/fs/proc/fd.h b/fs/proc/fd.h index 46dafadd0083..f371a602bf58 100644 --- a/fs/proc/fd.h +++ b/fs/proc/fd.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __PROCFS_FD_H__ #define __PROCFS_FD_H__ diff --git a/fs/proc/inode.c b/fs/proc/inode.c index e250910cffc8..225f541f7078 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/inode.c * diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c index a352d5703b41..6a6bee9c603c 100644 --- a/fs/proc/interrupts.c +++ b/fs/proc/interrupts.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/init.h> #include <linux/interrupt.h> diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 45629f4b5402..4bc85cb8be6a 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/proc/kcore.c kernel ELF core dumper * diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index f9387bb7631b..e0f8774acd65 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/kmsg.c * diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 983fce5c2418..9bc5c58c00ee 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/init.h> #include <linux/pid_namespace.h> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index cdd979724c74..6bb20f864259 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 3803b24ca220..59b17e509f46 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/proc_fs.h> #include <linux/nsproxy.h> #include <linux/ptrace.h> diff --git a/fs/proc/page.c b/fs/proc/page.c index 2726536489b1..1491918a33c3 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/bootmem.h> #include <linux/compiler.h> #include <linux/fs.h> diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 8f479229b349..c5cbbdff3c3d 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * /proc/sys support */ diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 901bd06f437d..d0cf1c50bb6c 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * proc_tty.c -- handles /proc/tty * @@ -14,6 +15,7 @@ #include <linux/tty.h> #include <linux/seq_file.h> #include <linux/bitops.h> +#include "internal.h" /* * The /proc/tty directory inodes... @@ -164,7 +166,7 @@ void proc_tty_unregister_driver(struct tty_driver *driver) if (!ent) return; - remove_proc_entry(driver->driver_name, proc_tty_driver); + remove_proc_entry(ent->name, proc_tty_driver); driver->proc_entry = NULL; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 926fb27f4ca2..4e42aba97f2e 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/root.c * diff --git a/fs/proc/self.c b/fs/proc/self.c index 39857f6db5cf..31326bb23b8b 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/sched.h> #include <linux/slab.h> #include <linux/pid_namespace.h> diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index ad8a77f94beb..24072cc06e65 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> #include <linux/kernel_stat.h> #include <linux/proc_fs.h> diff --git a/fs/proc/stat.c b/fs/proc/stat.c index bd4e55f4aa20..59749dfaef67 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/cpumask.h> #include <linux/fs.h> #include <linux/init.h> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5589b4bd4b85..6744bd706ecf 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/vmacache.h> #include <linux/hugetlb.h> @@ -1310,13 +1311,15 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, pmd_t pmd = *pmdp; struct page *page = NULL; - if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd)) + if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; if (pmd_present(pmd)) { page = pmd_page(pmd); flags |= PM_PRESENT; + if (pmd_soft_dirty(pmd)) + flags |= PM_SOFT_DIRTY; if (pm->show_pfn) frame = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); @@ -1328,6 +1331,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); flags |= PM_SWAP; + if (pmd_swp_soft_dirty(pmd)) + flags |= PM_SOFT_DIRTY; VM_BUG_ON(!is_pmd_migration_entry(pmd)); page = migration_entry_to_page(entry); } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index b00b766098fa..5b62f57bd9bc 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/file.h> diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 20614b62a9b7..b813e3b529f2 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/sched.h> #include <linux/slab.h> #include <linux/pid_namespace.h> diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 7981c4ffe787..95a708d83721 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/init.h> #include <linux/proc_fs.h> diff --git a/fs/proc/version.c b/fs/proc/version.c index d2154eb6d78f..94901e8e700d 100644 --- a/fs/proc/version.c +++ b/fs/proc/version.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 99dff222fe67..7b635d173213 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/proc_namespace.c - handling of /proc/<pid>/{mounts,mountinfo,mountstats} * @@ -27,7 +28,7 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) poll_wait(file, &p->ns->poll, wait); - event = ACCESS_ONCE(ns->event); + event = READ_ONCE(ns->event); if (m->poll_event != event) { m->poll_event = event; res |= POLLERR | POLLPRI; diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile index b8803cc07fce..967b5891f325 100644 --- a/fs/pstore/Makefile +++ b/fs/pstore/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the linux pstorefs routines. # diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 7f4e48c8d188..c029314478fa 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __PSTORE_INTERNAL_H__ #define __PSTORE_INTERNAL_H__ diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 2b21d180157c..086e491faf04 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -62,7 +62,7 @@ MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content " static int pstore_new_entry; static void pstore_timefunc(unsigned long); -static DEFINE_TIMER(pstore_timer, pstore_timefunc, 0, 0); +static DEFINE_TIMER(pstore_timer, pstore_timefunc); static void pstore_dowork(struct work_struct *); static DECLARE_WORK(pstore_work, pstore_dowork); @@ -482,10 +482,7 @@ void pstore_record_init(struct pstore_record *record, record->psi = psinfo; /* Report zeroed timestamp if called before timekeeping has resumed. */ - if (__getnstimeofday(&record->time)) { - record->time.tv_sec = 0; - record->time.tv_nsec = 0; - } + record->time = ns_to_timespec(ktime_get_real_fast_ns()); } /* diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c index 76a7a697b778..163afc4ba4b2 100644 --- a/fs/qnx4/bitmap.c +++ b/fs/qnx4/bitmap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * QNX4 file system, Linux implementation. * diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index 781056a0480f..a6ee23aadd28 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * QNX4 file system, Linux implementation. * diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c index e62c8183777a..eca27878079d 100644 --- a/fs/qnx4/namei.c +++ b/fs/qnx4/namei.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * QNX4 file system, Linux implementation. * diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h index c9b1be2c164d..6283705466a4 100644 --- a/fs/qnx4/qnx4.h +++ b/fs/qnx4/qnx4.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/fs.h> #include <linux/qnx4_fs.h> diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index 27637e0bdc9f..c1cfb8a19e9d 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * QNX6 file system, Linux implementation. * diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c index 6c1a323137dd..72c2770830be 100644 --- a/fs/qnx6/namei.c +++ b/fs/qnx6/namei.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * QNX6 file system, Linux implementation. * diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h index f23b5c4a66ad..34a6b126a3a9 100644 --- a/fs/qnx6/qnx6.h +++ b/fs/qnx6/qnx6.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * QNX6 file system, Linux implementation. * diff --git a/fs/qnx6/super_mmi.c b/fs/qnx6/super_mmi.c index 62aaf3e3126a..d282c2c73404 100644 --- a/fs/qnx6/super_mmi.c +++ b/fs/qnx6/super_mmi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * QNX6 file system, Linux implementation. * diff --git a/fs/quota/Makefile b/fs/quota/Makefile index c66c37cdaa39..f2b49d0f0287 100644 --- a/fs/quota/Makefile +++ b/fs/quota/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_QUOTA) += dquot.o obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o diff --git a/fs/quota/compat.c b/fs/quota/compat.c index fb1892fe3e56..779caed4f078 100644 --- a/fs/quota/compat.c +++ b/fs/quota/compat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/syscalls.h> #include <linux/compat.h> diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 50b0556a124f..9f78b5015f2e 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Implementation of the diskquota system for the LINUX operating system. QUOTA * is implemented using the BSD system call interface as the means of @@ -1297,21 +1298,18 @@ static int dquot_add_space(struct dquot *dquot, qsize_t space, spin_lock(&dquot->dq_dqb_lock); if (!sb_has_quota_limits_enabled(sb, dquot->dq_id.type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) - goto add; + goto finish; tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace + space + rsv_space; - if (flags & DQUOT_SPACE_NOFAIL) - goto add; - if (dquot->dq_dqb.dqb_bhardlimit && tspace > dquot->dq_dqb.dqb_bhardlimit && !ignore_hardlimit(dquot)) { if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN); ret = -EDQUOT; - goto out; + goto finish; } if (dquot->dq_dqb.dqb_bsoftlimit && @@ -1322,7 +1320,7 @@ static int dquot_add_space(struct dquot *dquot, qsize_t space, if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN); ret = -EDQUOT; - goto out; + goto finish; } if (dquot->dq_dqb.dqb_bsoftlimit && @@ -1338,13 +1336,21 @@ static int dquot_add_space(struct dquot *dquot, qsize_t space, * be always printed */ ret = -EDQUOT; - goto out; + goto finish; } } -add: - dquot->dq_dqb.dqb_rsvspace += rsv_space; - dquot->dq_dqb.dqb_curspace += space; -out: +finish: + /* + * We have to be careful and go through warning generation & grace time + * setting even if DQUOT_SPACE_NOFAIL is set. That's why we check it + * only here... + */ + if (flags & DQUOT_SPACE_NOFAIL) + ret = 0; + if (!ret) { + dquot->dq_dqb.dqb_rsvspace += rsv_space; + dquot->dq_dqb.dqb_curspace += space; + } spin_unlock(&dquot->dq_dqb_lock); return ret; } diff --git a/fs/quota/kqid.c b/fs/quota/kqid.c index ebc5e6285800..f814fa90af38 100644 --- a/fs/quota/kqid.c +++ b/fs/quota/kqid.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/quota.h> #include <linux/export.h> diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index e99b1a72d9a7..95acdae391b4 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/cred.h> #include <linux/init.h> #include <linux/kernel.h> diff --git a/fs/quota/quota.c b/fs/quota/quota.c index a9c5dfe6b83e..43612e2a73af 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Quota code necessary even when VFS quota support is not compiled * into the kernel. The interesting stuff is over in dquot.c, here diff --git a/fs/quota/quota_tree.h b/fs/quota/quota_tree.h index a1ab8db81a51..31cf27e0e9e0 100644 --- a/fs/quota/quota_tree.h +++ b/fs/quota/quota_tree.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions of structures for vfsv0 quota format */ diff --git a/fs/quota/quotaio_v1.h b/fs/quota/quotaio_v1.h index 746654b5de70..bd11e2c08119 100644 --- a/fs/quota/quotaio_v1.h +++ b/fs/quota/quotaio_v1.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_QUOTAIO_V1_H #define _LINUX_QUOTAIO_V1_H diff --git a/fs/quota/quotaio_v2.h b/fs/quota/quotaio_v2.h index 4e95430093d9..43cf0f0e2902 100644 --- a/fs/quota/quotaio_v2.h +++ b/fs/quota/quotaio_v2.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions of structures for vfsv0 quota format */ diff --git a/fs/read_write.c b/fs/read_write.c index f0d4b16873e8..0046d72efe94 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/read_write.c * diff --git a/fs/readdir.c b/fs/readdir.c index 89659549c09d..1b83b0ad183b 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/readdir.c * @@ -36,13 +37,12 @@ int iterate_dir(struct file *file, struct dir_context *ctx) if (res) goto out; - if (shared) { - inode_lock_shared(inode); - } else { + if (shared) + res = down_read_killable(&inode->i_rwsem); + else res = down_write_killable(&inode->i_rwsem); - if (res) - goto out; - } + if (res) + goto out; res = -ENOENT; if (!IS_DEADDIR(inode)) { diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile index 3c3b00165114..a39a562c1c10 100644 --- a/fs/reiserfs/Makefile +++ b/fs/reiserfs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the linux reiser-filesystem routines. # diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h index 4a211f5b34b8..0c1c847f992f 100644 --- a/fs/reiserfs/acl.h +++ b/fs/reiserfs/acl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/init.h> #include <linux/posix_acl.h> diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index f59c667df15b..69ff280bdfe8 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Write ahead logging implementation copyright Chris Mason 2000 * diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c index 045b83ef9fd9..46bd7bd63a71 100644 --- a/fs/reiserfs/lock.c +++ b/fs/reiserfs/lock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "reiserfs.h" #include <linux/mutex.h> diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 1d34377fef97..48835a659948 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for * licensing and copyright details diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c index 2d5489b0a269..b0ae088dffc7 100644 --- a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright * details diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index e87aa21c30de..46492fb37a4c 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/reiserfs/xattr.c * diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h index 613ff5aef94e..c764352447ba 100644 --- a/fs/reiserfs/xattr.h +++ b/fs/reiserfs/xattr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/reiserfs_xattr.h> #include <linux/init.h> #include <linux/list.h> diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 54415f0e3d18..aa9380bac196 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/capability.h> #include <linux/fs.h> #include <linux/posix_acl.h> diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index e4cbb7719906..20be9a0e5870 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "reiserfs.h" #include <linux/errno.h> #include <linux/fs.h> diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index f15a5f9e84ce..5ed48da3d02b 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "reiserfs.h" #include <linux/capability.h> #include <linux/errno.h> diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index dc59df43b2db..a573ca45bacc 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "reiserfs.h" #include <linux/errno.h> #include <linux/fs.h> diff --git a/fs/romfs/Makefile b/fs/romfs/Makefile index 420beb7d495c..844928f15711 100644 --- a/fs/romfs/Makefile +++ b/fs/romfs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the linux RomFS filesystem routines. # diff --git a/fs/select.c b/fs/select.c index c6362e38ae92..063067e606ca 100644 --- a/fs/select.c +++ b/fs/select.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This file contains the procedures for the handling of select and poll * diff --git a/fs/seq_file.c b/fs/seq_file.c index dc7c2be963ed..4be761c1a03d 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/seq_file.c * diff --git a/fs/signalfd.c b/fs/signalfd.c index d2c434112f42..1c667af86da5 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/signalfd.c * diff --git a/fs/splice.c b/fs/splice.c index f3084cce0ea6..39e2dc01ac12 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -253,7 +253,7 @@ EXPORT_SYMBOL(add_to_pipe); */ int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { - unsigned int buffers = ACCESS_ONCE(pipe->buffers); + unsigned int buffers = READ_ONCE(pipe->buffers); spd->nr_pages_max = buffers; if (buffers <= PIPE_DEF_BUFFERS) diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 6655631c53ae..7bd9b8b856d0 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the linux squashfs routines. # diff --git a/fs/stat.c b/fs/stat.c index 8a6aa8caf891..873785dae022 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/stat.c * diff --git a/fs/statfs.c b/fs/statfs.c index fab9b6a3c116..c25dd9a26cc1 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fs.h> diff --git a/fs/super.c b/fs/super.c index 166c4ee0d0ed..994db21f59bf 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/super.c * diff --git a/fs/sync.c b/fs/sync.c index a576aa2e6b09..83ac79a960dd 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * High-level sync()-related operations */ diff --git a/fs/sysv/balloc.c b/fs/sysv/balloc.c index 862c1f74a583..0e69dbdf7277 100644 --- a/fs/sysv/balloc.c +++ b/fs/sysv/balloc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/sysv/balloc.c * diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index f5191cb2c947..88e38cd8f5c9 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/sysv/dir.c * diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 7ba997e31aeb..45fc79a18594 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/sysv/file.c * diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index eb963fbb7903..6c9801986af6 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/sysv/ialloc.c * diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 1c8bf9453a71..3c47b7d5d4cf 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/sysv/inode.c * diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 83809f5b5eca..bcb67b0cabe7 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/sysv/itree.c * diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index d8817f139763..250b0755b908 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/sysv/namei.c * diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 1e7e27c729af..e913698779c0 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SYSV_H #define _SYSV_H diff --git a/fs/timerfd.c b/fs/timerfd.c index ece0c02d7e63..040612ec9598 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/timerfd.c * diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile index 6f3251c2bf08..9758f709c736 100644 --- a/fs/ubifs/Makefile +++ b/fs/ubifs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_UBIFS_FS) += ubifs.o ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index 114ba455bac3..616a688f5d8f 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "ubifs.h" static int ubifs_crypt_get_context(struct inode *inode, void *ctx, size_t len) @@ -87,7 +88,6 @@ const struct fscrypt_operations ubifs_crypt_operations = { .key_prefix = "ubifs:", .get_context = ubifs_crypt_get_context, .set_context = ubifs_crypt_set_context, - .is_encrypted = __ubifs_crypt_is_encrypted, .empty_dir = ubifs_crypt_empty_dir, .max_namelen = ubifs_crypt_max_namelen, }; diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index fdc311246807..0164bcc827f8 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -38,7 +38,8 @@ void ubifs_set_inode_flags(struct inode *inode) { unsigned int flags = ubifs_inode(inode)->flags; - inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC); + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC | + S_ENCRYPTED); if (flags & UBIFS_SYNC_FL) inode->i_flags |= S_SYNC; if (flags & UBIFS_APPEND_FL) @@ -47,6 +48,8 @@ void ubifs_set_inode_flags(struct inode *inode) inode->i_flags |= S_IMMUTABLE; if (flags & UBIFS_DIRSYNC_FL) inode->i_flags |= S_DIRSYNC; + if (flags & UBIFS_CRYPT_FL) + inode->i_flags |= S_ENCRYPTED; } /* diff --git a/fs/ubifs/misc.c b/fs/ubifs/misc.c index 486a2844949f..586fd5b578a7 100644 --- a/fs/ubifs/misc.c +++ b/fs/ubifs/misc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include "ubifs.h" diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 5496b17b959c..7503e7cdf870 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2007,12 +2007,6 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) return c; } -#ifndef CONFIG_UBIFS_FS_ENCRYPTION -const struct fscrypt_operations ubifs_crypt_operations = { - .is_encrypted = __ubifs_crypt_is_encrypted, -}; -#endif - static int ubifs_fill_super(struct super_block *sb, void *data, int silent) { struct ubifs_info *c = sb->s_fs_info; @@ -2055,7 +2049,9 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; sb->s_op = &ubifs_super_operations; sb->s_xattr = ubifs_xattr_handlers; +#ifdef CONFIG_UBIFS_FS_ENCRYPTION sb->s_cop = &ubifs_crypt_operations; +#endif mutex_lock(&c->umount_mutex); err = mount_ubifs(c); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index cd43651f1731..63c7468147eb 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -38,12 +38,11 @@ #include <linux/backing-dev.h> #include <linux/security.h> #include <linux/xattr.h> -#ifdef CONFIG_UBIFS_FS_ENCRYPTION -#include <linux/fscrypt_supp.h> -#else -#include <linux/fscrypt_notsupp.h> -#endif #include <linux/random.h> + +#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_UBIFS_FS_ENCRYPTION) +#include <linux/fscrypt.h> + #include "ubifs-media.h" /* Version of this UBIFS implementation */ @@ -1835,18 +1834,13 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn, extern const struct fscrypt_operations ubifs_crypt_operations; -static inline bool __ubifs_crypt_is_encrypted(struct inode *inode) +static inline bool ubifs_crypt_is_encrypted(const struct inode *inode) { - struct ubifs_inode *ui = ubifs_inode(inode); + const struct ubifs_inode *ui = ubifs_inode(inode); return ui->flags & UBIFS_CRYPT_FL; } -static inline bool ubifs_crypt_is_encrypted(const struct inode *inode) -{ - return __ubifs_crypt_is_encrypted((struct inode *)inode); -} - /* Normal UBIFS messages */ __printf(2, 3) void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index c13eae819cbc..5ddc89d564fd 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -170,6 +170,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, err = ubifs_jnl_update(c, host, nm, inode, 0, 1); if (err) goto out_cancel; + ubifs_set_inode_flags(host); mutex_unlock(&host_ui->ui_mutex); ubifs_release_budget(c, &req); diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index b1b9a63d8cf3..630426ffb775 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _UDF_I_H #define _UDF_I_H diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index c13875d669c0..68c9f1d618f5 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_UDF_SB_H #define __LINUX_UDF_SB_H diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 63b034984378..fa206558128d 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UDF_DECL_H #define __UDF_DECL_H diff --git a/fs/udf/udfend.h b/fs/udf/udfend.h index 6a9f3a9cc428..a4363ac2cfeb 100644 --- a/fs/udf/udfend.h +++ b/fs/udf/udfend.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UDF_ENDIAN_H #define __UDF_ENDIAN_H diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index f80be4c5df9d..b5cd79065ef9 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ufs/balloc.c * diff --git a/fs/ufs/cylinder.c b/fs/ufs/cylinder.c index b4676322ddb6..1abe5454de47 100644 --- a/fs/ufs/cylinder.c +++ b/fs/ufs/cylinder.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ufs/cylinder.c * diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 48609f1d9580..2edc1755b7c5 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ufs/ufs_dir.c * diff --git a/fs/ufs/file.c b/fs/ufs/file.c index 042ddbf110cc..7e087581be7e 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ufs/file.c * diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index d1dd8cc33179..916b4a428933 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ufs/ialloc.c * diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index f36d6a53687d..afb601c0dda0 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ufs/inode.c * diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 8eca4eda8450..32545cd00ceb 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ufs/namei.c * diff --git a/fs/ufs/swab.h b/fs/ufs/swab.h index 8d974c4fd18b..a0e1d8c827f4 100644 --- a/fs/ufs/swab.h +++ b/fs/ufs/swab.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/ufs/swab.h * diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index c87f4c3fa9dd..b49e0efdf3d7 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _UFS_UFS_H #define _UFS_UFS_H 1 diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h index 150eef6f1233..ef9ead44776a 100644 --- a/fs/ufs/ufs_fs.h +++ b/fs/ufs/ufs_fs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/include/linux/ufs_fs.h * diff --git a/fs/ufs/util.c b/fs/ufs/util.c index 02497a492eb2..4fa633f84274 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ufs/util.c * diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 9fc7119a1551..1907be6d5808 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/ufs/util.h * diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 1c713fd5b3e6..f46d133c0949 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -381,7 +381,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) * in __get_user_pages if userfaultfd_release waits on the * caller of handle_userfault to release the mmap_sem. */ - if (unlikely(ACCESS_ONCE(ctx->released))) { + if (unlikely(READ_ONCE(ctx->released))) { /* * Don't return VM_FAULT_SIGBUS in this case, so a non * cooperative manager can close the uffd after the @@ -477,7 +477,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) vmf->flags, reason); up_read(&mm->mmap_sem); - if (likely(must_wait && !ACCESS_ONCE(ctx->released) && + if (likely(must_wait && !READ_ONCE(ctx->released) && (return_to_userland ? !signal_pending(current) : !fatal_signal_pending(current)))) { wake_up_poll(&ctx->fd_wqh, POLLIN); @@ -586,7 +586,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, set_current_state(TASK_KILLABLE); if (ewq->msg.event == 0) break; - if (ACCESS_ONCE(ctx->released) || + if (READ_ONCE(ctx->released) || fatal_signal_pending(current)) { /* * &ewq->wq may be queued in fork_event, but @@ -833,7 +833,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; - ACCESS_ONCE(ctx->released) = true; + WRITE_ONCE(ctx->released, true); if (!mmget_not_zero(mm)) goto wakeup; diff --git a/fs/utimes.c b/fs/utimes.c index 51edb9f9507c..e4b3d7c2c9f5 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/file.h> #include <linux/mount.h> #include <linux/namei.h> diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 1b98cfa342ab..f42fcf1b5465 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -71,6 +71,23 @@ config XFS_RT If unsure, say N. +config XFS_ONLINE_SCRUB + bool "XFS online metadata check support" + default n + depends on XFS_FS + help + If you say Y here you will be able to check metadata on a + mounted XFS filesystem. This feature is intended to reduce + filesystem downtime by supplementing xfs_repair. The key + advantage here is to look for problems proactively so that + they can be dealt with in a controlled manner. + + This feature is considered EXPERIMENTAL. Use with caution! + + See the xfs_scrub man page in section 8 for additional information. + + If unsure, say N. + config XFS_WARN bool "XFS Verbose Warnings" depends on XFS_FS && !XFS_DEBUG diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index a6e955bfead8..7ceb41a9786a 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -49,6 +49,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_dquot_buf.o \ xfs_ialloc.o \ xfs_ialloc_btree.o \ + xfs_iext_tree.o \ xfs_inode_fork.o \ xfs_inode_buf.o \ xfs_log_rlimit.o \ @@ -135,3 +136,31 @@ xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o + +# online scrub/repair +ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) + +# Tracepoints like to blow up, so build that before everything else + +xfs-y += $(addprefix scrub/, \ + trace.o \ + agheader.o \ + alloc.o \ + attr.o \ + bmap.o \ + btree.o \ + common.o \ + dabtree.o \ + dir.o \ + ialloc.o \ + inode.o \ + parent.o \ + refcount.o \ + rmap.o \ + scrub.o \ + symlink.o \ + ) + +xfs-$(CONFIG_XFS_RT) += scrub/rtbitmap.o +xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o +endif diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 4d85992d75b2..758f37ac5ad3 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -119,8 +119,7 @@ kmem_zone_free(kmem_zone_t *zone, void *ptr) static inline void kmem_zone_destroy(kmem_zone_t *zone) { - if (zone) - kmem_cache_destroy(zone); + kmem_cache_destroy(zone); } extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t); diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index df3e600835e8..2291f4224e24 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -27,6 +27,7 @@ #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_alloc.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_cksum.h" diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 744dcaec34cc..0da80019a917 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -31,6 +31,7 @@ #include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_extent_busy.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_cksum.h" #include "xfs_trace.h" @@ -1584,6 +1585,10 @@ xfs_alloc_ag_vextent_small( bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno, 0); + if (!bp) { + error = -EFSCORRUPTED; + goto error0; + } xfs_trans_binval(args->tp, bp); } args->len = 1; @@ -2141,6 +2146,10 @@ xfs_alloc_fix_freelist( if (error) goto out_agbp_relse; bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); + if (!bp) { + error = -EFSCORRUPTED; + goto out_agbp_relse; + } xfs_trans_binval(tp, bp); } @@ -2923,3 +2932,52 @@ xfs_alloc_query_all( query.fn = fn; return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query); } + +/* Find the size of the AG, in blocks. */ +xfs_agblock_t +xfs_ag_block_count( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + ASSERT(agno < mp->m_sb.sb_agcount); + + if (agno < mp->m_sb.sb_agcount - 1) + return mp->m_sb.sb_agblocks; + return mp->m_sb.sb_dblocks - (agno * mp->m_sb.sb_agblocks); +} + +/* + * Verify that an AG block number pointer neither points outside the AG + * nor points at static metadata. + */ +bool +xfs_verify_agbno( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t agbno) +{ + xfs_agblock_t eoag; + + eoag = xfs_ag_block_count(mp, agno); + if (agbno >= eoag) + return false; + if (agbno <= XFS_AGFL_BLOCK(mp)) + return false; + return true; +} + +/* + * Verify that an FS block number pointer neither points outside the + * filesystem nor points at static AG metadata. + */ +bool +xfs_verify_fsbno( + struct xfs_mount *mp, + xfs_fsblock_t fsbno) +{ + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); + + if (agno >= mp->m_sb.sb_agcount) + return false; + return xfs_verify_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno)); +} diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index ef26edc2e938..7ba2d129d504 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -232,5 +232,9 @@ int xfs_alloc_query_range(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn, void *priv); int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn, void *priv); +xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno); +bool xfs_verify_agbno(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno); +bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno); #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 5c16db86b38f..53cc8b986eac 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -397,13 +397,9 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) /* rounded down */ offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3; - switch (dp->i_d.di_format) { - case XFS_DINODE_FMT_DEV: + if (dp->i_d.di_format == XFS_DINODE_FMT_DEV) { minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; return (offset >= minforkoff) ? minforkoff : 0; - case XFS_DINODE_FMT_UUID: - minforkoff = roundup(sizeof(uuid_t), 8) >> 3; - return (offset >= minforkoff) ? minforkoff : 0; } /* diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 044a363119be..08df809e2315 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -38,6 +38,7 @@ #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" #include "xfs_rtalloc.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trans_space.h" @@ -112,28 +113,21 @@ xfs_bmap_compute_maxlevels( STATIC int /* error */ xfs_bmbt_lookup_eq( struct xfs_btree_cur *cur, - xfs_fileoff_t off, - xfs_fsblock_t bno, - xfs_filblks_t len, + struct xfs_bmbt_irec *irec, int *stat) /* success/failure */ { - cur->bc_rec.b.br_startoff = off; - cur->bc_rec.b.br_startblock = bno; - cur->bc_rec.b.br_blockcount = len; + cur->bc_rec.b = *irec; return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); } STATIC int /* error */ -xfs_bmbt_lookup_ge( +xfs_bmbt_lookup_first( struct xfs_btree_cur *cur, - xfs_fileoff_t off, - xfs_fsblock_t bno, - xfs_filblks_t len, int *stat) /* success/failure */ { - cur->bc_rec.b.br_startoff = off; - cur->bc_rec.b.br_startblock = bno; - cur->bc_rec.b.br_blockcount = len; + cur->bc_rec.b.br_startoff = 0; + cur->bc_rec.b.br_startblock = 0; + cur->bc_rec.b.br_blockcount = 0; return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); } @@ -160,21 +154,17 @@ static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) } /* - * Update the record referred to by cur to the value given - * by [off, bno, len, state]. + * Update the record referred to by cur to the value given by irec * This either works (return 0) or gets an EFSCORRUPTED error. */ STATIC int xfs_bmbt_update( struct xfs_btree_cur *cur, - xfs_fileoff_t off, - xfs_fsblock_t bno, - xfs_filblks_t len, - xfs_exntst_t state) + struct xfs_bmbt_irec *irec) { union xfs_btree_rec rec; - xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state); + xfs_bmbt_disk_set_all(&rec.bmbt, irec); return xfs_btree_update(cur, &rec); } @@ -242,7 +232,6 @@ xfs_bmap_forkoff_reset( { if (whichfork == XFS_ATTR_FORK && ip->i_d.di_format != XFS_DINODE_FMT_DEV && - ip->i_d.di_format != XFS_DINODE_FMT_UUID && ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; @@ -499,31 +488,6 @@ error_norelse: } /* - * Add bmap trace insert entries for all the contents of the extent records. - */ -void -xfs_bmap_trace_exlist( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t cnt, /* count of entries in the list */ - int whichfork, /* data or attr or cow fork */ - unsigned long caller_ip) -{ - xfs_extnum_t idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - int state = 0; - - if (whichfork == XFS_ATTR_FORK) - state |= BMAP_ATTRFORK; - else if (whichfork == XFS_COW_FORK) - state |= BMAP_COWFORK; - - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(cnt == xfs_iext_count(ifp)); - for (idx = 0; idx < cnt; idx++) - trace_xfs_extlist(ip, idx, state, caller_ip); -} - -/* * Validate that the bmbt_irecs being returned from bmapi are valid * given the caller's original parameters. Specifically check the * ranges of the returned irecs to ensure that they only extend beyond @@ -657,8 +621,8 @@ xfs_bmap_btree_to_extents( cbno = be64_to_cpu(*pp); *logflagsp = 0; #ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, cbno, 1))) - return error; + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, + xfs_btree_check_lptr(cur, cbno, 1)); #endif error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); @@ -703,14 +667,14 @@ xfs_bmap_extents_to_btree( xfs_bmbt_rec_t *arp; /* child record pointer */ struct xfs_btree_block *block; /* btree root block */ xfs_btree_cur_t *cur; /* bmap btree cursor */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ int error; /* error return value */ - xfs_extnum_t i, cnt; /* extent record index */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_key_t *kp; /* root block key pointer */ xfs_mount_t *mp; /* mount structure */ - xfs_extnum_t nextents; /* number of file extents */ xfs_bmbt_ptr_t *pp; /* root block address pointer */ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec rec; + xfs_extnum_t cnt = 0; mp = ip->i_mount; ASSERT(whichfork != XFS_COW_FORK); @@ -789,15 +753,12 @@ xfs_bmap_extents_to_btree( XFS_BTNUM_BMAP, 0, 0, ip->i_ino, XFS_BTREE_LONG_PTRS); - arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); - nextents = xfs_iext_count(ifp); - for (cnt = i = 0; i < nextents; i++) { - ep = xfs_iext_get_ext(ifp, i); - if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { - arp->l0 = cpu_to_be64(ep->l0); - arp->l1 = cpu_to_be64(ep->l1); - arp++; cnt++; - } + for_each_xfs_iext(ifp, &icur, &rec) { + if (isnullstartblock(rec.br_startblock)) + continue; + arp = XFS_BMBT_REC_ADDR(mp, ablock, 1 + cnt); + xfs_bmbt_disk_set_all(arp, &rec); + cnt++; } ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); xfs_btree_set_numrecs(ablock, cnt); @@ -845,6 +806,8 @@ xfs_bmap_local_to_extents_empty( xfs_bmap_forkoff_reset(ip, whichfork); ifp->if_flags &= ~XFS_IFINLINE; ifp->if_flags |= XFS_IFEXTENTS; + ifp->if_u1.if_root = NULL; + ifp->if_height = 0; XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); } @@ -868,6 +831,7 @@ xfs_bmap_local_to_extents( xfs_alloc_arg_t args; /* allocation arguments */ xfs_buf_t *bp; /* buffer for extent block */ struct xfs_bmbt_irec rec; + struct xfs_iext_cursor icur; /* * We don't want to deal with the case of keeping inode data inline yet. @@ -885,8 +849,7 @@ xfs_bmap_local_to_extents( flags = 0; error = 0; - ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == - XFS_IFINLINE); + ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS)) == XFS_IFINLINE); memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; @@ -930,15 +893,16 @@ xfs_bmap_local_to_extents( xfs_bmap_local_to_extents_empty(ip, whichfork); flags |= XFS_ILOG_CORE; + ifp->if_u1.if_root = NULL; + ifp->if_height = 0; + rec.br_startoff = 0; rec.br_startblock = args.fsbno; rec.br_blockcount = 1; rec.br_state = XFS_EXT_NORM; - xfs_iext_insert(ip, 0, 1, &rec, 0); + xfs_iext_first(ifp, &icur); + xfs_iext_insert(ip, &icur, &rec, 0); - trace_xfs_bmap_post_update(ip, 0, - whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0, - _THIS_IP_); XFS_IFORK_NEXT_SET(ip, whichfork, 1); ip->i_d.di_nblocks = 1; xfs_trans_mod_dquot_byino(tp, ip, @@ -973,7 +937,8 @@ xfs_bmap_add_attrfork_btree( cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); cur->bc_private.b.dfops = dfops; cur->bc_private.b.firstblock = *firstblock; - if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) + error = xfs_bmbt_lookup_first(cur, &stat); + if (error) goto error0; /* must be at least one entry */ XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0); @@ -1124,9 +1089,6 @@ xfs_bmap_add_attrfork( case XFS_DINODE_FMT_DEV: ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; break; - case XFS_DINODE_FMT_UUID: - ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3; - break; case XFS_DINODE_FMT_LOCAL: case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: @@ -1206,32 +1168,35 @@ trans_cancel: */ /* - * Read in the extents to if_extents. - * All inode fields are set up by caller, we just traverse the btree - * and copy the records in. If the file system cannot contain unwritten - * extents, the records are checked for no "state" flags. + * Read in extents from a btree-format inode. */ -int /* error */ -xfs_bmap_read_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork) /* data or attr fork */ +int +xfs_iread_extents( + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork) { - struct xfs_btree_block *block; /* current btree block */ - xfs_fsblock_t bno; /* block # of "block" */ - xfs_buf_t *bp; /* buffer for "block" */ - int error; /* error return value */ - xfs_extnum_t i, j; /* index into the extents list */ - xfs_ifork_t *ifp; /* fork structure */ - int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - __be64 *pp; /* pointer to block address */ - /* REFERENCED */ - xfs_extnum_t room; /* number of entries there's room for */ + struct xfs_mount *mp = ip->i_mount; + int state = xfs_bmap_fork_to_state(whichfork); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + xfs_extnum_t nextents = XFS_IFORK_NEXTENTS(ip, whichfork); + struct xfs_btree_block *block = ifp->if_broot; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec new; + xfs_fsblock_t bno; + struct xfs_buf *bp; + xfs_extnum_t i, j; + int level; + __be64 *pp; + int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - block = ifp->if_broot; /* * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. */ @@ -1248,21 +1213,23 @@ xfs_bmap_read_extents( error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); if (error) - return error; + goto out; block = XFS_BUF_TO_BLOCK(bp); if (level == 0) break; pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); XFS_WANT_CORRUPTED_GOTO(mp, - XFS_FSB_SANITY_CHECK(mp, bno), error0); + XFS_FSB_SANITY_CHECK(mp, bno), out_brelse); xfs_trans_brelse(tp, bp); } + /* * Here with bp and block set to the leftmost leaf node in the tree. */ - room = xfs_iext_count(ifp); i = 0; + xfs_iext_first(ifp, &icur); + /* * Loop over all leaf nodes. Copy information to the extent records. */ @@ -1272,14 +1239,15 @@ xfs_bmap_read_extents( xfs_extnum_t num_recs; num_recs = xfs_btree_get_numrecs(block); - if (unlikely(i + num_recs > room)) { - ASSERT(i + num_recs <= room); + if (unlikely(i + num_recs > nextents)) { + ASSERT(i + num_recs <= nextents); xfs_warn(ip->i_mount, "corrupt dinode %Lu, (btree extents).", (unsigned long long) ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)", + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, ip->i_mount, block); - goto error0; + error = -EFSCORRUPTED; + goto out_brelse; } /* * Read-ahead the next leaf block, if any. @@ -1292,15 +1260,17 @@ xfs_bmap_read_extents( * Copy records into the extent records. */ frp = XFS_BMBT_REC_ADDR(mp, block, 1); - for (j = 0; j < num_recs; j++, i++, frp++) { - xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); - trp->l0 = be64_to_cpu(frp->l0); - trp->l1 = be64_to_cpu(frp->l1); - if (!xfs_bmbt_validate_extent(mp, whichfork, trp)) { + for (j = 0; j < num_recs; j++, frp++, i++) { + xfs_bmbt_disk_get_all(frp, &new); + if (!xfs_bmbt_validate_extent(mp, whichfork, &new)) { XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", XFS_ERRLEVEL_LOW, mp); - goto error0; + error = -EFSCORRUPTED; + goto out_brelse; } + xfs_iext_insert(ip, &icur, &new, state); + trace_xfs_read_extent(ip, &icur, state, _THIS_IP_); + xfs_iext_next(ifp, &icur); } xfs_trans_brelse(tp, bp); bno = nextbno; @@ -1312,71 +1282,74 @@ xfs_bmap_read_extents( error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); if (error) - return error; + goto out; block = XFS_BUF_TO_BLOCK(bp); } - if (i != XFS_IFORK_NEXTENTS(ip, whichfork)) - return -EFSCORRUPTED; + + if (i != XFS_IFORK_NEXTENTS(ip, whichfork)) { + error = -EFSCORRUPTED; + goto out; + } ASSERT(i == xfs_iext_count(ifp)); - XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); + + ifp->if_flags |= XFS_IFEXTENTS; return 0; -error0: + +out_brelse: xfs_trans_brelse(tp, bp); - return -EFSCORRUPTED; +out: + xfs_iext_destroy(ifp); + return error; } /* - * Returns the file-relative block number of the first unused block(s) - * in the file with at least "len" logically contiguous blocks free. - * This is the lowest-address hole if the file has holes, else the first block - * past the end of file. - * Return 0 if the file is currently local (in-inode). + * Returns the relative block number of the first unused block(s) in the given + * fork with at least "len" logically contiguous blocks free. This is the + * lowest-address hole if the fork has holes, else the first block past the end + * of fork. Return 0 if the fork is currently local (in-inode). */ int /* error */ xfs_bmap_first_unused( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_extlen_t len, /* size of hole to find */ - xfs_fileoff_t *first_unused, /* unused block */ - int whichfork) /* data or attr fork */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_extlen_t len, /* size of hole to find */ + xfs_fileoff_t *first_unused, /* unused block */ + int whichfork) /* data or attr fork */ { - int error; /* error return value */ - int idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_fileoff_t lastaddr; /* last block number seen */ - xfs_fileoff_t lowest; /* lowest useful block */ - xfs_fileoff_t max; /* starting useful block */ - xfs_extnum_t nextents; /* number of extent entries */ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec got; + struct xfs_iext_cursor icur; + xfs_fileoff_t lastaddr = 0; + xfs_fileoff_t lowest, max; + int error; ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { *first_unused = 0; return 0; } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - lowest = *first_unused; - nextents = xfs_iext_count(ifp); - for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { - struct xfs_bmbt_irec got; - xfs_iext_get_extent(ifp, idx, &got); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + lowest = max = *first_unused; + for_each_xfs_iext(ifp, &icur, &got) { /* * See if the hole before this extent will work. */ if (got.br_startoff >= lowest + len && - got.br_startoff - max >= len) { - *first_unused = max; - return 0; - } + got.br_startoff - max >= len) + break; lastaddr = got.br_startoff + got.br_blockcount; max = XFS_FILEOFF_MAX(lastaddr, lowest); } + *first_unused = max; return 0; } @@ -1396,7 +1369,7 @@ xfs_bmap_last_before( { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_bmbt_irec got; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; int error; switch (XFS_IFORK_FORMAT(ip, whichfork)) { @@ -1416,17 +1389,8 @@ xfs_bmap_last_before( return error; } - if (xfs_iext_lookup_extent(ip, ifp, *last_block - 1, &idx, &got)) { - if (got.br_startoff <= *last_block - 1) - return 0; - } - - if (xfs_iext_get_extent(ifp, idx - 1, &got)) { - *last_block = got.br_startoff + got.br_blockcount; - return 0; - } - - *last_block = 0; + if (!xfs_iext_lookup_extent_before(ip, ifp, last_block, &icur, &got)) + *last_block = 0; return 0; } @@ -1439,8 +1403,8 @@ xfs_bmap_last_extent( int *is_empty) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_iext_cursor icur; int error; - int nextents; if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(tp, ip, whichfork); @@ -1448,14 +1412,11 @@ xfs_bmap_last_extent( return error; } - nextents = xfs_iext_count(ifp); - if (nextents == 0) { + xfs_iext_last(ifp, &icur); + if (!xfs_iext_get_extent(ifp, &icur, rec)) *is_empty = 1; - return 0; - } - - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec); - *is_empty = 0; + else + *is_empty = 0; return 0; } @@ -1477,14 +1438,14 @@ xfs_bmap_isaeof( int is_empty; int error; - bma->aeof = 0; + bma->aeof = false; error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, &is_empty); if (error) return error; if (is_empty) { - bma->aeof = 1; + bma->aeof = true; return 0; } @@ -1540,10 +1501,10 @@ xfs_bmap_one_block( xfs_inode_t *ip, /* incore inode */ int whichfork) /* data or attr fork */ { - xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */ xfs_ifork_t *ifp; /* inode fork pointer */ int rval; /* return value */ xfs_bmbt_irec_t s; /* internal version of extent */ + struct xfs_iext_cursor icur; #ifndef DEBUG if (whichfork == XFS_DATA_FORK) @@ -1555,8 +1516,8 @@ xfs_bmap_one_block( return 0; ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ep = xfs_iext_get_ext(ifp, 0); - xfs_bmbt_get_all(ep, &s); + xfs_iext_first(ifp, &icur); + xfs_iext_get_extent(ifp, &icur, &s); rval = s.br_startoff == 0 && s.br_blockcount == 1; if (rval && whichfork == XFS_DATA_FORK) ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); @@ -1576,8 +1537,6 @@ xfs_bmap_add_extent_delay_real( int whichfork) { struct xfs_bmbt_irec *new = &bma->got; - int diff; /* temp value */ - xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ int error; /* error return value */ int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ @@ -1585,14 +1544,14 @@ xfs_bmap_add_extent_delay_real( xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ - int state = 0;/* state bits, accessed thru macros */ + int state = xfs_bmap_fork_to_state(whichfork); xfs_filblks_t da_new; /* new count del alloc blocks used */ xfs_filblks_t da_old; /* old count del alloc blocks used */ xfs_filblks_t temp=0; /* value for da_new calculations */ - xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ struct xfs_mount *mp; xfs_extnum_t *nextents; + struct xfs_bmbt_irec old; mp = bma->ip->i_mount; ifp = XFS_IFORK_PTR(bma->ip, whichfork); @@ -1600,8 +1559,6 @@ xfs_bmap_add_extent_delay_real( nextents = (whichfork == XFS_COW_FORK ? &bma->ip->i_cnextents : &bma->ip->i_d.di_nextents); - ASSERT(bma->idx >= 0); - ASSERT(bma->idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); @@ -1612,15 +1569,12 @@ xfs_bmap_add_extent_delay_real( #define RIGHT r[1] #define PREV r[2] - if (whichfork == XFS_COW_FORK) - state |= BMAP_COWFORK; - /* * Set up a bunch of variables to make the tests simpler. */ - ep = xfs_iext_get_ext(ifp, bma->idx); - xfs_bmbt_get_all(ep, &PREV); + xfs_iext_get_extent(ifp, &bma->icur, &PREV); new_endoff = new->br_startoff + new->br_blockcount; + ASSERT(isnullstartblock(PREV.br_startblock)); ASSERT(PREV.br_startoff <= new->br_startoff); ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); @@ -1640,10 +1594,8 @@ xfs_bmap_add_extent_delay_real( * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. */ - if (bma->idx > 0) { + if (xfs_iext_peek_prev_extent(ifp, &bma->icur, &LEFT)) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT); - if (isnullstartblock(LEFT.br_startblock)) state |= BMAP_LEFT_DELAY; } @@ -1660,10 +1612,8 @@ xfs_bmap_add_extent_delay_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (bma->idx < xfs_iext_count(ifp) - 1) { + if (xfs_iext_peek_next_extent(ifp, &bma->icur, &RIGHT)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); - if (isnullstartblock(RIGHT.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -1693,22 +1643,19 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left and right neighbors are both contiguous with new. */ - bma->idx--; - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), - LEFT.br_blockcount + PREV.br_blockcount + - RIGHT.br_blockcount); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - - xfs_iext_remove(bma->ip, bma->idx + 1, 2, state); + LEFT.br_blockcount += PREV.br_blockcount + RIGHT.br_blockcount; + + xfs_iext_remove(bma->ip, &bma->icur, state); + xfs_iext_remove(bma->ip, &bma->icur, state); + xfs_iext_prev(ifp, &bma->icur); + xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT); (*nextents)--; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, - RIGHT.br_startblock, - RIGHT.br_blockcount, &i); + error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); @@ -1720,11 +1667,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, - LEFT.br_startblock, - LEFT.br_blockcount + - PREV.br_blockcount + - RIGHT.br_blockcount, LEFT.br_state); + error = xfs_bmbt_update(bma->cur, &LEFT); if (error) goto done; } @@ -1735,28 +1678,22 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left neighbor is contiguous, the right is not. */ - bma->idx--; + old = LEFT; + LEFT.br_blockcount += PREV.br_blockcount; - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), - LEFT.br_blockcount + PREV.br_blockcount); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_iext_remove(bma->ip, &bma->icur, state); + xfs_iext_prev(ifp, &bma->icur); + xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT); - xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, - LEFT.br_startblock, LEFT.br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(bma->cur, &old, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, - LEFT.br_startblock, - LEFT.br_blockcount + - PREV.br_blockcount, LEFT.br_state); + error = xfs_bmbt_update(bma->cur, &LEFT); if (error) goto done; } @@ -1767,27 +1704,23 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The right neighbor is contiguous, the left is not. */ - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_startblock(ep, new->br_startblock); - xfs_bmbt_set_blockcount(ep, - PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + PREV.br_startblock = new->br_startblock; + PREV.br_blockcount += RIGHT.br_blockcount; + + xfs_iext_next(ifp, &bma->icur); + xfs_iext_remove(bma->ip, &bma->icur, state); + xfs_iext_prev(ifp, &bma->icur); + xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); - xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, - RIGHT.br_startblock, - RIGHT.br_blockcount, &i); + error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(bma->cur, PREV.br_startoff, - new->br_startblock, - PREV.br_blockcount + - RIGHT.br_blockcount, PREV.br_state); + error = xfs_bmbt_update(bma->cur, &PREV); if (error) goto done; } @@ -1799,23 +1732,19 @@ xfs_bmap_add_extent_delay_real( * Neither the left nor right neighbors are contiguous with * the new one. */ - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_startblock(ep, new->br_startblock); - xfs_bmbt_set_state(ep, new->br_state); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + PREV.br_startblock = new->br_startblock; + PREV.br_state = new->br_state; + xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, - new->br_startblock, new->br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(bma->cur, new, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); - bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; @@ -1828,40 +1757,33 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is contiguous. */ - trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1), - LEFT.br_blockcount + new->br_blockcount); - xfs_bmbt_set_startoff(ep, - PREV.br_startoff + new->br_blockcount); - trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_); - + old = LEFT; temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), + startblockval(PREV.br_startblock)); + + LEFT.br_blockcount += new->br_blockcount; + + PREV.br_blockcount = temp; + PREV.br_startoff += new->br_blockcount; + PREV.br_startblock = nullstartblock(da_new); + + xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); + xfs_iext_prev(ifp, &bma->icur); + xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, - LEFT.br_startblock, LEFT.br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(bma->cur, &old, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, - LEFT.br_startblock, - LEFT.br_blockcount + - new->br_blockcount, - LEFT.br_state); + error = xfs_bmbt_update(bma->cur, &LEFT); if (error) goto done; } - da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), - startblockval(PREV.br_startblock)); - xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - - bma->idx--; break; case BMAP_LEFT_FILLING: @@ -1869,23 +1791,16 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_startoff(ep, new_endoff); - temp = PREV.br_blockcount - new->br_blockcount; - xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(bma->ip, bma->idx, 1, new, state); + xfs_iext_update_extent(bma->ip, state, &bma->icur, new); (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, - new->br_startblock, new->br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(bma->cur, new, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); - bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; @@ -1900,12 +1815,18 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + + temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, bma->idx + 1); - xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); - trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); + + PREV.br_startoff = new_endoff; + PREV.br_blockcount = temp; + PREV.br_startblock = nullstartblock(da_new); + xfs_iext_next(ifp, &bma->icur); + xfs_iext_insert(bma->ip, &bma->icur, &PREV, state); + xfs_iext_prev(ifp, &bma->icur); break; case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -1913,40 +1834,34 @@ xfs_bmap_add_extent_delay_real( * Filling in the last part of a previous delayed allocation. * The right neighbor is contiguous with the new allocation. */ - temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1), - new->br_startoff, new->br_startblock, - new->br_blockcount + RIGHT.br_blockcount, - RIGHT.br_state); - trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); + old = RIGHT; + RIGHT.br_startoff = new->br_startoff; + RIGHT.br_startblock = new->br_startblock; + RIGHT.br_blockcount += new->br_blockcount; + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, - RIGHT.br_startblock, - RIGHT.br_blockcount, &i); + error = xfs_bmbt_lookup_eq(bma->cur, &old, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(bma->cur, new->br_startoff, - new->br_startblock, - new->br_blockcount + - RIGHT.br_blockcount, - RIGHT.br_state); + error = xfs_bmbt_update(bma->cur, &RIGHT); if (error) goto done; } + temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock)); - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - bma->idx++; + PREV.br_blockcount = temp; + PREV.br_startblock = nullstartblock(da_new); + + xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); + xfs_iext_next(ifp, &bma->icur); + xfs_iext_update_extent(bma->ip, state, &bma->icur, &RIGHT); break; case BMAP_RIGHT_FILLING: @@ -1954,22 +1869,16 @@ xfs_bmap_add_extent_delay_real( * Filling in the last part of a previous delayed allocation. * The right neighbor is not contiguous. */ - temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state); + xfs_iext_update_extent(bma->ip, state, &bma->icur, new); (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, - new->br_startblock, new->br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(bma->cur, new, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); - bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; @@ -1984,14 +1893,16 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + + temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, bma->idx); - xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - bma->idx++; + PREV.br_startblock = nullstartblock(da_new); + PREV.br_blockcount = temp; + xfs_iext_insert(bma->ip, &bma->icur, &PREV, state); + xfs_iext_next(ifp, &bma->icur); break; case 0: @@ -2015,30 +1926,40 @@ xfs_bmap_add_extent_delay_real( * PREV @ idx LEFT RIGHT * inserted at idx + 1 */ - temp = new->br_startoff - PREV.br_startoff; - temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; - trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ + old = PREV; + + /* LEFT is the new middle */ LEFT = *new; + + /* RIGHT is the new right */ RIGHT.br_state = PREV.br_state; - RIGHT.br_startblock = nullstartblock( - (int)xfs_bmap_worst_indlen(bma->ip, temp2)); RIGHT.br_startoff = new_endoff; - RIGHT.br_blockcount = temp2; - /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ - xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state); + RIGHT.br_blockcount = + PREV.br_startoff + PREV.br_blockcount - new_endoff; + RIGHT.br_startblock = + nullstartblock(xfs_bmap_worst_indlen(bma->ip, + RIGHT.br_blockcount)); + + /* truncate PREV */ + PREV.br_blockcount = new->br_startoff - PREV.br_startoff; + PREV.br_startblock = + nullstartblock(xfs_bmap_worst_indlen(bma->ip, + PREV.br_blockcount)); + xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); + + xfs_iext_next(ifp, &bma->icur); + xfs_iext_insert(bma->ip, &bma->icur, &RIGHT, state); + xfs_iext_insert(bma->ip, &bma->icur, &LEFT, state); (*nextents)++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, - new->br_startblock, new->br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(bma->cur, new, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); - bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; @@ -2053,30 +1974,9 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } - temp = xfs_bmap_worst_indlen(bma->ip, temp); - temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); - diff = (int)(temp + temp2 - - (startblockval(PREV.br_startblock) - - (bma->cur ? - bma->cur->bc_private.b.allocated : 0))); - if (diff > 0) { - error = xfs_mod_fdblocks(bma->ip->i_mount, - -((int64_t)diff), false); - ASSERT(!error); - if (error) - goto done; - } - ep = xfs_iext_get_ext(ifp, bma->idx); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2), - nullstartblock((int)temp2)); - trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_); - - bma->idx++; - da_new = temp + temp2; + da_new = startblockval(PREV.br_startblock) + + startblockval(RIGHT.br_startblock); break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: @@ -2110,19 +2010,17 @@ xfs_bmap_add_extent_delay_real( goto done; } - /* adjust for changes in reserved delayed indirect blocks */ - if (da_old || da_new) { - temp = da_new; - if (bma->cur) - temp += bma->cur->bc_private.b.allocated; - if (temp < da_old) - xfs_mod_fdblocks(bma->ip->i_mount, - (int64_t)(da_old - temp), false); + if (bma->cur) { + da_new += bma->cur->bc_private.b.allocated; + bma->cur->bc_private.b.allocated = 0; } - /* clear out the allocated field, done with it now in any case. */ - if (bma->cur) - bma->cur->bc_private.b.allocated = 0; + /* adjust for changes in reserved delayed indirect blocks */ + if (da_new != da_old) { + ASSERT(state == 0 || da_new < da_old); + error = xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), + false); + } xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: @@ -2142,7 +2040,7 @@ xfs_bmap_add_extent_unwritten_real( struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ int whichfork, - xfs_extnum_t *idx, /* extent number to update/insert */ + struct xfs_iext_cursor *icur, xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ xfs_fsblock_t *first, /* pointer to firstblock variable */ @@ -2150,28 +2048,22 @@ xfs_bmap_add_extent_unwritten_real( int *logflagsp) /* inode logging flags */ { xfs_btree_cur_t *cur; /* btree cursor */ - xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ int error; /* error return value */ int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_fileoff_t new_endoff; /* end offset of new entry */ - xfs_exntst_t newext; /* new extent state */ - xfs_exntst_t oldext; /* old extent state */ xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ - int state = 0;/* state bits, accessed thru macros */ + int state = xfs_bmap_fork_to_state(whichfork); struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec old; *logflagsp = 0; cur = *curp; ifp = XFS_IFORK_PTR(ip, whichfork); - if (whichfork == XFS_COW_FORK) - state |= BMAP_COWFORK; - ASSERT(*idx >= 0); - ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); XFS_STATS_INC(mp, xs_add_exlist); @@ -2184,12 +2076,8 @@ xfs_bmap_add_extent_unwritten_real( * Set up a bunch of variables to make the tests simpler. */ error = 0; - ep = xfs_iext_get_ext(ifp, *idx); - xfs_bmbt_get_all(ep, &PREV); - newext = new->br_state; - oldext = (newext == XFS_EXT_UNWRITTEN) ? - XFS_EXT_NORM : XFS_EXT_UNWRITTEN; - ASSERT(PREV.br_state == oldext); + xfs_iext_get_extent(ifp, icur, &PREV); + ASSERT(new->br_state != PREV.br_state); new_endoff = new->br_startoff + new->br_blockcount; ASSERT(PREV.br_startoff <= new->br_startoff); ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); @@ -2207,10 +2095,8 @@ xfs_bmap_add_extent_unwritten_real( * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. */ - if (*idx > 0) { + if (xfs_iext_peek_prev_extent(ifp, icur, &LEFT)) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); - if (isnullstartblock(LEFT.br_startblock)) state |= BMAP_LEFT_DELAY; } @@ -2218,7 +2104,7 @@ xfs_bmap_add_extent_unwritten_real( if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && - LEFT.br_state == newext && + LEFT.br_state == new->br_state && LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) state |= BMAP_LEFT_CONTIG; @@ -2227,9 +2113,8 @@ xfs_bmap_add_extent_unwritten_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < xfs_iext_count(ifp) - 1) { + if (xfs_iext_peek_next_extent(ifp, icur, &RIGHT)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -2237,7 +2122,7 @@ xfs_bmap_add_extent_unwritten_real( if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && new_endoff == RIGHT.br_startoff && new->br_startblock + new->br_blockcount == RIGHT.br_startblock && - newext == RIGHT.br_state && + new->br_state == RIGHT.br_state && new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) != @@ -2258,24 +2143,20 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The left and right neighbors are both contiguous with new. */ - --*idx; + LEFT.br_blockcount += PREV.br_blockcount + RIGHT.br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), - LEFT.br_blockcount + PREV.br_blockcount + - RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - xfs_iext_remove(ip, *idx + 1, 2, state); + xfs_iext_remove(ip, icur, state); + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &LEFT); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 2); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, - RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) @@ -2290,10 +2171,8 @@ xfs_bmap_add_extent_unwritten_real( if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, - LEFT.br_startblock, - LEFT.br_blockcount + PREV.br_blockcount + - RIGHT.br_blockcount, LEFT.br_state))) + error = xfs_bmbt_update(cur, &LEFT); + if (error) goto done; } break; @@ -2303,23 +2182,19 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The left neighbor is contiguous, the right is not. */ - --*idx; - - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), - LEFT.br_blockcount + PREV.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + LEFT.br_blockcount += PREV.br_blockcount; - xfs_iext_remove(ip, *idx + 1, 1, state); + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &LEFT); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, - PREV.br_startblock, PREV.br_blockcount, - &i))) + error = xfs_bmbt_lookup_eq(cur, &PREV, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) @@ -2328,10 +2203,8 @@ xfs_bmap_add_extent_unwritten_real( if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, - LEFT.br_startblock, - LEFT.br_blockcount + PREV.br_blockcount, - LEFT.br_state))) + error = xfs_bmbt_update(cur, &LEFT); + if (error) goto done; } break; @@ -2341,21 +2214,22 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The right neighbor is contiguous, the left is not. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, - PREV.br_blockcount + RIGHT.br_blockcount); - xfs_bmbt_set_state(ep, newext); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); + PREV.br_blockcount += RIGHT.br_blockcount; + PREV.br_state = new->br_state; + + xfs_iext_next(ifp, icur); + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &PREV); + XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, - RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) @@ -2364,10 +2238,8 @@ xfs_bmap_add_extent_unwritten_real( if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - if ((error = xfs_bmbt_update(cur, new->br_startoff, - new->br_startblock, - new->br_blockcount + RIGHT.br_blockcount, - newext))) + error = xfs_bmbt_update(cur, &PREV); + if (error) goto done; } break; @@ -2378,22 +2250,19 @@ xfs_bmap_add_extent_unwritten_real( * Neither the left nor right neighbors are contiguous with * the new one. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_state(ep, newext); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + PREV.br_state = new->br_state; + xfs_iext_update_extent(ip, state, icur, &PREV); if (cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, - new->br_startblock, new->br_blockcount, - &i))) + error = xfs_bmbt_lookup_eq(cur, new, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - if ((error = xfs_bmbt_update(cur, new->br_startoff, - new->br_startblock, new->br_blockcount, - newext))) + error = xfs_bmbt_update(cur, &PREV); + if (error) goto done; } break; @@ -2403,43 +2272,32 @@ xfs_bmap_add_extent_unwritten_real( * Setting the first part of a previous oldext extent to newext. * The left neighbor is contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), - LEFT.br_blockcount + new->br_blockcount); - xfs_bmbt_set_startoff(ep, - PREV.br_startoff + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); - - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_startblock(ep, - new->br_startblock + new->br_blockcount); - xfs_bmbt_set_blockcount(ep, - PREV.br_blockcount - new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - --*idx; + LEFT.br_blockcount += new->br_blockcount; + + old = PREV; + PREV.br_startoff += new->br_blockcount; + PREV.br_startblock += new->br_blockcount; + PREV.br_blockcount -= new->br_blockcount; + + xfs_iext_update_extent(ip, state, icur, &PREV); + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &LEFT); if (cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, - PREV.br_startblock, PREV.br_blockcount, - &i))) + error = xfs_bmbt_lookup_eq(cur, &old, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - if ((error = xfs_bmbt_update(cur, - PREV.br_startoff + new->br_blockcount, - PREV.br_startblock + new->br_blockcount, - PREV.br_blockcount - new->br_blockcount, - oldext))) + error = xfs_bmbt_update(cur, &PREV); + if (error) goto done; - if ((error = xfs_btree_decrement(cur, 0, &i))) + error = xfs_btree_decrement(cur, 0, &i); + if (error) goto done; - error = xfs_bmbt_update(cur, LEFT.br_startoff, - LEFT.br_startblock, - LEFT.br_blockcount + new->br_blockcount, - LEFT.br_state); + error = xfs_bmbt_update(cur, &LEFT); if (error) goto done; } @@ -2450,32 +2308,25 @@ xfs_bmap_add_extent_unwritten_real( * Setting the first part of a previous oldext extent to newext. * The left neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); - xfs_bmbt_set_startoff(ep, new_endoff); - xfs_bmbt_set_blockcount(ep, - PREV.br_blockcount - new->br_blockcount); - xfs_bmbt_set_startblock(ep, - new->br_startblock + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - - xfs_iext_insert(ip, *idx, 1, new, state); + old = PREV; + PREV.br_startoff += new->br_blockcount; + PREV.br_startblock += new->br_blockcount; + PREV.br_blockcount -= new->br_blockcount; + + xfs_iext_update_extent(ip, state, icur, &PREV); + xfs_iext_insert(ip, icur, new, state); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, - PREV.br_startblock, PREV.br_blockcount, - &i))) + error = xfs_bmbt_lookup_eq(cur, &old, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - if ((error = xfs_bmbt_update(cur, - PREV.br_startoff + new->br_blockcount, - PREV.br_startblock + new->br_blockcount, - PREV.br_blockcount - new->br_blockcount, - oldext))) + error = xfs_bmbt_update(cur, &PREV); + if (error) goto done; cur->bc_rec.b = *new; if ((error = xfs_btree_insert(cur, &i))) @@ -2489,39 +2340,33 @@ xfs_bmap_add_extent_unwritten_real( * Setting the last part of a previous oldext extent to newext. * The right neighbor is contiguous with the new allocation. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, - PREV.br_blockcount - new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + old = PREV; + PREV.br_blockcount -= new->br_blockcount; - ++*idx; + RIGHT.br_startoff = new->br_startoff; + RIGHT.br_startblock = new->br_startblock; + RIGHT.br_blockcount += new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), - new->br_startoff, new->br_startblock, - new->br_blockcount + RIGHT.br_blockcount, newext); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_update_extent(ip, state, icur, &PREV); + xfs_iext_next(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &RIGHT); if (cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, - PREV.br_startblock, - PREV.br_blockcount, &i))) + error = xfs_bmbt_lookup_eq(cur, &old, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - if ((error = xfs_bmbt_update(cur, PREV.br_startoff, - PREV.br_startblock, - PREV.br_blockcount - new->br_blockcount, - oldext))) + error = xfs_bmbt_update(cur, &PREV); + if (error) goto done; - if ((error = xfs_btree_increment(cur, 0, &i))) + error = xfs_btree_increment(cur, 0, &i); + if (error) goto done; - if ((error = xfs_bmbt_update(cur, new->br_startoff, - new->br_startblock, - new->br_blockcount + RIGHT.br_blockcount, - newext))) + error = xfs_bmbt_update(cur, &RIGHT); + if (error) goto done; } break; @@ -2531,13 +2376,12 @@ xfs_bmap_add_extent_unwritten_real( * Setting the last part of a previous oldext extent to newext. * The right neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, - PREV.br_blockcount - new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + old = PREV; + PREV.br_blockcount -= new->br_blockcount; - ++*idx; - xfs_iext_insert(ip, *idx, 1, new, state); + xfs_iext_update_extent(ip, state, icur, &PREV); + xfs_iext_next(ifp, icur); + xfs_iext_insert(ip, icur, new, state); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); @@ -2545,22 +2389,17 @@ xfs_bmap_add_extent_unwritten_real( rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, - PREV.br_startblock, PREV.br_blockcount, - &i))) + error = xfs_bmbt_lookup_eq(cur, &old, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - if ((error = xfs_bmbt_update(cur, PREV.br_startoff, - PREV.br_startblock, - PREV.br_blockcount - new->br_blockcount, - oldext))) + error = xfs_bmbt_update(cur, &PREV); + if (error) goto done; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, - new->br_startblock, new->br_blockcount, - &i))) + error = xfs_bmbt_lookup_eq(cur, new, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; if ((error = xfs_btree_insert(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); @@ -2573,20 +2412,20 @@ xfs_bmap_add_extent_unwritten_real( * newext. Contiguity is impossible here. * One extent becomes three extents. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, - new->br_startoff - PREV.br_startoff); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + old = PREV; + PREV.br_blockcount = new->br_startoff - PREV.br_startoff; r[0] = *new; r[1].br_startoff = new_endoff; r[1].br_blockcount = - PREV.br_startoff + PREV.br_blockcount - new_endoff; + old.br_startoff + old.br_blockcount - new_endoff; r[1].br_startblock = new->br_startblock + new->br_blockcount; - r[1].br_state = oldext; + r[1].br_state = PREV.br_state; - ++*idx; - xfs_iext_insert(ip, *idx, 2, &r[0], state); + xfs_iext_update_extent(ip, state, icur, &PREV); + xfs_iext_next(ifp, icur); + xfs_iext_insert(ip, icur, &r[1], state); + xfs_iext_insert(ip, icur, &r[0], state); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 2); @@ -2594,20 +2433,16 @@ xfs_bmap_add_extent_unwritten_real( rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, - PREV.br_startblock, PREV.br_blockcount, - &i))) + error = xfs_bmbt_lookup_eq(cur, &old, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); /* new right extent - oldext */ - if ((error = xfs_bmbt_update(cur, r[1].br_startoff, - r[1].br_startblock, r[1].br_blockcount, - r[1].br_state))) + error = xfs_bmbt_update(cur, &r[1]); + if (error) goto done; /* new left extent - oldext */ cur->bc_rec.b = PREV; - cur->bc_rec.b.br_blockcount = - new->br_startoff - PREV.br_startoff; if ((error = xfs_btree_insert(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); @@ -2616,13 +2451,11 @@ xfs_bmap_add_extent_unwritten_real( * we are about to insert as we can't trust it after * the previous insert. */ - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, - new->br_startblock, new->br_blockcount, - &i))) + error = xfs_bmbt_lookup_eq(cur, new, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); /* new middle extent - newext */ - cur->bc_rec.b.br_state = new->br_state; if ((error = xfs_btree_insert(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); @@ -2681,7 +2514,7 @@ STATIC void xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ int whichfork, - xfs_extnum_t *idx, /* extent number to update/insert */ + struct xfs_iext_cursor *icur, xfs_bmbt_irec_t *new) /* new data to add to file extents */ { xfs_ifork_t *ifp; /* inode fork pointer */ @@ -2689,22 +2522,17 @@ xfs_bmap_add_extent_hole_delay( xfs_filblks_t newlen=0; /* new indirect size */ xfs_filblks_t oldlen=0; /* old indirect size */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ - int state; /* state bits, accessed thru macros */ - xfs_filblks_t temp=0; /* temp for indirect calculations */ + int state = xfs_bmap_fork_to_state(whichfork); + xfs_filblks_t temp; /* temp for indirect calculations */ ifp = XFS_IFORK_PTR(ip, whichfork); - state = 0; - if (whichfork == XFS_COW_FORK) - state |= BMAP_COWFORK; ASSERT(isnullstartblock(new->br_startblock)); /* * Check and set flags if this segment has a left neighbor */ - if (*idx > 0) { + if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); - if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; } @@ -2713,10 +2541,8 @@ xfs_bmap_add_extent_hole_delay( * Check and set flags if the current (right) segment exists. * If it doesn't exist, we're converting the hole at end-of-file. */ - if (*idx < xfs_iext_count(ifp)) { + if (xfs_iext_get_extent(ifp, icur, &right)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); - if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -2748,22 +2574,20 @@ xfs_bmap_add_extent_hole_delay( * on the left and on the right. * Merge all three into a single extent record. */ - --*idx; temp = left.br_blockcount + new->br_blockcount + right.br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock) + startblockval(right.br_startblock); newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), oldlen); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), - nullstartblock((int)newlen)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + left.br_startblock = nullstartblock(newlen); + left.br_blockcount = temp; - xfs_iext_remove(ip, *idx + 1, 1, state); + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &left); break; case BMAP_LEFT_CONTIG: @@ -2772,18 +2596,17 @@ xfs_bmap_add_extent_hole_delay( * on the left. * Merge the new allocation with the left neighbor. */ - --*idx; temp = left.br_blockcount + new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock); newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), oldlen); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), - nullstartblock((int)newlen)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + left.br_blockcount = temp; + left.br_startblock = nullstartblock(newlen); + + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &left); break; case BMAP_RIGHT_CONTIG: @@ -2792,16 +2615,15 @@ xfs_bmap_add_extent_hole_delay( * on the right. * Merge the new allocation with the right neighbor. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); temp = new->br_blockcount + right.br_blockcount; oldlen = startblockval(new->br_startblock) + startblockval(right.br_startblock); newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), oldlen); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), - new->br_startoff, - nullstartblock((int)newlen), temp, right.br_state); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + right.br_startoff = new->br_startoff; + right.br_startblock = nullstartblock(newlen); + right.br_blockcount = temp; + xfs_iext_update_extent(ip, state, icur, &right); break; case 0: @@ -2811,7 +2633,7 @@ xfs_bmap_add_extent_hole_delay( * Insert a new entry. */ oldlen = newlen = 0; - xfs_iext_insert(ip, *idx, 1, new, state); + xfs_iext_insert(ip, icur, new, state); break; } if (oldlen != newlen) { @@ -2832,7 +2654,7 @@ xfs_bmap_add_extent_hole_real( struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, - xfs_extnum_t *idx, + struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, struct xfs_bmbt_irec *new, xfs_fsblock_t *first, @@ -2847,27 +2669,19 @@ xfs_bmap_add_extent_hole_real( xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ - int state; /* state bits, accessed thru macros */ + int state = xfs_bmap_fork_to_state(whichfork); + struct xfs_bmbt_irec old; - ASSERT(*idx >= 0); - ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); - state = 0; - if (whichfork == XFS_ATTR_FORK) - state |= BMAP_ATTRFORK; - if (whichfork == XFS_COW_FORK) - state |= BMAP_COWFORK; - /* * Check and set flags if this segment has a left neighbor. */ - if (*idx > 0) { + if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; } @@ -2876,9 +2690,8 @@ xfs_bmap_add_extent_hole_real( * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. */ - if (*idx < xfs_iext_count(ifp)) { + if (xfs_iext_get_extent(ifp, icur, &right)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -2915,14 +2728,11 @@ xfs_bmap_add_extent_hole_real( * left and on the right. * Merge all three into a single extent record. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), - left.br_blockcount + new->br_blockcount + - right.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + left.br_blockcount += new->br_blockcount + right.br_blockcount; - xfs_iext_remove(ip, *idx + 1, 1, state); + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &left); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); @@ -2930,9 +2740,7 @@ xfs_bmap_add_extent_hole_real( rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(cur, right.br_startoff, - right.br_startblock, right.br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(cur, &right, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); @@ -2944,12 +2752,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(cur, left.br_startoff, - left.br_startblock, - left.br_blockcount + - new->br_blockcount + - right.br_blockcount, - left.br_state); + error = xfs_bmbt_update(cur, &left); if (error) goto done; } @@ -2961,27 +2764,21 @@ xfs_bmap_add_extent_hole_real( * on the left. * Merge the new allocation with the left neighbor. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), - left.br_blockcount + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + old = left; + left.br_blockcount += new->br_blockcount; + + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &left); if (cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - error = xfs_bmbt_lookup_eq(cur, left.br_startoff, - left.br_startblock, left.br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(cur, left.br_startoff, - left.br_startblock, - left.br_blockcount + - new->br_blockcount, - left.br_state); + error = xfs_bmbt_update(cur, &left); if (error) goto done; } @@ -2993,29 +2790,22 @@ xfs_bmap_add_extent_hole_real( * on the right. * Merge the new allocation with the right neighbor. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), - new->br_startoff, new->br_startblock, - new->br_blockcount + right.br_blockcount, - right.br_state); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + old = right; + + right.br_startoff = new->br_startoff; + right.br_startblock = new->br_startblock; + right.br_blockcount += new->br_blockcount; + xfs_iext_update_extent(ip, state, icur, &right); if (cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - error = xfs_bmbt_lookup_eq(cur, - right.br_startoff, - right.br_startblock, - right.br_blockcount, &i); + error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(cur, new->br_startoff, - new->br_startblock, - new->br_blockcount + - right.br_blockcount, - right.br_state); + error = xfs_bmbt_update(cur, &right); if (error) goto done; } @@ -3027,21 +2817,17 @@ xfs_bmap_add_extent_hole_real( * real allocation. * Insert a new entry. */ - xfs_iext_insert(ip, *idx, 1, new, state); + xfs_iext_insert(ip, icur, new, state); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(cur, - new->br_startoff, - new->br_startblock, - new->br_blockcount, &i); + error = xfs_bmbt_lookup_eq(cur, new, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); - cur->bc_rec.b.br_state = new->br_state; error = xfs_btree_insert(cur, &i); if (error) goto done; @@ -3852,6 +3638,17 @@ xfs_trim_extent( } } +/* trim extent to within eof */ +void +xfs_trim_extent_eof( + struct xfs_bmbt_irec *irec, + struct xfs_inode *ip) + +{ + xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount, + i_size_read(VFS_I(ip)))); +} + /* * Trim the returned map to the required bounds */ @@ -3970,7 +3767,7 @@ xfs_bmapi_read( struct xfs_bmbt_irec got; xfs_fileoff_t obno; xfs_fileoff_t end; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; int error; bool eof = false; int n = 0; @@ -4012,7 +3809,7 @@ xfs_bmapi_read( return error; } - if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got)) + if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) eof = true; end = bno + len; obno = bno; @@ -4044,7 +3841,7 @@ xfs_bmapi_read( break; /* Else go on to the next record. */ - if (!xfs_iext_get_extent(ifp, ++idx, &got)) + if (!xfs_iext_next_extent(ifp, &icur, &got)) eof = true; } *nmap = n; @@ -4072,7 +3869,7 @@ xfs_bmapi_reserve_delalloc( xfs_filblks_t len, xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, - xfs_extnum_t *lastx, + struct xfs_iext_cursor *icur, int eof) { struct xfs_mount *mp = ip->i_mount; @@ -4102,7 +3899,7 @@ xfs_bmapi_reserve_delalloc( if (extsz) { struct xfs_bmbt_irec prev; - if (!xfs_iext_get_extent(ifp, *lastx - 1, &prev)) + if (!xfs_iext_peek_prev_extent(ifp, icur, &prev)) prev.br_startoff = NULLFILEOFF; error = xfs_bmap_extsize_align(mp, got, &prev, extsz, rt, eof, @@ -4151,7 +3948,7 @@ xfs_bmapi_reserve_delalloc( got->br_blockcount = alen; got->br_state = XFS_EXT_NORM; - xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got); + xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got); /* * Tag the inode if blocks were preallocated. Note that COW fork @@ -4196,10 +3993,7 @@ xfs_bmapi_allocate( if (bma->wasdel) { bma->length = (xfs_extlen_t)bma->got.br_blockcount; bma->offset = bma->got.br_startoff; - if (bma->idx) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), - &bma->prev); - } + xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev); } else { bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); if (!bma->eof) @@ -4284,7 +4078,7 @@ xfs_bmapi_allocate( error = xfs_bmap_add_extent_delay_real(bma, whichfork); else error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, - whichfork, &bma->idx, &bma->cur, &bma->got, + whichfork, &bma->icur, &bma->cur, &bma->got, bma->firstblock, bma->dfops, &bma->logflags); bma->logflags |= tmp_logflags; @@ -4296,7 +4090,7 @@ xfs_bmapi_allocate( * or xfs_bmap_add_extent_hole_real might have merged it into one of * the neighbouring ones. */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); + xfs_iext_get_extent(ifp, &bma->icur, &bma->got); ASSERT(bma->got.br_startoff <= bma->offset); ASSERT(bma->got.br_startoff + bma->got.br_blockcount >= @@ -4354,8 +4148,8 @@ xfs_bmapi_convert_unwritten( } error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, whichfork, - &bma->idx, &bma->cur, mval, bma->firstblock, bma->dfops, - &tmp_logflags); + &bma->icur, &bma->cur, mval, bma->firstblock, + bma->dfops, &tmp_logflags); /* * Log the inode core unconditionally in the unwritten extent conversion * path because the conversion might not have done so (e.g., if the @@ -4377,7 +4171,7 @@ xfs_bmapi_convert_unwritten( * xfs_bmap_add_extent_unwritten_real might have merged it into one * of the neighbouring ones. */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); + xfs_iext_get_extent(ifp, &bma->icur, &bma->got); /* * We may have combined previously unwritten space with written space, @@ -4496,9 +4290,9 @@ xfs_bmapi_write( end = bno + len; obno = bno; - if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.idx, &bma.got)) + if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got)) eof = true; - if (!xfs_iext_get_extent(ifp, bma.idx - 1, &bma.prev)) + if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) bma.prev.br_startoff = NULLFILEOFF; bma.tp = tp; bma.ip = ip; @@ -4540,7 +4334,8 @@ xfs_bmapi_write( * First, deal with the hole before the allocated space * that we found, if any. */ - if (need_alloc || wasdelay) { + if ((need_alloc || wasdelay) && + !(flags & XFS_BMAPI_CONVERT_ONLY)) { bma.eof = eof; bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; @@ -4603,7 +4398,7 @@ xfs_bmapi_write( /* Else go on to the next record. */ bma.prev = bma.got; - if (!xfs_iext_get_extent(ifp, ++bma.idx, &bma.got)) + if (!xfs_iext_next_extent(ifp, &bma.icur, &bma.got)) eof = true; } *nmap = n; @@ -4676,7 +4471,7 @@ xfs_bmapi_remap( struct xfs_btree_cur *cur = NULL; xfs_fsblock_t firstblock = NULLFSBLOCK; struct xfs_bmbt_irec got; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; int logflags = 0, error; ASSERT(len > 0); @@ -4700,7 +4495,7 @@ xfs_bmapi_remap( return error; } - if (xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got)) { + if (xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) { /* make sure we only reflink into a hole. */ ASSERT(got.br_startoff > bno); ASSERT(got.br_startoff - bno >= len); @@ -4721,8 +4516,8 @@ xfs_bmapi_remap( got.br_blockcount = len; got.br_state = XFS_EXT_NORM; - error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &idx, &cur, - &got, &firstblock, dfops, &logflags); + error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &icur, + &cur, &got, &firstblock, dfops, &logflags); if (error) goto error0; @@ -4838,7 +4633,7 @@ int xfs_bmap_del_extent_delay( struct xfs_inode *ip, int whichfork, - xfs_extnum_t *idx, + struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del) { @@ -4848,7 +4643,8 @@ xfs_bmap_del_extent_delay( int64_t da_old, da_new, da_diff = 0; xfs_fileoff_t del_endoff, got_endoff; xfs_filblks_t got_indlen, new_indlen, stolen; - int error = 0, state = 0; + int state = xfs_bmap_fork_to_state(whichfork); + int error = 0; bool isrt; XFS_STATS_INC(mp, xs_del_exlist); @@ -4859,8 +4655,6 @@ xfs_bmap_del_extent_delay( da_old = startblockval(got->br_startblock); da_new = 0; - ASSERT(*idx >= 0); - ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(del->br_blockcount > 0); ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); @@ -4884,46 +4678,39 @@ xfs_bmap_del_extent_delay( return error; ip->i_delayed_blks -= del->br_blockcount; - if (whichfork == XFS_COW_FORK) - state |= BMAP_COWFORK; - if (got->br_startoff == del->br_startoff) - state |= BMAP_LEFT_CONTIG; + state |= BMAP_LEFT_FILLING; if (got_endoff == del_endoff) - state |= BMAP_RIGHT_CONTIG; + state |= BMAP_RIGHT_FILLING; - switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { - case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) { + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: /* * Matches the whole extent. Delete the entry. */ - xfs_iext_remove(ip, *idx, 1, state); - --*idx; + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); break; - case BMAP_LEFT_CONTIG: + case BMAP_LEFT_FILLING: /* * Deleting the first part of the extent. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); got->br_startoff = del_endoff; got->br_blockcount -= del->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, got->br_blockcount), da_old); got->br_startblock = nullstartblock((int)da_new); - xfs_iext_update_extent(ifp, *idx, got); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_update_extent(ip, state, icur, got); break; - case BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING: /* * Deleting the last part of the extent. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); got->br_blockcount = got->br_blockcount - del->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, got->br_blockcount), da_old); got->br_startblock = nullstartblock((int)da_new); - xfs_iext_update_extent(ifp, *idx, got); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_update_extent(ip, state, icur, got); break; case 0: /* @@ -4935,8 +4722,6 @@ xfs_bmap_del_extent_delay( * Warn if either of the new indlen reservations is zero as this * can lead to delalloc problems. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - got->br_blockcount = del->br_startoff - got->br_startoff; got_indlen = xfs_bmap_worst_indlen(ip, got->br_blockcount); @@ -4948,15 +4733,14 @@ xfs_bmap_del_extent_delay( del->br_blockcount); got->br_startblock = nullstartblock((int)got_indlen); - xfs_iext_update_extent(ifp, *idx, got); - trace_xfs_bmap_post_update(ip, *idx, 0, _THIS_IP_); new.br_startoff = del_endoff; new.br_state = got->br_state; new.br_startblock = nullstartblock((int)new_indlen); - ++*idx; - xfs_iext_insert(ip, *idx, 1, &new, state); + xfs_iext_update_extent(ip, state, icur, got); + xfs_iext_next(ifp, icur); + xfs_iext_insert(ip, icur, &new, state); da_new = got_indlen + new_indlen - stolen; del->br_blockcount -= stolen; @@ -4975,7 +4759,7 @@ xfs_bmap_del_extent_delay( void xfs_bmap_del_extent_cow( struct xfs_inode *ip, - xfs_extnum_t *idx, + struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del) { @@ -4990,75 +4774,67 @@ xfs_bmap_del_extent_cow( del_endoff = del->br_startoff + del->br_blockcount; got_endoff = got->br_startoff + got->br_blockcount; - ASSERT(*idx >= 0); - ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(del->br_blockcount > 0); ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); ASSERT(!isnullstartblock(got->br_startblock)); if (got->br_startoff == del->br_startoff) - state |= BMAP_LEFT_CONTIG; + state |= BMAP_LEFT_FILLING; if (got_endoff == del_endoff) - state |= BMAP_RIGHT_CONTIG; + state |= BMAP_RIGHT_FILLING; - switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { - case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) { + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: /* * Matches the whole extent. Delete the entry. */ - xfs_iext_remove(ip, *idx, 1, state); - --*idx; + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); break; - case BMAP_LEFT_CONTIG: + case BMAP_LEFT_FILLING: /* * Deleting the first part of the extent. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); got->br_startoff = del_endoff; got->br_blockcount -= del->br_blockcount; got->br_startblock = del->br_startblock + del->br_blockcount; - xfs_iext_update_extent(ifp, *idx, got); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_update_extent(ip, state, icur, got); break; - case BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING: /* * Deleting the last part of the extent. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); got->br_blockcount -= del->br_blockcount; - xfs_iext_update_extent(ifp, *idx, got); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_update_extent(ip, state, icur, got); break; case 0: /* * Deleting the middle of the extent. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); got->br_blockcount = del->br_startoff - got->br_startoff; - xfs_iext_update_extent(ifp, *idx, got); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); new.br_startoff = del_endoff; new.br_blockcount = got_endoff - del_endoff; new.br_state = got->br_state; new.br_startblock = del->br_startblock + del->br_blockcount; - ++*idx; - xfs_iext_insert(ip, *idx, 1, &new, state); + xfs_iext_update_extent(ip, state, icur, got); + xfs_iext_next(ifp, icur); + xfs_iext_insert(ip, icur, &new, state); break; } } /* * Called by xfs_bmapi to update file extent records and the btree - * after removing space (or undoing a delayed allocation). + * after removing space. */ STATIC int /* error */ -xfs_bmap_del_extent( +xfs_bmap_del_extent_real( xfs_inode_t *ip, /* incore inode pointer */ xfs_trans_t *tp, /* current transaction pointer */ - xfs_extnum_t *idx, /* extent number to update/delete */ + struct xfs_iext_cursor *icur, struct xfs_defer_ops *dfops, /* list of extents to be freed */ xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *del, /* data to remove from extents */ @@ -5066,16 +4842,12 @@ xfs_bmap_del_extent( int whichfork, /* data or attr fork */ int bflags) /* bmapi flags */ { - xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ - xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ xfs_fsblock_t del_endblock=0; /* first block past del */ xfs_fileoff_t del_endoff; /* first offset past del */ - int delay; /* current block is delayed allocated */ int do_fx; /* free extent at end of routine */ - xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */ int error; /* error return value */ - int flags; /* inode logging flags */ - xfs_bmbt_irec_t got; /* current extent entry */ + int flags = 0;/* inode logging flags */ + struct xfs_bmbt_irec got; /* current extent entry */ xfs_fileoff_t got_endoff; /* first offset past got */ int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ @@ -5084,103 +4856,81 @@ xfs_bmap_del_extent( xfs_bmbt_irec_t new; /* new record to be inserted */ /* REFERENCED */ uint qfield; /* quota field to update */ - xfs_filblks_t temp; /* for indirect length calculations */ - xfs_filblks_t temp2; /* for indirect length calculations */ - int state = 0; + int state = xfs_bmap_fork_to_state(whichfork); + struct xfs_bmbt_irec old; mp = ip->i_mount; XFS_STATS_INC(mp, xs_del_exlist); - if (whichfork == XFS_ATTR_FORK) - state |= BMAP_ATTRFORK; - else if (whichfork == XFS_COW_FORK) - state |= BMAP_COWFORK; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT((*idx >= 0) && (*idx < xfs_iext_count(ifp))); ASSERT(del->br_blockcount > 0); - ep = xfs_iext_get_ext(ifp, *idx); - xfs_bmbt_get_all(ep, &got); + xfs_iext_get_extent(ifp, icur, &got); ASSERT(got.br_startoff <= del->br_startoff); del_endoff = del->br_startoff + del->br_blockcount; got_endoff = got.br_startoff + got.br_blockcount; ASSERT(got_endoff >= del_endoff); - delay = isnullstartblock(got.br_startblock); - ASSERT(isnullstartblock(del->br_startblock) == delay); - flags = 0; + ASSERT(!isnullstartblock(got.br_startblock)); qfield = 0; error = 0; + /* - * If deleting a real allocation, must free up the disk space. + * If it's the case where the directory code is running with no block + * reservation, and the deleted block is in the middle of its extent, + * and the resulting insert of an extent would cause transformation to + * btree format, then reject it. The calling code will then swap blocks + * around instead. We have to do this now, rather than waiting for the + * conversion to btree format, since the transaction will be dirty then. */ - if (!delay) { - flags = XFS_ILOG_CORE; - /* - * Realtime allocation. Free it and record di_nblocks update. - */ - if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { - xfs_fsblock_t bno; - xfs_filblks_t len; - - ASSERT(do_mod(del->br_blockcount, - mp->m_sb.sb_rextsize) == 0); - ASSERT(do_mod(del->br_startblock, - mp->m_sb.sb_rextsize) == 0); - bno = del->br_startblock; - len = del->br_blockcount; - do_div(bno, mp->m_sb.sb_rextsize); - do_div(len, mp->m_sb.sb_rextsize); - error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); - if (error) - goto done; - do_fx = 0; - nblks = len * mp->m_sb.sb_rextsize; - qfield = XFS_TRANS_DQ_RTBCOUNT; - } - /* - * Ordinary allocation. - */ - else { - do_fx = 1; - nblks = del->br_blockcount; - qfield = XFS_TRANS_DQ_BCOUNT; - } - /* - * Set up del_endblock and cur for later. - */ - del_endblock = del->br_startblock + del->br_blockcount; - if (cur) { - if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff, - got.br_startblock, got.br_blockcount, - &i))) - goto done; - XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - } - da_old = da_new = 0; - } else { - da_old = startblockval(got.br_startblock); - da_new = 0; - nblks = 0; + if (tp->t_blk_res == 0 && + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) >= + XFS_IFORK_MAXEXT(ip, whichfork) && + del->br_startoff > got.br_startoff && del_endoff < got_endoff) + return -ENOSPC; + + flags = XFS_ILOG_CORE; + if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { + xfs_fsblock_t bno; + xfs_filblks_t len; + + ASSERT(do_mod(del->br_blockcount, mp->m_sb.sb_rextsize) == 0); + ASSERT(do_mod(del->br_startblock, mp->m_sb.sb_rextsize) == 0); + bno = del->br_startblock; + len = del->br_blockcount; + do_div(bno, mp->m_sb.sb_rextsize); + do_div(len, mp->m_sb.sb_rextsize); + error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); + if (error) + goto done; do_fx = 0; + nblks = len * mp->m_sb.sb_rextsize; + qfield = XFS_TRANS_DQ_RTBCOUNT; + } else { + do_fx = 1; + nblks = del->br_blockcount; + qfield = XFS_TRANS_DQ_BCOUNT; } - /* - * Set flag value to use in switch statement. - * Left-contig is 2, right-contig is 1. - */ - switch (((got.br_startoff == del->br_startoff) << 1) | - (got_endoff == del_endoff)) { - case 3: + del_endblock = del->br_startblock + del->br_blockcount; + if (cur) { + error = xfs_bmbt_lookup_eq(cur, &got, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + + if (got.br_startoff == del->br_startoff) + state |= BMAP_LEFT_FILLING; + if (got_endoff == del_endoff) + state |= BMAP_RIGHT_FILLING; + + switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) { + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: /* * Matches the whole extent. Delete the entry. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx, 1, - whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); - --*idx; - if (delay) - break; - + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); flags |= XFS_ILOG_CORE; @@ -5192,168 +4942,106 @@ xfs_bmap_del_extent( goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); break; - - case 2: + case BMAP_LEFT_FILLING: /* * Deleting the first part of the extent. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_startoff(ep, del_endoff); - temp = got.br_blockcount - del->br_blockcount; - xfs_bmbt_set_blockcount(ep, temp); - if (delay) { - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - da_old); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - da_new = temp; - break; - } - xfs_bmbt_set_startblock(ep, del_endblock); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + got.br_startoff = del_endoff; + got.br_startblock = del_endblock; + got.br_blockcount -= del->br_blockcount; + xfs_iext_update_extent(ip, state, icur, &got); if (!cur) { flags |= xfs_ilog_fext(whichfork); break; } - if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, - got.br_blockcount - del->br_blockcount, - got.br_state))) + error = xfs_bmbt_update(cur, &got); + if (error) goto done; break; - - case 1: + case BMAP_RIGHT_FILLING: /* * Deleting the last part of the extent. */ - temp = got.br_blockcount - del->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - if (delay) { - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - da_old); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - da_new = temp; - break; - } - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + got.br_blockcount -= del->br_blockcount; + xfs_iext_update_extent(ip, state, icur, &got); if (!cur) { flags |= xfs_ilog_fext(whichfork); break; } - if ((error = xfs_bmbt_update(cur, got.br_startoff, - got.br_startblock, - got.br_blockcount - del->br_blockcount, - got.br_state))) + error = xfs_bmbt_update(cur, &got); + if (error) goto done; break; - case 0: /* * Deleting the middle of the extent. */ - temp = del->br_startoff - got.br_startoff; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); + old = got; + + got.br_blockcount = del->br_startoff - got.br_startoff; + xfs_iext_update_extent(ip, state, icur, &got); + new.br_startoff = del_endoff; - temp2 = got_endoff - del_endoff; - new.br_blockcount = temp2; + new.br_blockcount = got_endoff - del_endoff; new.br_state = got.br_state; - if (!delay) { - new.br_startblock = del_endblock; - flags |= XFS_ILOG_CORE; - if (cur) { - if ((error = xfs_bmbt_update(cur, - got.br_startoff, - got.br_startblock, temp, - got.br_state))) - goto done; - if ((error = xfs_btree_increment(cur, 0, &i))) - goto done; - cur->bc_rec.b = new; - error = xfs_btree_insert(cur, &i); - if (error && error != -ENOSPC) - goto done; + new.br_startblock = del_endblock; + + flags |= XFS_ILOG_CORE; + if (cur) { + error = xfs_bmbt_update(cur, &got); + if (error) + goto done; + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto done; + cur->bc_rec.b = new; + error = xfs_btree_insert(cur, &i); + if (error && error != -ENOSPC) + goto done; + /* + * If get no-space back from btree insert, it tried a + * split, and we have a zero block reservation. Fix up + * our state and return the error. + */ + if (error == -ENOSPC) { /* - * If get no-space back from btree insert, - * it tried a split, and we have a zero - * block reservation. - * Fix up our state and return the error. + * Reset the cursor, don't trust it after any + * insert operation. */ - if (error == -ENOSPC) { - /* - * Reset the cursor, don't trust - * it after any insert operation. - */ - if ((error = xfs_bmbt_lookup_eq(cur, - got.br_startoff, - got.br_startblock, - temp, &i))) - goto done; - XFS_WANT_CORRUPTED_GOTO(mp, - i == 1, done); - /* - * Update the btree record back - * to the original value. - */ - if ((error = xfs_bmbt_update(cur, - got.br_startoff, - got.br_startblock, - got.br_blockcount, - got.br_state))) - goto done; - /* - * Reset the extent record back - * to the original value. - */ - xfs_bmbt_set_blockcount(ep, - got.br_blockcount); - flags = 0; - error = -ENOSPC; + error = xfs_bmbt_lookup_eq(cur, &got, &i); + if (error) goto done; - } XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - } else - flags |= xfs_ilog_fext(whichfork); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); - } else { - xfs_filblks_t stolen; - ASSERT(whichfork == XFS_DATA_FORK); - - /* - * Distribute the original indlen reservation across the - * two new extents. Steal blocks from the deleted extent - * if necessary. Stealing blocks simply fudges the - * fdblocks accounting in xfs_bunmapi(). - */ - temp = xfs_bmap_worst_indlen(ip, got.br_blockcount); - temp2 = xfs_bmap_worst_indlen(ip, new.br_blockcount); - stolen = xfs_bmap_split_indlen(da_old, &temp, &temp2, - del->br_blockcount); - da_new = temp + temp2 - stolen; - del->br_blockcount -= stolen; - - /* - * Set the reservation for each extent. Warn if either - * is zero as this can lead to delalloc problems. - */ - WARN_ON_ONCE(!temp || !temp2); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - new.br_startblock = nullstartblock((int)temp2); - } - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - xfs_iext_insert(ip, *idx + 1, 1, &new, state); - ++*idx; + /* + * Update the btree record back + * to the original value. + */ + error = xfs_bmbt_update(cur, &old); + if (error) + goto done; + /* + * Reset the extent record back + * to the original value. + */ + xfs_iext_update_extent(ip, state, icur, &old); + flags = 0; + error = -ENOSPC; + goto done; + } + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } else + flags |= xfs_ilog_fext(whichfork); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + xfs_iext_next(ifp, icur); + xfs_iext_insert(ip, icur, &new, state); break; } /* remove reverse mapping */ - if (!delay) { - error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, del); - if (error) - goto done; - } + error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, del); + if (error) + goto done; /* * If we need to, add to list of extents to delete. @@ -5379,13 +5067,6 @@ xfs_bmap_del_extent( if (qfield && !(bflags & XFS_BMAPI_REMAP)) xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); - /* - * Account for change in delayed indirect blocks. - * Nothing to do for disk quota accounting here. - */ - ASSERT(da_old >= da_new); - if (da_old > da_new) - xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false); done: *logflagsp = flags; return error; @@ -5401,7 +5082,7 @@ int /* error */ __xfs_bunmapi( xfs_trans_t *tp, /* transaction pointer */ struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting offset to unmap */ + xfs_fileoff_t start, /* first file offset deleted */ xfs_filblks_t *rlen, /* i/o: amount remaining */ int flags, /* misc flags */ xfs_extnum_t nexts, /* number of extents max */ @@ -5416,11 +5097,9 @@ __xfs_bunmapi( xfs_bmbt_irec_t got; /* current extent record */ xfs_ifork_t *ifp; /* inode fork pointer */ int isrt; /* freeing in rt area */ - xfs_extnum_t lastx; /* last extent index used */ int logflags; /* transaction logging flags */ xfs_extlen_t mod; /* rt extent offset */ xfs_mount_t *mp; /* mount structure */ - xfs_fileoff_t start; /* first file offset deleted */ int tmp_logflags; /* partial logging flags */ int wasdel; /* was a delayed alloc extent */ int whichfork; /* data or attribute fork */ @@ -5428,8 +5107,11 @@ __xfs_bunmapi( xfs_filblks_t len = *rlen; /* length to unmap in file */ xfs_fileoff_t max_len; xfs_agnumber_t prev_agno = NULLAGNUMBER, agno; + xfs_fileoff_t end; + struct xfs_iext_cursor icur; + bool done = false; - trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); + trace_xfs_bunmap(ip, start, len, flags, _RET_IP_); whichfork = xfs_bmapi_whichfork(flags); ASSERT(whichfork != XFS_COW_FORK); @@ -5468,18 +5150,13 @@ __xfs_bunmapi( } XFS_STATS_INC(mp, xs_blk_unmap); isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); - start = bno; - bno = start + len - 1; + end = start + len; - /* - * Check to see if the given block number is past the end of the - * file, back up to the last block if so... - */ - if (!xfs_iext_lookup_extent(ip, ifp, bno, &lastx, &got)) { - ASSERT(lastx > 0); - xfs_iext_get_extent(ifp, --lastx, &got); - bno = got.br_startoff + got.br_blockcount - 1; + if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) { + *rlen = 0; + return 0; } + end--; logflags = 0; if (ifp->if_flags & XFS_IFBROOT) { @@ -5502,24 +5179,24 @@ __xfs_bunmapi( } extno = 0; - while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && + while (end != (xfs_fileoff_t)-1 && end >= start && (nexts == 0 || extno < nexts) && max_len > 0) { /* - * Is the found extent after a hole in which bno lives? + * Is the found extent after a hole in which end lives? * Just back up to the previous extent, if so. */ - if (got.br_startoff > bno) { - if (--lastx < 0) - break; - xfs_iext_get_extent(ifp, lastx, &got); + if (got.br_startoff > end && + !xfs_iext_prev_extent(ifp, &icur, &got)) { + done = true; + break; } /* * Is the last block of this extent before the range * we're supposed to delete? If so, we're done. */ - bno = XFS_FILEOFF_MIN(bno, + end = XFS_FILEOFF_MIN(end, got.br_startoff + got.br_blockcount - 1); - if (bno < start) + if (end < start) break; /* * Then deal with the (possibly delayed) allocated space @@ -5544,8 +5221,8 @@ __xfs_bunmapi( if (!wasdel) del.br_startblock += start - got.br_startoff; } - if (del.br_startoff + del.br_blockcount > bno + 1) - del.br_blockcount = bno + 1 - del.br_startoff; + if (del.br_startoff + del.br_blockcount > end + 1) + del.br_blockcount = end + 1 - del.br_startoff; /* How much can we safely unmap? */ if (max_len < del.br_blockcount) { @@ -5571,13 +5248,13 @@ __xfs_bunmapi( * This piece is unwritten, or we're not * using unwritten extents. Skip over it. */ - ASSERT(bno >= mod); - bno -= mod > del.br_blockcount ? + ASSERT(end >= mod); + end -= mod > del.br_blockcount ? del.br_blockcount : mod; - if (bno < got.br_startoff) { - if (--lastx >= 0) - xfs_bmbt_get_all(xfs_iext_get_ext( - ifp, lastx), &got); + if (end < got.br_startoff && + !xfs_iext_prev_extent(ifp, &icur, &got)) { + done = true; + break; } continue; } @@ -5598,7 +5275,7 @@ __xfs_bunmapi( } del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, ip, - whichfork, &lastx, &cur, &del, + whichfork, &icur, &cur, &del, firstblock, dfops, &logflags); if (error) goto error0; @@ -5623,10 +5300,13 @@ __xfs_bunmapi( * Can't make it unwritten. There isn't * a full extent here so just skip it. */ - ASSERT(bno >= del.br_blockcount); - bno -= del.br_blockcount; - if (got.br_startoff > bno && --lastx >= 0) - xfs_iext_get_extent(ifp, lastx, &got); + ASSERT(end >= del.br_blockcount); + end -= del.br_blockcount; + if (got.br_startoff > end && + !xfs_iext_prev_extent(ifp, &icur, &got)) { + done = true; + break; + } continue; } else if (del.br_state == XFS_EXT_UNWRITTEN) { struct xfs_bmbt_irec prev; @@ -5637,8 +5317,8 @@ __xfs_bunmapi( * Unwrite the killed part of that one and * try again. */ - ASSERT(lastx > 0); - xfs_iext_get_extent(ifp, lastx - 1, &prev); + if (!xfs_iext_prev_extent(ifp, &icur, &prev)) + ASSERT(0); ASSERT(prev.br_state == XFS_EXT_NORM); ASSERT(!isnullstartblock(prev.br_startblock)); ASSERT(del.br_startblock == @@ -5650,9 +5330,8 @@ __xfs_bunmapi( prev.br_startoff = start; } prev.br_state = XFS_EXT_UNWRITTEN; - lastx--; error = xfs_bmap_add_extent_unwritten_real(tp, - ip, whichfork, &lastx, &cur, + ip, whichfork, &icur, &cur, &prev, firstblock, dfops, &logflags); if (error) @@ -5662,7 +5341,7 @@ __xfs_bunmapi( ASSERT(del.br_state == XFS_EXT_NORM); del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, - ip, whichfork, &lastx, &cur, + ip, whichfork, &icur, &cur, &del, firstblock, dfops, &logflags); if (error) @@ -5671,85 +5350,39 @@ __xfs_bunmapi( } } - /* - * If it's the case where the directory code is running - * with no block reservation, and the deleted block is in - * the middle of its extent, and the resulting insert - * of an extent would cause transformation to btree format, - * then reject it. The calling code will then swap - * blocks around instead. - * We have to do this now, rather than waiting for the - * conversion to btree format, since the transaction - * will be dirty. - */ - if (!wasdel && tp->t_blk_res == 0 && - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */ - XFS_IFORK_MAXEXT(ip, whichfork) && - del.br_startoff > got.br_startoff && - del.br_startoff + del.br_blockcount < - got.br_startoff + got.br_blockcount) { - error = -ENOSPC; - goto error0; + if (wasdel) { + error = xfs_bmap_del_extent_delay(ip, whichfork, &icur, + &got, &del); + } else { + error = xfs_bmap_del_extent_real(ip, tp, &icur, dfops, + cur, &del, &tmp_logflags, whichfork, + flags); + logflags |= tmp_logflags; } - /* - * Unreserve quota and update realtime free space, if - * appropriate. If delayed allocation, update the inode delalloc - * counter now and wait to update the sb counters as - * xfs_bmap_del_extent() might need to borrow some blocks. - */ - if (wasdel) { - ASSERT(startblockval(del.br_startblock) > 0); - if (isrt) { - xfs_filblks_t rtexts; - - rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); - do_div(rtexts, mp->m_sb.sb_rextsize); - xfs_mod_frextents(mp, (int64_t)rtexts); - (void)xfs_trans_reserve_quota_nblks(NULL, - ip, -((long)del.br_blockcount), 0, - XFS_QMOPT_RES_RTBLKS); - } else { - (void)xfs_trans_reserve_quota_nblks(NULL, - ip, -((long)del.br_blockcount), 0, - XFS_QMOPT_RES_REGBLKS); - } - ip->i_delayed_blks -= del.br_blockcount; - if (cur) - cur->bc_private.b.flags |= - XFS_BTCUR_BPRV_WASDEL; - } else if (cur) - cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; - - error = xfs_bmap_del_extent(ip, tp, &lastx, dfops, cur, &del, - &tmp_logflags, whichfork, flags); - logflags |= tmp_logflags; if (error) goto error0; - if (!isrt && wasdel) - xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false); - max_len -= del.br_blockcount; - bno = del.br_startoff - 1; + end = del.br_startoff - 1; nodelete: /* * If not done go on to the next (previous) record. */ - if (bno != (xfs_fileoff_t)-1 && bno >= start) { - if (lastx >= 0) { - xfs_iext_get_extent(ifp, lastx, &got); - if (got.br_startoff > bno && --lastx >= 0) - xfs_iext_get_extent(ifp, lastx, &got); + if (end != (xfs_fileoff_t)-1 && end >= start) { + if (!xfs_iext_get_extent(ifp, &icur, &got) || + (got.br_startoff > end && + !xfs_iext_prev_extent(ifp, &icur, &got))) { + done = true; + break; } extno++; } } - if (bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0) + if (done || end == (xfs_fileoff_t)-1 || end < start) *rlen = 0; else - *rlen = bno - start + 1; + *rlen = end - start + 1; /* * Convert to a btree if necessary. @@ -5867,14 +5500,13 @@ xfs_bmse_merge( struct xfs_inode *ip, int whichfork, xfs_fileoff_t shift, /* shift fsb */ - int current_ext, /* idx of gotp */ + struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *got, /* extent to shift */ struct xfs_bmbt_irec *left, /* preceding extent */ struct xfs_btree_cur *cur, int *logflags, /* output */ struct xfs_defer_ops *dfops) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_bmbt_irec new; xfs_filblks_t blockcount; int error, i; @@ -5902,8 +5534,7 @@ xfs_bmse_merge( } /* lookup and remove the extent to merge */ - error = xfs_bmbt_lookup_eq(cur, got->br_startoff, got->br_startblock, - got->br_blockcount, &i); + error = xfs_bmbt_lookup_eq(cur, got, &i); if (error) return error; XFS_WANT_CORRUPTED_RETURN(mp, i == 1); @@ -5914,20 +5545,20 @@ xfs_bmse_merge( XFS_WANT_CORRUPTED_RETURN(mp, i == 1); /* lookup and update size of the previous extent */ - error = xfs_bmbt_lookup_eq(cur, left->br_startoff, left->br_startblock, - left->br_blockcount, &i); + error = xfs_bmbt_lookup_eq(cur, left, &i); if (error) return error; XFS_WANT_CORRUPTED_RETURN(mp, i == 1); - error = xfs_bmbt_update(cur, new.br_startoff, new.br_startblock, - new.br_blockcount, new.br_state); + error = xfs_bmbt_update(cur, &new); if (error) return error; done: - xfs_iext_update_extent(ifp, current_ext - 1, &new); - xfs_iext_remove(ip, current_ext, 1, 0); + xfs_iext_remove(ip, icur, 0); + xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur); + xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur, + &new); /* update reverse mapping. rmap functions merge the rmaps for us */ error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, got); @@ -5938,183 +5569,83 @@ done: return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &new); } -/* - * Shift a single extent. - */ -STATIC int -xfs_bmse_shift_one( - struct xfs_inode *ip, - int whichfork, - xfs_fileoff_t offset_shift_fsb, - int *current_ext, - struct xfs_bmbt_irec *got, - struct xfs_btree_cur *cur, - int *logflags, - enum shift_direction direction, - struct xfs_defer_ops *dfops) +static int +xfs_bmap_shift_update_extent( + struct xfs_inode *ip, + int whichfork, + struct xfs_iext_cursor *icur, + struct xfs_bmbt_irec *got, + struct xfs_btree_cur *cur, + int *logflags, + struct xfs_defer_ops *dfops, + xfs_fileoff_t startoff) { - struct xfs_ifork *ifp; - struct xfs_mount *mp; - xfs_fileoff_t startoff; - struct xfs_bmbt_irec adj_irec, new; - int error; - int i; - int total_extents; - - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - total_extents = xfs_iext_count(ifp); - - /* delalloc extents should be prevented by caller */ - XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got->br_startblock)); - - if (direction == SHIFT_LEFT) { - startoff = got->br_startoff - offset_shift_fsb; - - /* - * Check for merge if we've got an extent to the left, - * otherwise make sure there's enough room at the start - * of the file for the shift. - */ - if (!*current_ext) { - if (got->br_startoff < offset_shift_fsb) - return -EINVAL; - goto update_current_ext; - } - - /* - * grab the left extent and check for a large enough hole. - */ - xfs_iext_get_extent(ifp, *current_ext - 1, &adj_irec); - if (startoff < adj_irec.br_startoff + adj_irec.br_blockcount) - return -EINVAL; - - /* check whether to merge the extent or shift it down */ - if (xfs_bmse_can_merge(&adj_irec, got, offset_shift_fsb)) { - return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, - *current_ext, got, &adj_irec, - cur, logflags, dfops); - } - } else { - startoff = got->br_startoff + offset_shift_fsb; - /* nothing to move if this is the last extent */ - if (*current_ext >= (total_extents - 1)) - goto update_current_ext; - - /* - * If this is not the last extent in the file, make sure there - * is enough room between current extent and next extent for - * accommodating the shift. - */ - xfs_iext_get_extent(ifp, *current_ext + 1, &adj_irec); - if (startoff + got->br_blockcount > adj_irec.br_startoff) - return -EINVAL; - - /* - * Unlike a left shift (which involves a hole punch), - * a right shift does not modify extent neighbors - * in any way. We should never find mergeable extents - * in this scenario. Check anyways and warn if we - * encounter two extents that could be one. - */ - if (xfs_bmse_can_merge(got, &adj_irec, offset_shift_fsb)) - WARN_ON_ONCE(1); - } + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec prev = *got; + int error, i; - /* - * Increment the extent index for the next iteration, update the start - * offset of the in-core extent and update the btree if applicable. - */ -update_current_ext: *logflags |= XFS_ILOG_CORE; - new = *got; - new.br_startoff = startoff; + got->br_startoff = startoff; if (cur) { - error = xfs_bmbt_lookup_eq(cur, got->br_startoff, - got->br_startblock, got->br_blockcount, &i); + error = xfs_bmbt_lookup_eq(cur, &prev, &i); if (error) return error; XFS_WANT_CORRUPTED_RETURN(mp, i == 1); - error = xfs_bmbt_update(cur, new.br_startoff, - new.br_startblock, new.br_blockcount, - new.br_state); + error = xfs_bmbt_update(cur, got); if (error) return error; } else { *logflags |= XFS_ILOG_DEXT; } - xfs_iext_update_extent(ifp, *current_ext, &new); - - if (direction == SHIFT_LEFT) - (*current_ext)++; - else - (*current_ext)--; + xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur, + got); /* update reverse mapping */ - error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, got); + error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, &prev); if (error) return error; - return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &new); + return xfs_rmap_map_extent(mp, dfops, ip, whichfork, got); } -/* - * Shift extent records to the left/right to cover/create a hole. - * - * The maximum number of extents to be shifted in a single operation is - * @num_exts. @stop_fsb specifies the file offset at which to stop shift and the - * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb - * is the length by which each extent is shifted. If there is no hole to shift - * the extents into, this will be considered invalid operation and we abort - * immediately. - */ int -xfs_bmap_shift_extents( +xfs_bmap_collapse_extents( struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, - int *done, + bool *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, - struct xfs_defer_ops *dfops, - enum shift_direction direction, - int num_exts) + struct xfs_defer_ops *dfops) { - struct xfs_btree_cur *cur = NULL; - struct xfs_bmbt_irec got; - struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp; - xfs_extnum_t nexts = 0; - xfs_extnum_t current_ext; - xfs_extnum_t total_extents; - xfs_extnum_t stop_extent; - int error = 0; - int whichfork = XFS_DATA_FORK; - int logflags = 0; + int whichfork = XFS_DATA_FORK; + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_btree_cur *cur = NULL; + struct xfs_bmbt_irec got, prev; + struct xfs_iext_cursor icur; + xfs_fileoff_t new_startoff; + int error = 0; + int logflags = 0; if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), mp, XFS_ERRTAG_BMAPIFORMAT))) { - XFS_ERROR_REPORT("xfs_bmap_shift_extents", - XFS_ERRLEVEL_LOW, mp); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL)); - ifp = XFS_IFORK_PTR(ip, whichfork); if (!(ifp->if_flags & XFS_IFEXTENTS)) { - /* Read in all the extents */ error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; @@ -6127,107 +5658,165 @@ xfs_bmap_shift_extents( cur->bc_private.b.flags = 0; } - /* - * There may be delalloc extents in the data fork before the range we - * are collapsing out, so we cannot use the count of real extents here. - * Instead we have to calculate it from the incore fork. - */ - total_extents = xfs_iext_count(ifp); - if (total_extents == 0) { - *done = 1; + if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) { + *done = true; goto del_cursor; } + XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock)); - /* - * In case of first right shift, we need to initialize next_fsb - */ - if (*next_fsb == NULLFSBLOCK) { - ASSERT(direction == SHIFT_RIGHT); - - current_ext = total_extents - 1; - xfs_iext_get_extent(ifp, current_ext, &got); - if (stop_fsb > got.br_startoff) { - *done = 1; + new_startoff = got.br_startoff - offset_shift_fsb; + if (xfs_iext_peek_prev_extent(ifp, &icur, &prev)) { + if (new_startoff < prev.br_startoff + prev.br_blockcount) { + error = -EINVAL; goto del_cursor; } - *next_fsb = got.br_startoff; + + if (xfs_bmse_can_merge(&prev, &got, offset_shift_fsb)) { + error = xfs_bmse_merge(ip, whichfork, offset_shift_fsb, + &icur, &got, &prev, cur, &logflags, + dfops); + if (error) + goto del_cursor; + goto done; + } } else { - /* - * Look up the extent index for the fsb where we start shifting. We can - * henceforth iterate with current_ext as extent list changes are locked - * out via ilock. - * - * If next_fsb lies in a hole beyond which there are no extents we are - * done. - */ - if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, ¤t_ext, - &got)) { - *done = 1; + if (got.br_startoff < offset_shift_fsb) { + error = -EINVAL; goto del_cursor; } } - /* Lookup the extent index at which we have to stop */ - if (direction == SHIFT_RIGHT) { - struct xfs_bmbt_irec s; + error = xfs_bmap_shift_update_extent(ip, whichfork, &icur, &got, cur, + &logflags, dfops, new_startoff); + if (error) + goto del_cursor; + +done: + if (!xfs_iext_next_extent(ifp, &icur, &got)) { + *done = true; + goto del_cursor; + } + + *next_fsb = got.br_startoff; +del_cursor: + if (cur) + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + return error; +} + +int +xfs_bmap_insert_extents( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t *next_fsb, + xfs_fileoff_t offset_shift_fsb, + bool *done, + xfs_fileoff_t stop_fsb, + xfs_fsblock_t *firstblock, + struct xfs_defer_ops *dfops) +{ + int whichfork = XFS_DATA_FORK; + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_btree_cur *cur = NULL; + struct xfs_bmbt_irec got, next; + struct xfs_iext_cursor icur; + xfs_fileoff_t new_startoff; + int error = 0; + int logflags = 0; + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT))) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL)); - xfs_iext_lookup_extent(ip, ifp, stop_fsb, &stop_extent, &s); - /* Make stop_extent exclusive of shift range */ - stop_extent--; - if (current_ext <= stop_extent) { - error = -EIO; + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + if (ifp->if_flags & XFS_IFBROOT) { + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.dfops = dfops; + cur->bc_private.b.flags = 0; + } + + if (*next_fsb == NULLFSBLOCK) { + xfs_iext_last(ifp, &icur); + if (!xfs_iext_get_extent(ifp, &icur, &got) || + stop_fsb > got.br_startoff) { + *done = true; goto del_cursor; } } else { - stop_extent = total_extents; - if (current_ext >= stop_extent) { - error = -EIO; + if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) { + *done = true; goto del_cursor; } } + XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock)); - while (nexts++ < num_exts) { - error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb, - ¤t_ext, &got, cur, &logflags, - direction, dfops); - if (error) + if (stop_fsb >= got.br_startoff + got.br_blockcount) { + error = -EIO; + goto del_cursor; + } + + new_startoff = got.br_startoff + offset_shift_fsb; + if (xfs_iext_peek_next_extent(ifp, &icur, &next)) { + if (new_startoff + got.br_blockcount > next.br_startoff) { + error = -EINVAL; goto del_cursor; - /* - * If there was an extent merge during the shift, the extent - * count can change. Update the total and grade the next record. - */ - if (direction == SHIFT_LEFT) { - total_extents = xfs_iext_count(ifp); - stop_extent = total_extents; } - if (current_ext == stop_extent) { - *done = 1; - *next_fsb = NULLFSBLOCK; - break; - } - xfs_iext_get_extent(ifp, current_ext, &got); + /* + * Unlike a left shift (which involves a hole punch), a right + * shift does not modify extent neighbors in any way. We should + * never find mergeable extents in this scenario. Check anyways + * and warn if we encounter two extents that could be one. + */ + if (xfs_bmse_can_merge(&got, &next, offset_shift_fsb)) + WARN_ON_ONCE(1); } - if (!*done) - *next_fsb = got.br_startoff; + error = xfs_bmap_shift_update_extent(ip, whichfork, &icur, &got, cur, + &logflags, dfops, new_startoff); + if (error) + goto del_cursor; + + if (!xfs_iext_prev_extent(ifp, &icur, &got) || + stop_fsb >= got.br_startoff + got.br_blockcount) { + *done = true; + goto del_cursor; + } + *next_fsb = got.br_startoff; del_cursor: if (cur) xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); - if (logflags) xfs_trans_log_inode(tp, ip, logflags); - return error; } /* - * Splits an extent into two extents at split_fsb block such that it is - * the first block of the current_ext. @current_ext is a target extent - * to be split. @split_fsb is a block where the extents is split. - * If split_fsb lies in a hole or the first block of extents, just return 0. + * Splits an extent into two extents at split_fsb block such that it is the + * first block of the current_ext. @ext is a target extent to be split. + * @split_fsb is a block where the extents is split. If split_fsb lies in a + * hole or the first block of extents, just return 0. */ STATIC int xfs_bmap_split_extent_at( @@ -6244,7 +5833,7 @@ xfs_bmap_split_extent_at( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; xfs_fsblock_t gotblkcnt; /* new block count for got */ - xfs_extnum_t current_ext; + struct xfs_iext_cursor icur; int error = 0; int logflags = 0; int i = 0; @@ -6272,7 +5861,7 @@ xfs_bmap_split_extent_at( /* * If there are not extents, or split_fsb lies in a hole we are done. */ - if (!xfs_iext_lookup_extent(ip, ifp, split_fsb, ¤t_ext, &got) || + if (!xfs_iext_lookup_extent(ip, ifp, split_fsb, &icur, &got) || got.br_startoff >= split_fsb) return 0; @@ -6287,44 +5876,35 @@ xfs_bmap_split_extent_at( cur->bc_private.b.firstblock = *firstfsb; cur->bc_private.b.dfops = dfops; cur->bc_private.b.flags = 0; - error = xfs_bmbt_lookup_eq(cur, got.br_startoff, - got.br_startblock, - got.br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) goto del_cursor; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); } got.br_blockcount = gotblkcnt; - xfs_iext_update_extent(ifp, current_ext, &got); + xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), &icur, + &got); logflags = XFS_ILOG_CORE; if (cur) { - error = xfs_bmbt_update(cur, got.br_startoff, - got.br_startblock, - got.br_blockcount, - got.br_state); + error = xfs_bmbt_update(cur, &got); if (error) goto del_cursor; } else logflags |= XFS_ILOG_DEXT; /* Add new extent */ - current_ext++; - xfs_iext_insert(ip, current_ext, 1, &new, 0); + xfs_iext_next(ifp, &icur); + xfs_iext_insert(ip, &icur, &new, 0); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur) { - error = xfs_bmbt_lookup_eq(cur, new.br_startoff, - new.br_startblock, new.br_blockcount, - &i); + error = xfs_bmbt_lookup_eq(cur, &new, &i); if (error) goto del_cursor; XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor); - cur->bc_rec.b.br_state = new.br_state; - error = xfs_btree_insert(cur, &i); if (error) goto del_cursor; diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 851982a5dfbc..e36d75799cd5 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -43,7 +43,7 @@ struct xfs_bmalloca { xfs_fsblock_t blkno; /* starting block of new extent */ struct xfs_btree_cur *cur; /* btree cursor */ - xfs_extnum_t idx; /* current extent index */ + struct xfs_iext_cursor icur; /* incore extent cursor */ int nallocs;/* number of extents alloc'd */ int logflags;/* flags for transaction logging */ @@ -113,6 +113,9 @@ struct xfs_extent_free_item /* Only convert delalloc space, don't allocate entirely new extents */ #define XFS_BMAPI_DELALLOC 0x400 +/* Only convert unwritten extents, don't allocate new blocks */ +#define XFS_BMAPI_CONVERT_ONLY 0x800 + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -124,7 +127,8 @@ struct xfs_extent_free_item { XFS_BMAPI_ZERO, "ZERO" }, \ { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ - { XFS_BMAPI_DELALLOC, "DELALLOC" } + { XFS_BMAPI_DELALLOC, "DELALLOC" }, \ + { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" } static inline int xfs_bmapi_aflag(int w) @@ -183,31 +187,9 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) !isnullstartblock(irec->br_startblock); } -/* - * This macro is used to determine how many extents will be shifted - * in one write transaction. We could require two splits, - * an extent move on the first and an extent merge on the second, - * So it is proper that one extent is shifted inside write transaction - * at a time. - */ -#define XFS_BMAP_MAX_SHIFT_EXTENTS 1 - -enum shift_direction { - SHIFT_LEFT = 0, - SHIFT_RIGHT, -}; - -#ifdef DEBUG -void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, - int whichfork, unsigned long caller_ip); -#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ - xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_) -#else -#define XFS_BMAP_TRACE_EXLIST(ip,c,w) -#endif - void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); +void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops, @@ -221,8 +203,6 @@ int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused, int whichfork); int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork); -int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip, - int whichfork); int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, int flags); @@ -240,20 +220,25 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extnum_t nexts, xfs_fsblock_t *firstblock, struct xfs_defer_ops *dfops, int *done); int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, - xfs_extnum_t *idx, struct xfs_bmbt_irec *got, + struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *del); +void xfs_bmap_del_extent_cow(struct xfs_inode *ip, + struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); -void xfs_bmap_del_extent_cow(struct xfs_inode *ip, xfs_extnum_t *idx, - struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); uint xfs_default_attroffset(struct xfs_inode *ip); -int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, +int xfs_bmap_collapse_extents(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, + bool *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, + struct xfs_defer_ops *dfops); +int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, - int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, - struct xfs_defer_ops *dfops, enum shift_direction direction, - int num_exts); + bool *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, + struct xfs_defer_ops *dfops); int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, - struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, int eof); + struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, + int eof); enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, @@ -277,4 +262,16 @@ int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); +static inline int xfs_bmap_fork_to_state(int whichfork) +{ + switch (whichfork) { + case XFS_ATTR_FORK: + return BMAP_ATTRFORK; + case XFS_COW_FORK: + return BMAP_COWFORK; + default: + return 0; + } +} + #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index a6331ffa51e3..c10aecaaae44 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -38,22 +38,6 @@ #include "xfs_rmap.h" /* - * Determine the extent state. - */ -/* ARGSUSED */ -STATIC xfs_exntst_t -xfs_extent_state( - xfs_filblks_t blks, - int extent_flag) -{ - if (extent_flag) { - ASSERT(blks != 0); /* saved for DMIG */ - return XFS_EXT_UNWRITTEN; - } - return XFS_EXT_NORM; -} - -/* * Convert on-disk form of btree root to in-memory form. */ void @@ -87,84 +71,21 @@ xfs_bmdr_to_bmbt( memcpy(tpp, fpp, sizeof(*fpp) * dmxr); } -/* - * Convert a compressed bmap extent record to an uncompressed form. - * This code must be in sync with the routines xfs_bmbt_get_startoff, - * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state. - */ -STATIC void -__xfs_bmbt_get_all( - uint64_t l0, - uint64_t l1, - xfs_bmbt_irec_t *s) -{ - int ext_flag; - xfs_exntst_t st; - - ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN)); - s->br_startoff = ((xfs_fileoff_t)l0 & - xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; - s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) | - (((xfs_fsblock_t)l1) >> 21); - s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21)); - /* This is xfs_extent_state() in-line */ - if (ext_flag) { - ASSERT(s->br_blockcount != 0); /* saved for DMIG */ - st = XFS_EXT_UNWRITTEN; - } else - st = XFS_EXT_NORM; - s->br_state = st; -} - void -xfs_bmbt_get_all( - xfs_bmbt_rec_host_t *r, - xfs_bmbt_irec_t *s) -{ - __xfs_bmbt_get_all(r->l0, r->l1, s); -} - -/* - * Extract the blockcount field from an in memory bmap extent record. - */ -xfs_filblks_t -xfs_bmbt_get_blockcount( - xfs_bmbt_rec_host_t *r) -{ - return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21)); -} - -/* - * Extract the startblock field from an in memory bmap extent record. - */ -xfs_fsblock_t -xfs_bmbt_get_startblock( - xfs_bmbt_rec_host_t *r) -{ - return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) | - (((xfs_fsblock_t)r->l1) >> 21); -} - -/* - * Extract the startoff field from an in memory bmap extent record. - */ -xfs_fileoff_t -xfs_bmbt_get_startoff( - xfs_bmbt_rec_host_t *r) -{ - return ((xfs_fileoff_t)r->l0 & - xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; -} - -xfs_exntst_t -xfs_bmbt_get_state( - xfs_bmbt_rec_host_t *r) -{ - int ext_flag; - - ext_flag = (int)((r->l0) >> (64 - BMBT_EXNTFLAG_BITLEN)); - return xfs_extent_state(xfs_bmbt_get_blockcount(r), - ext_flag); +xfs_bmbt_disk_get_all( + struct xfs_bmbt_rec *rec, + struct xfs_bmbt_irec *irec) +{ + uint64_t l0 = get_unaligned_be64(&rec->l0); + uint64_t l1 = get_unaligned_be64(&rec->l1); + + irec->br_startoff = (l0 & xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; + irec->br_startblock = ((l0 & xfs_mask64lo(9)) << 43) | (l1 >> 21); + irec->br_blockcount = l1 & xfs_mask64lo(21); + if (l0 >> (64 - BMBT_EXNTFLAG_BITLEN)) + irec->br_state = XFS_EXT_UNWRITTEN; + else + irec->br_state = XFS_EXT_NORM; } /* @@ -188,142 +109,29 @@ xfs_bmbt_disk_get_startoff( xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; } - -/* - * Set all the fields in a bmap extent record from the arguments. - */ -void -xfs_bmbt_set_allf( - xfs_bmbt_rec_host_t *r, - xfs_fileoff_t startoff, - xfs_fsblock_t startblock, - xfs_filblks_t blockcount, - xfs_exntst_t state) -{ - int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1; - - ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); - ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); - ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); - - ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); - - r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)startoff << 9) | - ((xfs_bmbt_rec_base_t)startblock >> 43); - r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) | - ((xfs_bmbt_rec_base_t)blockcount & - (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); -} - /* * Set all the fields in a bmap extent record from the uncompressed form. */ void -xfs_bmbt_set_all( - xfs_bmbt_rec_host_t *r, - xfs_bmbt_irec_t *s) -{ - xfs_bmbt_set_allf(r, s->br_startoff, s->br_startblock, - s->br_blockcount, s->br_state); -} - - -/* - * Set all the fields in a disk format bmap extent record from the arguments. - */ -void -xfs_bmbt_disk_set_allf( - xfs_bmbt_rec_t *r, - xfs_fileoff_t startoff, - xfs_fsblock_t startblock, - xfs_filblks_t blockcount, - xfs_exntst_t state) -{ - int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1; - - ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); - ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); - ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); - ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); - - r->l0 = cpu_to_be64( - ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)startoff << 9) | - ((xfs_bmbt_rec_base_t)startblock >> 43)); - r->l1 = cpu_to_be64( - ((xfs_bmbt_rec_base_t)startblock << 21) | - ((xfs_bmbt_rec_base_t)blockcount & - (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); -} - -/* - * Set all the fields in a bmap extent record from the uncompressed form. - */ -STATIC void xfs_bmbt_disk_set_all( - xfs_bmbt_rec_t *r, - xfs_bmbt_irec_t *s) -{ - xfs_bmbt_disk_set_allf(r, s->br_startoff, s->br_startblock, - s->br_blockcount, s->br_state); -} - -/* - * Set the blockcount field in a bmap extent record. - */ -void -xfs_bmbt_set_blockcount( - xfs_bmbt_rec_host_t *r, - xfs_filblks_t v) + struct xfs_bmbt_rec *r, + struct xfs_bmbt_irec *s) { - ASSERT((v & xfs_mask64hi(43)) == 0); - r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) | - (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21)); -} - -/* - * Set the startblock field in a bmap extent record. - */ -void -xfs_bmbt_set_startblock( - xfs_bmbt_rec_host_t *r, - xfs_fsblock_t v) -{ - ASSERT((v & xfs_mask64hi(12)) == 0); - r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) | - (xfs_bmbt_rec_base_t)(v >> 43); - r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) | - (xfs_bmbt_rec_base_t)(v << 21); -} + int extent_flag = (s->br_state != XFS_EXT_NORM); -/* - * Set the startoff field in a bmap extent record. - */ -void -xfs_bmbt_set_startoff( - xfs_bmbt_rec_host_t *r, - xfs_fileoff_t v) -{ - ASSERT((v & xfs_mask64hi(9)) == 0); - r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) | - ((xfs_bmbt_rec_base_t)v << 9) | - (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9)); -} + ASSERT(s->br_state == XFS_EXT_NORM || s->br_state == XFS_EXT_UNWRITTEN); + ASSERT(!(s->br_startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN))); + ASSERT(!(s->br_blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN))); + ASSERT(!(s->br_startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN))); -/* - * Set the extent state field in a bmap extent record. - */ -void -xfs_bmbt_set_state( - xfs_bmbt_rec_host_t *r, - xfs_exntst_t v) -{ - ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN); - if (v == XFS_EXT_NORM) - r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN); - else - r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN); + put_unaligned_be64( + ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)s->br_startoff << 9) | + ((xfs_bmbt_rec_base_t)s->br_startblock >> 43), &r->l0); + put_unaligned_be64( + ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | + ((xfs_bmbt_rec_base_t)s->br_blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21)), &r->l1); } /* diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 9da5a8d4f184..135b8c56d23e 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -98,25 +98,11 @@ struct xfs_trans; */ extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int, struct xfs_btree_block *, int); -extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); -extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r); -extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r); -extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r); -extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r); +void xfs_bmbt_disk_set_all(struct xfs_bmbt_rec *r, struct xfs_bmbt_irec *s); extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); - -extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); -extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o, - xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); -extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_host_t *r, xfs_filblks_t v); -extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v); -extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v); -extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v); - -extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o, - xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); +extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, xfs_bmdr_block_t *, int); @@ -136,9 +122,9 @@ extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, * Check that the extent does not contain an invalid unwritten extent flag. */ static inline bool xfs_bmbt_validate_extent(struct xfs_mount *mp, int whichfork, - struct xfs_bmbt_rec_host *ep) + struct xfs_bmbt_irec *irec) { - if (ep->l0 >> (64 - BMBT_EXNTFLAG_BITLEN) == 0) + if (irec->br_state == XFS_EXT_NORM) return true; if (whichfork == XFS_DATA_FORK && xfs_sb_version_hasextflgbit(&mp->m_sb)) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 5bfb88261c7e..5f33adf8eecb 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -29,6 +29,7 @@ #include "xfs_inode_item.h" #include "xfs_buf_item.h" #include "xfs_btree.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_cksum.h" @@ -63,44 +64,63 @@ xfs_btree_magic( return magic; } -STATIC int /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_lblock( - struct xfs_btree_cur *cur, /* btree cursor */ - struct xfs_btree_block *block, /* btree long form block pointer */ - int level, /* level of the btree block */ - struct xfs_buf *bp) /* buffer for block, if any */ +/* + * Check a long btree block header. Return the address of the failing check, + * or NULL if everything is ok. + */ +xfs_failaddr_t +__xfs_btree_check_lblock( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level, + struct xfs_buf *bp) { - int lblock_ok = 1; /* block passes checks */ - struct xfs_mount *mp; /* file system mount point */ + struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; - int crc; - - mp = cur->bc_mp; - crc = xfs_sb_version_hascrc(&mp->m_sb); + int crc = xfs_sb_version_hascrc(&mp->m_sb); if (crc) { - lblock_ok = lblock_ok && - uuid_equal(&block->bb_u.l.bb_uuid, - &mp->m_sb.sb_meta_uuid) && - block->bb_u.l.bb_blkno == cpu_to_be64( - bp ? bp->b_bn : XFS_BUF_DADDR_NULL); + if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + if (block->bb_u.l.bb_blkno != + cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL)) + return __this_address; + if (block->bb_u.l.bb_pad != cpu_to_be32(0)) + return __this_address; } - lblock_ok = lblock_ok && - be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) && - be16_to_cpu(block->bb_level) == level && - be16_to_cpu(block->bb_numrecs) <= - cur->bc_ops->get_maxrecs(cur, level) && - block->bb_u.l.bb_leftsib && - (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) || - XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_leftsib))) && - block->bb_u.l.bb_rightsib && - (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) || - XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_rightsib))); - - if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, + if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum)) + return __this_address; + if (be16_to_cpu(block->bb_level) != level) + return __this_address; + if (be16_to_cpu(block->bb_numrecs) > + cur->bc_ops->get_maxrecs(cur, level)) + return __this_address; + if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && + !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib), + level + 1)) + return __this_address; + if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && + !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib), + level + 1)) + return __this_address; + + return NULL; +} + +/* Check a long btree block header. */ +static int +xfs_btree_check_lblock( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = cur->bc_mp; + xfs_failaddr_t fa; + + fa = __xfs_btree_check_lblock(cur, block, level, bp); + if (unlikely(XFS_TEST_ERROR(fa != NULL, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); @@ -110,48 +130,61 @@ xfs_btree_check_lblock( return 0; } -STATIC int /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_sblock( - struct xfs_btree_cur *cur, /* btree cursor */ - struct xfs_btree_block *block, /* btree short form block pointer */ - int level, /* level of the btree block */ - struct xfs_buf *bp) /* buffer containing block */ +/* + * Check a short btree block header. Return the address of the failing check, + * or NULL if everything is ok. + */ +xfs_failaddr_t +__xfs_btree_check_sblock( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level, + struct xfs_buf *bp) { - struct xfs_mount *mp; /* file system mount point */ - struct xfs_buf *agbp; /* buffer for ag. freespace struct */ - struct xfs_agf *agf; /* ag. freespace structure */ - xfs_agblock_t agflen; /* native ag. freespace length */ - int sblock_ok = 1; /* block passes checks */ + struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; - int crc; - - mp = cur->bc_mp; - crc = xfs_sb_version_hascrc(&mp->m_sb); - agbp = cur->bc_private.a.agbp; - agf = XFS_BUF_TO_AGF(agbp); - agflen = be32_to_cpu(agf->agf_length); + int crc = xfs_sb_version_hascrc(&mp->m_sb); if (crc) { - sblock_ok = sblock_ok && - uuid_equal(&block->bb_u.s.bb_uuid, - &mp->m_sb.sb_meta_uuid) && - block->bb_u.s.bb_blkno == cpu_to_be64( - bp ? bp->b_bn : XFS_BUF_DADDR_NULL); + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + if (block->bb_u.s.bb_blkno != + cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL)) + return __this_address; } - sblock_ok = sblock_ok && - be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) && - be16_to_cpu(block->bb_level) == level && - be16_to_cpu(block->bb_numrecs) <= - cur->bc_ops->get_maxrecs(cur, level) && - (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) && - block->bb_u.s.bb_leftsib && - (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) && - block->bb_u.s.bb_rightsib; - - if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp, + if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum)) + return __this_address; + if (be16_to_cpu(block->bb_level) != level) + return __this_address; + if (be16_to_cpu(block->bb_numrecs) > + cur->bc_ops->get_maxrecs(cur, level)) + return __this_address; + if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && + !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib), + level + 1)) + return __this_address; + if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && + !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib), + level + 1)) + return __this_address; + + return NULL; +} + +/* Check a short btree block header. */ +STATIC int +xfs_btree_check_sblock( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = cur->bc_mp; + xfs_failaddr_t fa; + + fa = __xfs_btree_check_sblock(cur, block, level, bp); + if (unlikely(XFS_TEST_ERROR(fa != NULL, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); @@ -177,59 +210,53 @@ xfs_btree_check_block( return xfs_btree_check_sblock(cur, block, level, bp); } -/* - * Check that (long) pointer is ok. - */ -int /* error (0 or EFSCORRUPTED) */ +/* Check that this long pointer is valid and points within the fs. */ +bool xfs_btree_check_lptr( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_fsblock_t bno, /* btree block disk address */ - int level) /* btree block level */ + struct xfs_btree_cur *cur, + xfs_fsblock_t fsbno, + int level) { - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, - level > 0 && - bno != NULLFSBLOCK && - XFS_FSB_SANITY_CHECK(cur->bc_mp, bno)); - return 0; + if (level <= 0) + return false; + return xfs_verify_fsbno(cur->bc_mp, fsbno); } -#ifdef DEBUG -/* - * Check that (short) pointer is ok. - */ -STATIC int /* error (0 or EFSCORRUPTED) */ +/* Check that this short pointer is valid and points within the AG. */ +bool xfs_btree_check_sptr( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agblock_t bno, /* btree block disk address */ - int level) /* btree block level */ + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + int level) { - xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks; - - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, - level > 0 && - bno != NULLAGBLOCK && - bno != 0 && - bno < agblocks); - return 0; + if (level <= 0) + return false; + return xfs_verify_agbno(cur->bc_mp, cur->bc_private.a.agno, agbno); } +#ifdef DEBUG /* - * Check that block ptr is ok. + * Check that a given (indexed) btree pointer at a certain level of a + * btree is valid and doesn't point past where it should. */ -STATIC int /* error (0 or EFSCORRUPTED) */ +static int xfs_btree_check_ptr( - struct xfs_btree_cur *cur, /* btree cursor */ - union xfs_btree_ptr *ptr, /* btree block disk address */ - int index, /* offset from ptr to check */ - int level) /* btree block level */ + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int index, + int level) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - return xfs_btree_check_lptr(cur, - be64_to_cpu((&ptr->l)[index]), level); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, + xfs_btree_check_lptr(cur, + be64_to_cpu((&ptr->l)[index]), level)); } else { - return xfs_btree_check_sptr(cur, - be32_to_cpu((&ptr->s)[index]), level); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, + xfs_btree_check_sptr(cur, + be32_to_cpu((&ptr->s)[index]), level)); } + + return 0; } #endif @@ -1027,7 +1054,7 @@ xfs_btree_setbuf( } } -STATIC int +bool xfs_btree_ptr_is_null( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) @@ -1052,7 +1079,7 @@ xfs_btree_set_ptr_null( /* * Get/set/init sibling pointers */ -STATIC void +void xfs_btree_get_sibling( struct xfs_btree_cur *cur, struct xfs_btree_block *block, @@ -2001,7 +2028,7 @@ error0: } /* Find the high key storage area from a regular key. */ -STATIC union xfs_btree_key * +union xfs_btree_key * xfs_btree_high_key_from_key( struct xfs_btree_cur *cur, union xfs_btree_key *key) @@ -2075,7 +2102,7 @@ xfs_btree_get_node_keys( } /* Derive the keys for any btree block. */ -STATIC void +void xfs_btree_get_keys( struct xfs_btree_cur *cur, struct xfs_btree_block *block, @@ -4914,3 +4941,15 @@ xfs_btree_count_blocks( return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, blocks); } + +/* Compare two btree pointers. */ +int64_t +xfs_btree_diff_two_ptrs( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *a, + const union xfs_btree_ptr *b) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l); + return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s); +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index f2a88c3b1159..b57501c6f71d 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -255,6 +255,14 @@ typedef struct xfs_btree_cur */ #define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr)) +/* + * Internal long and short btree block checks. They return NULL if the + * block is ok or the address of the failed check otherwise. + */ +xfs_failaddr_t __xfs_btree_check_lblock(struct xfs_btree_cur *cur, + struct xfs_btree_block *block, int level, struct xfs_buf *bp); +xfs_failaddr_t __xfs_btree_check_sblock(struct xfs_btree_cur *cur, + struct xfs_btree_block *block, int level, struct xfs_buf *bp); /* * Check that block header is ok. @@ -269,10 +277,19 @@ xfs_btree_check_block( /* * Check that (long) pointer is ok. */ -int /* error (0 or EFSCORRUPTED) */ +bool /* error (0 or EFSCORRUPTED) */ xfs_btree_check_lptr( struct xfs_btree_cur *cur, /* btree cursor */ - xfs_fsblock_t ptr, /* btree block disk address */ + xfs_fsblock_t fsbno, /* btree block disk address */ + int level); /* btree block level */ + +/* + * Check that (short) pointer is ok. + */ +bool /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_sptr( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t agbno, /* btree block disk address */ int level); /* btree block level */ /* @@ -517,5 +534,16 @@ int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level, union xfs_btree_ptr *pp, struct xfs_btree_block **blkp); struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur, int level, struct xfs_buf **bpp); +bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); +int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *a, + const union xfs_btree_ptr *b); +void xfs_btree_get_sibling(struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_ptr *ptr, int lr); +void xfs_btree_get_keys(struct xfs_btree_cur *cur, + struct xfs_btree_block *block, union xfs_btree_key *key); +union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, + union xfs_btree_key *key); #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h index 8211f48b98e6..999a290cfd72 100644 --- a/fs/xfs/libxfs/xfs_cksum.h +++ b/fs/xfs/libxfs/xfs_cksum.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _XFS_CKSUM_H #define _XFS_CKSUM_H 1 diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 6d4335815c3f..651611530d2f 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -1466,6 +1466,7 @@ xfs_da3_node_lookup_int( int max; int error; int retval; + unsigned int expected_level = 0; struct xfs_inode *dp = state->args->dp; args = state->args; @@ -1474,7 +1475,7 @@ xfs_da3_node_lookup_int( * Descend thru the B-tree searching each level for the right * node to use, until the right hashval is found. */ - blkno = (args->whichfork == XFS_DATA_FORK)? args->geo->leafblk : 0; + blkno = args->geo->leafblk; for (blk = &state->path.blk[0], state->path.active = 1; state->path.active <= XFS_DA_NODE_MAXDEPTH; blk++, state->path.active++) { @@ -1517,6 +1518,18 @@ xfs_da3_node_lookup_int( dp->d_ops->node_hdr_from_disk(&nodehdr, node); btree = dp->d_ops->node_tree_p(node); + /* Tree taller than we can handle; bail out! */ + if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) + return -EFSCORRUPTED; + + /* Check the level from the root. */ + if (blkno == args->geo->leafblk) + expected_level = nodehdr.level - 1; + else if (expected_level != nodehdr.level) + return -EFSCORRUPTED; + else + expected_level--; + max = nodehdr.count; blk->hashval = be32_to_cpu(btree[max - 1].hashval); @@ -1562,8 +1575,15 @@ xfs_da3_node_lookup_int( blk->index = probe; blkno = be32_to_cpu(btree[probe].before); } + + /* We can't point back to the root. */ + if (blkno == args->geo->leafblk) + return -EFSCORRUPTED; } + if (expected_level != 0) + return -EFSCORRUPTED; + /* * A leaf block that ends in the hashval that we are interested in * (final hashval == search hashval) means that the next block may diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index ccf9783fd3f0..e10778c102ea 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -30,6 +30,8 @@ #include "xfs_bmap.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" +#include "xfs_ialloc.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -38,7 +40,9 @@ struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR }; /* * Convert inode mode to directory entry filetype */ -unsigned char xfs_mode_to_ftype(int mode) +unsigned char +xfs_mode_to_ftype( + int mode) { switch (mode & S_IFMT) { case S_IFREG: @@ -202,22 +206,8 @@ xfs_dir_ino_validate( xfs_mount_t *mp, xfs_ino_t ino) { - xfs_agblock_t agblkno; - xfs_agino_t agino; - xfs_agnumber_t agno; - int ino_ok; - int ioff; - - agno = XFS_INO_TO_AGNO(mp, ino); - agblkno = XFS_INO_TO_AGBNO(mp, ino); - ioff = XFS_INO_TO_OFFSET(mp, ino); - agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff); - ino_ok = - agno < mp->m_sb.sb_agcount && - agblkno < mp->m_sb.sb_agblocks && - agblkno != 0 && - ioff < (1 << mp->m_sb.sb_inopblog) && - XFS_AGINO_TO_INO(mp, agno, agino) == ino; + bool ino_ok = xfs_verify_dir_ino(mp, ino); + if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE))) { xfs_warn(mp, "Invalid inode number 0x%Lx", (unsigned long long) ino); diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 21c8f8bf94d5..1a8f2cf977ca 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -324,4 +324,21 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) sizeof(struct xfs_dir2_leaf_tail)); } +/* + * The Linux API doesn't pass down the total size of the buffer + * we read into down to the filesystem. With the filldir concept + * it's not needed for correct information, but the XFS dir2 leaf + * code wants an estimate of the buffer size to calculate it's + * readahead window and size the buffers used for mapping to + * physical blocks. + * + * Try to give it an estimate that's good enough, maybe at some + * point we can change the ->readdir prototype to include the + * buffer size. For now we use the current glibc buffer size. + * musl libc hardcodes 2k and dietlibc uses PAGE_SIZE. + */ +#define XFS_READDIR_BUFSIZE (32768) + +unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype); + #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h new file mode 100644 index 000000000000..bc1789d95152 --- /dev/null +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (C) 2017 Oracle. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_ERRORTAG_H_ +#define __XFS_ERRORTAG_H_ + +/* + * error injection tags - the labels can be anything you want + * but each tag should have its own unique number + */ + +#define XFS_ERRTAG_NOERROR 0 +#define XFS_ERRTAG_IFLUSH_1 1 +#define XFS_ERRTAG_IFLUSH_2 2 +#define XFS_ERRTAG_IFLUSH_3 3 +#define XFS_ERRTAG_IFLUSH_4 4 +#define XFS_ERRTAG_IFLUSH_5 5 +#define XFS_ERRTAG_IFLUSH_6 6 +#define XFS_ERRTAG_DA_READ_BUF 7 +#define XFS_ERRTAG_BTREE_CHECK_LBLOCK 8 +#define XFS_ERRTAG_BTREE_CHECK_SBLOCK 9 +#define XFS_ERRTAG_ALLOC_READ_AGF 10 +#define XFS_ERRTAG_IALLOC_READ_AGI 11 +#define XFS_ERRTAG_ITOBP_INOTOBP 12 +#define XFS_ERRTAG_IUNLINK 13 +#define XFS_ERRTAG_IUNLINK_REMOVE 14 +#define XFS_ERRTAG_DIR_INO_VALIDATE 15 +#define XFS_ERRTAG_BULKSTAT_READ_CHUNK 16 +#define XFS_ERRTAG_IODONE_IOERR 17 +#define XFS_ERRTAG_STRATREAD_IOERR 18 +#define XFS_ERRTAG_STRATCMPL_IOERR 19 +#define XFS_ERRTAG_DIOWRITE_IOERR 20 +#define XFS_ERRTAG_BMAPIFORMAT 21 +#define XFS_ERRTAG_FREE_EXTENT 22 +#define XFS_ERRTAG_RMAP_FINISH_ONE 23 +#define XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE 24 +#define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25 +#define XFS_ERRTAG_BMAP_FINISH_ONE 26 +#define XFS_ERRTAG_AG_RESV_CRITICAL 27 +/* + * DEBUG mode instrumentation to test and/or trigger delayed allocation + * block killing in the event of failed writes. When enabled, all + * buffered writes are silenty dropped and handled as if they failed. + * All delalloc blocks in the range of the write (including pre-existing + * delalloc blocks!) are tossed as part of the write failure error + * handling sequence. + */ +#define XFS_ERRTAG_DROP_WRITES 28 +#define XFS_ERRTAG_LOG_BAD_CRC 29 +#define XFS_ERRTAG_LOG_ITEM_PIN 30 +#define XFS_ERRTAG_BUF_LRU_REF 31 +#define XFS_ERRTAG_MAX 32 + +/* + * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. + */ +#define XFS_RANDOM_DEFAULT 100 +#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT +#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4) +#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT +#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT +#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT +#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT +#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT +#define XFS_RANDOM_FREE_EXTENT 1 +#define XFS_RANDOM_RMAP_FINISH_ONE 1 +#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1 +#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 +#define XFS_RANDOM_BMAP_FINISH_ONE 1 +#define XFS_RANDOM_AG_RESV_CRITICAL 4 +#define XFS_RANDOM_DROP_WRITES 1 +#define XFS_RANDOM_LOG_BAD_CRC 1 +#define XFS_RANDOM_LOG_ITEM_PIN 1 +#define XFS_RANDOM_BUF_LRU_REF 2 + +#endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 23229f0c5b15..1acb584fc5f7 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -315,6 +315,11 @@ static inline bool xfs_sb_good_version(struct xfs_sb *sbp) return false; } +static inline bool xfs_sb_version_hasrealtime(struct xfs_sb *sbp) +{ + return sbp->sb_rblocks > 0; +} + /* * Detect a mismatched features2 field. Older kernels read/wrote * this into the wrong slot, so to be safe we keep them in sync. @@ -500,12 +505,12 @@ xfs_sb_has_incompat_log_feature( /* * V5 superblock specific feature checks */ -static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp) +static inline bool xfs_sb_version_hascrc(struct xfs_sb *sbp) { return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } -static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp) +static inline bool xfs_sb_version_has_pquotino(struct xfs_sb *sbp) { return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } @@ -518,7 +523,7 @@ static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp) (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)); } -static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasfinobt(xfs_sb_t *sbp) { return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); @@ -941,7 +946,7 @@ typedef enum xfs_dinode_fmt { XFS_DINODE_FMT_LOCAL, /* bulk data */ XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ - XFS_DINODE_FMT_UUID /* uuid_t */ + XFS_DINODE_FMT_UUID /* added long ago, but never used */ } xfs_dinode_fmt_t; /* @@ -1142,7 +1147,7 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) * Dquot and dquot block format definitions */ #define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ -#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */ +#define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */ /* * This is the main portion of the on-disk representation of quota @@ -1548,10 +1553,6 @@ typedef struct xfs_bmbt_rec { typedef uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ typedef xfs_bmbt_rec_t xfs_bmdr_rec_t; -typedef struct xfs_bmbt_rec_host { - uint64_t l0, l1; -} xfs_bmbt_rec_host_t; - /* * Values and macros for delayed-allocation startblock fields. */ @@ -1577,24 +1578,6 @@ static inline xfs_filblks_t startblockval(xfs_fsblock_t x) } /* - * Possible extent states. - */ -typedef enum { - XFS_EXT_NORM, XFS_EXT_UNWRITTEN, -} xfs_exntst_t; - -/* - * Incore version of above. - */ -typedef struct xfs_bmbt_irec -{ - xfs_fileoff_t br_startoff; /* starting file offset */ - xfs_fsblock_t br_startblock; /* starting block number */ - xfs_filblks_t br_blockcount; /* number of blocks */ - xfs_exntst_t br_state; /* extent state */ -} xfs_bmbt_irec_t; - -/* * Key structure for non-leaf levels of the tree. */ typedef struct xfs_bmbt_key { diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 8c61f21535d4..b90924104596 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -468,6 +468,82 @@ typedef struct xfs_swapext #define XFS_FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ #define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ +/* metadata scrubbing */ +struct xfs_scrub_metadata { + __u32 sm_type; /* What to check? */ + __u32 sm_flags; /* flags; see below. */ + __u64 sm_ino; /* inode number. */ + __u32 sm_gen; /* inode generation. */ + __u32 sm_agno; /* ag number. */ + __u64 sm_reserved[5]; /* pad to 64 bytes */ +}; + +/* + * Metadata types and flags for scrub operation. + */ + +/* Scrub subcommands. */ +#define XFS_SCRUB_TYPE_PROBE 0 /* presence test ioctl */ +#define XFS_SCRUB_TYPE_SB 1 /* superblock */ +#define XFS_SCRUB_TYPE_AGF 2 /* AG free header */ +#define XFS_SCRUB_TYPE_AGFL 3 /* AG free list */ +#define XFS_SCRUB_TYPE_AGI 4 /* AG inode header */ +#define XFS_SCRUB_TYPE_BNOBT 5 /* freesp by block btree */ +#define XFS_SCRUB_TYPE_CNTBT 6 /* freesp by length btree */ +#define XFS_SCRUB_TYPE_INOBT 7 /* inode btree */ +#define XFS_SCRUB_TYPE_FINOBT 8 /* free inode btree */ +#define XFS_SCRUB_TYPE_RMAPBT 9 /* reverse mapping btree */ +#define XFS_SCRUB_TYPE_REFCNTBT 10 /* reference count btree */ +#define XFS_SCRUB_TYPE_INODE 11 /* inode record */ +#define XFS_SCRUB_TYPE_BMBTD 12 /* data fork block mapping */ +#define XFS_SCRUB_TYPE_BMBTA 13 /* attr fork block mapping */ +#define XFS_SCRUB_TYPE_BMBTC 14 /* CoW fork block mapping */ +#define XFS_SCRUB_TYPE_DIR 15 /* directory */ +#define XFS_SCRUB_TYPE_XATTR 16 /* extended attribute */ +#define XFS_SCRUB_TYPE_SYMLINK 17 /* symbolic link */ +#define XFS_SCRUB_TYPE_PARENT 18 /* parent pointers */ +#define XFS_SCRUB_TYPE_RTBITMAP 19 /* realtime bitmap */ +#define XFS_SCRUB_TYPE_RTSUM 20 /* realtime summary */ +#define XFS_SCRUB_TYPE_UQUOTA 21 /* user quotas */ +#define XFS_SCRUB_TYPE_GQUOTA 22 /* group quotas */ +#define XFS_SCRUB_TYPE_PQUOTA 23 /* project quotas */ + +/* Number of scrub subcommands. */ +#define XFS_SCRUB_TYPE_NR 24 + +/* i: Repair this metadata. */ +#define XFS_SCRUB_IFLAG_REPAIR (1 << 0) + +/* o: Metadata object needs repair. */ +#define XFS_SCRUB_OFLAG_CORRUPT (1 << 1) + +/* + * o: Metadata object could be optimized. It's not corrupt, but + * we could improve on it somehow. + */ +#define XFS_SCRUB_OFLAG_PREEN (1 << 2) + +/* o: Cross-referencing failed. */ +#define XFS_SCRUB_OFLAG_XFAIL (1 << 3) + +/* o: Metadata object disagrees with cross-referenced metadata. */ +#define XFS_SCRUB_OFLAG_XCORRUPT (1 << 4) + +/* o: Scan was not complete. */ +#define XFS_SCRUB_OFLAG_INCOMPLETE (1 << 5) + +/* o: Metadata object looked funny but isn't corrupt. */ +#define XFS_SCRUB_OFLAG_WARNING (1 << 6) + +#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR) +#define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \ + XFS_SCRUB_OFLAG_PREEN | \ + XFS_SCRUB_OFLAG_XFAIL | \ + XFS_SCRUB_OFLAG_XCORRUPT | \ + XFS_SCRUB_OFLAG_INCOMPLETE | \ + XFS_SCRUB_OFLAG_WARNING) +#define XFS_SCRUB_FLAGS_ALL (XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT) + /* * ioctl limits */ @@ -511,6 +587,7 @@ typedef struct xfs_swapext #define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) #define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks) /* XFS_IOC_GETFSMAP ------ hoisted 59 */ +#define XFS_IOC_SCRUB_METADATA _IOWR('X', 60, struct xfs_scrub_metadata) /* * ioctl commands that replace IRIX syssgi()'s diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 988bb3f31446..de3f04a98656 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -31,6 +31,7 @@ #include "xfs_ialloc_btree.h" #include "xfs_alloc.h" #include "xfs_rtalloc.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_bmap.h" #include "xfs_cksum.h" @@ -1962,7 +1963,7 @@ xfs_difree_inobt( if (!(mp->m_flags & XFS_MOUNT_IKEEP) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { - xic->deleted = 1; + xic->deleted = true; xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); xic->alloc = xfs_inobt_irec_to_allocmask(&rec); @@ -1989,7 +1990,7 @@ xfs_difree_inobt( xfs_difree_inode_chunk(mp, agno, &rec, dfops); } else { - xic->deleted = 0; + xic->deleted = false; error = xfs_inobt_update(cur, &rec); if (error) { @@ -2664,3 +2665,93 @@ xfs_ialloc_pagi_init( xfs_trans_brelse(tp, bp); return 0; } + +/* Calculate the first and last possible inode number in an AG. */ +void +xfs_ialloc_agino_range( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t *first, + xfs_agino_t *last) +{ + xfs_agblock_t bno; + xfs_agblock_t eoag; + + eoag = xfs_ag_block_count(mp, agno); + + /* + * Calculate the first inode, which will be in the first + * cluster-aligned block after the AGFL. + */ + bno = round_up(XFS_AGFL_BLOCK(mp) + 1, + xfs_ialloc_cluster_alignment(mp)); + *first = XFS_OFFBNO_TO_AGINO(mp, bno, 0); + + /* + * Calculate the last inode, which will be at the end of the + * last (aligned) cluster that can be allocated in the AG. + */ + bno = round_down(eoag, xfs_ialloc_cluster_alignment(mp)); + *last = XFS_OFFBNO_TO_AGINO(mp, bno, 0) - 1; +} + +/* + * Verify that an AG inode number pointer neither points outside the AG + * nor points at static metadata. + */ +bool +xfs_verify_agino( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t agino) +{ + xfs_agino_t first; + xfs_agino_t last; + + xfs_ialloc_agino_range(mp, agno, &first, &last); + return agino >= first && agino <= last; +} + +/* + * Verify that an FS inode number pointer neither points outside the + * filesystem nor points at static AG metadata. + */ +bool +xfs_verify_ino( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); + + if (agno >= mp->m_sb.sb_agcount) + return false; + if (XFS_AGINO_TO_INO(mp, agno, agino) != ino) + return false; + return xfs_verify_agino(mp, agno, agino); +} + +/* Is this an internal inode number? */ +bool +xfs_internal_inum( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + return ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || + (xfs_sb_version_hasquota(&mp->m_sb) && + xfs_is_quota_inode(&mp->m_sb, ino)); +} + +/* + * Verify that a directory entry's inode number doesn't point at an internal + * inode, empty space, or static AG metadata. + */ +bool +xfs_verify_dir_ino( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + if (xfs_internal_inum(mp, ino)) + return false; + return xfs_verify_ino(mp, ino); +} diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index b32cfb5aeb5b..d2bdcd5e7312 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -173,5 +173,12 @@ void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec); int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); +void xfs_ialloc_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t *first, xfs_agino_t *last); +bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t agino); +bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); +bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); +bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c new file mode 100644 index 000000000000..343a94246f5b --- /dev/null +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -0,0 +1,1043 @@ +/* + * Copyright (c) 2017 Christoph Hellwig. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/cache.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_trace.h" + +/* + * In-core extent record layout: + * + * +-------+----------------------------+ + * | 00:53 | all 54 bits of startoff | + * | 54:63 | low 10 bits of startblock | + * +-------+----------------------------+ + * | 00:20 | all 21 bits of length | + * | 21 | unwritten extent bit | + * | 22:63 | high 42 bits of startblock | + * +-------+----------------------------+ + */ +#define XFS_IEXT_STARTOFF_MASK xfs_mask64lo(BMBT_STARTOFF_BITLEN) +#define XFS_IEXT_LENGTH_MASK xfs_mask64lo(BMBT_BLOCKCOUNT_BITLEN) +#define XFS_IEXT_STARTBLOCK_MASK xfs_mask64lo(BMBT_STARTBLOCK_BITLEN) + +struct xfs_iext_rec { + uint64_t lo; + uint64_t hi; +}; + +/* + * Given that the length can't be a zero, only an empty hi value indicates an + * unused record. + */ +static bool xfs_iext_rec_is_empty(struct xfs_iext_rec *rec) +{ + return rec->hi == 0; +} + +static inline void xfs_iext_rec_clear(struct xfs_iext_rec *rec) +{ + rec->lo = 0; + rec->hi = 0; +} + +static void +xfs_iext_set( + struct xfs_iext_rec *rec, + struct xfs_bmbt_irec *irec) +{ + ASSERT((irec->br_startoff & ~XFS_IEXT_STARTOFF_MASK) == 0); + ASSERT((irec->br_blockcount & ~XFS_IEXT_LENGTH_MASK) == 0); + ASSERT((irec->br_startblock & ~XFS_IEXT_STARTBLOCK_MASK) == 0); + + rec->lo = irec->br_startoff & XFS_IEXT_STARTOFF_MASK; + rec->hi = irec->br_blockcount & XFS_IEXT_LENGTH_MASK; + + rec->lo |= (irec->br_startblock << 54); + rec->hi |= ((irec->br_startblock & ~xfs_mask64lo(10)) << (22 - 10)); + + if (irec->br_state == XFS_EXT_UNWRITTEN) + rec->hi |= (1 << 21); +} + +static void +xfs_iext_get( + struct xfs_bmbt_irec *irec, + struct xfs_iext_rec *rec) +{ + irec->br_startoff = rec->lo & XFS_IEXT_STARTOFF_MASK; + irec->br_blockcount = rec->hi & XFS_IEXT_LENGTH_MASK; + + irec->br_startblock = rec->lo >> 54; + irec->br_startblock |= (rec->hi & xfs_mask64hi(42)) >> (22 - 10); + + if (rec->hi & (1 << 21)) + irec->br_state = XFS_EXT_UNWRITTEN; + else + irec->br_state = XFS_EXT_NORM; +} + +enum { + NODE_SIZE = 256, + KEYS_PER_NODE = NODE_SIZE / (sizeof(uint64_t) + sizeof(void *)), + RECS_PER_LEAF = (NODE_SIZE - (2 * sizeof(struct xfs_iext_leaf *))) / + sizeof(struct xfs_iext_rec), +}; + +/* + * In-core extent btree block layout: + * + * There are two types of blocks in the btree: leaf and inner (non-leaf) blocks. + * + * The leaf blocks are made up by %KEYS_PER_NODE extent records, which each + * contain the startoffset, blockcount, startblock and unwritten extent flag. + * See above for the exact format, followed by pointers to the previous and next + * leaf blocks (if there are any). + * + * The inner (non-leaf) blocks first contain KEYS_PER_NODE lookup keys, followed + * by an equal number of pointers to the btree blocks at the next lower level. + * + * +-------+-------+-------+-------+-------+----------+----------+ + * Leaf: | rec 1 | rec 2 | rec 3 | rec 4 | rec N | prev-ptr | next-ptr | + * +-------+-------+-------+-------+-------+----------+----------+ + * + * +-------+-------+-------+-------+-------+-------+------+-------+ + * Inner: | key 1 | key 2 | key 3 | key N | ptr 1 | ptr 2 | ptr3 | ptr N | + * +-------+-------+-------+-------+-------+-------+------+-------+ + */ +struct xfs_iext_node { + uint64_t keys[KEYS_PER_NODE]; +#define XFS_IEXT_KEY_INVALID (1ULL << 63) + void *ptrs[KEYS_PER_NODE]; +}; + +struct xfs_iext_leaf { + struct xfs_iext_rec recs[RECS_PER_LEAF]; + struct xfs_iext_leaf *prev; + struct xfs_iext_leaf *next; +}; + +inline xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp) +{ + return ifp->if_bytes / sizeof(struct xfs_iext_rec); +} + +static inline int xfs_iext_max_recs(struct xfs_ifork *ifp) +{ + if (ifp->if_height == 1) + return xfs_iext_count(ifp); + return RECS_PER_LEAF; +} + +static inline struct xfs_iext_rec *cur_rec(struct xfs_iext_cursor *cur) +{ + return &cur->leaf->recs[cur->pos]; +} + +static inline bool xfs_iext_valid(struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur) +{ + if (!cur->leaf) + return false; + if (cur->pos < 0 || cur->pos >= xfs_iext_max_recs(ifp)) + return false; + if (xfs_iext_rec_is_empty(cur_rec(cur))) + return false; + return true; +} + +static void * +xfs_iext_find_first_leaf( + struct xfs_ifork *ifp) +{ + struct xfs_iext_node *node = ifp->if_u1.if_root; + int height; + + if (!ifp->if_height) + return NULL; + + for (height = ifp->if_height; height > 1; height--) { + node = node->ptrs[0]; + ASSERT(node); + } + + return node; +} + +static void * +xfs_iext_find_last_leaf( + struct xfs_ifork *ifp) +{ + struct xfs_iext_node *node = ifp->if_u1.if_root; + int height, i; + + if (!ifp->if_height) + return NULL; + + for (height = ifp->if_height; height > 1; height--) { + for (i = 1; i < KEYS_PER_NODE; i++) + if (!node->ptrs[i]) + break; + node = node->ptrs[i - 1]; + ASSERT(node); + } + + return node; +} + +void +xfs_iext_first( + struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur) +{ + cur->pos = 0; + cur->leaf = xfs_iext_find_first_leaf(ifp); +} + +void +xfs_iext_last( + struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur) +{ + int i; + + cur->leaf = xfs_iext_find_last_leaf(ifp); + if (!cur->leaf) { + cur->pos = 0; + return; + } + + for (i = 1; i < xfs_iext_max_recs(ifp); i++) { + if (xfs_iext_rec_is_empty(&cur->leaf->recs[i])) + break; + } + cur->pos = i - 1; +} + +void +xfs_iext_next( + struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur) +{ + if (!cur->leaf) { + ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF); + xfs_iext_first(ifp, cur); + return; + } + + ASSERT(cur->pos >= 0); + ASSERT(cur->pos < xfs_iext_max_recs(ifp)); + + cur->pos++; + if (ifp->if_height > 1 && !xfs_iext_valid(ifp, cur) && + cur->leaf->next) { + cur->leaf = cur->leaf->next; + cur->pos = 0; + } +} + +void +xfs_iext_prev( + struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur) +{ + if (!cur->leaf) { + ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF); + xfs_iext_last(ifp, cur); + return; + } + + ASSERT(cur->pos >= 0); + ASSERT(cur->pos <= RECS_PER_LEAF); + +recurse: + do { + cur->pos--; + if (xfs_iext_valid(ifp, cur)) + return; + } while (cur->pos > 0); + + if (ifp->if_height > 1 && cur->leaf->prev) { + cur->leaf = cur->leaf->prev; + cur->pos = RECS_PER_LEAF; + goto recurse; + } +} + +static inline int +xfs_iext_key_cmp( + struct xfs_iext_node *node, + int n, + xfs_fileoff_t offset) +{ + if (node->keys[n] > offset) + return 1; + if (node->keys[n] < offset) + return -1; + return 0; +} + +static inline int +xfs_iext_rec_cmp( + struct xfs_iext_rec *rec, + xfs_fileoff_t offset) +{ + uint64_t rec_offset = rec->lo & XFS_IEXT_STARTOFF_MASK; + u32 rec_len = rec->hi & XFS_IEXT_LENGTH_MASK; + + if (rec_offset > offset) + return 1; + if (rec_offset + rec_len <= offset) + return -1; + return 0; +} + +static void * +xfs_iext_find_level( + struct xfs_ifork *ifp, + xfs_fileoff_t offset, + int level) +{ + struct xfs_iext_node *node = ifp->if_u1.if_root; + int height, i; + + if (!ifp->if_height) + return NULL; + + for (height = ifp->if_height; height > level; height--) { + for (i = 1; i < KEYS_PER_NODE; i++) + if (xfs_iext_key_cmp(node, i, offset) > 0) + break; + + node = node->ptrs[i - 1]; + if (!node) + break; + } + + return node; +} + +static int +xfs_iext_node_pos( + struct xfs_iext_node *node, + xfs_fileoff_t offset) +{ + int i; + + for (i = 1; i < KEYS_PER_NODE; i++) { + if (xfs_iext_key_cmp(node, i, offset) > 0) + break; + } + + return i - 1; +} + +static int +xfs_iext_node_insert_pos( + struct xfs_iext_node *node, + xfs_fileoff_t offset) +{ + int i; + + for (i = 0; i < KEYS_PER_NODE; i++) { + if (xfs_iext_key_cmp(node, i, offset) > 0) + return i; + } + + return KEYS_PER_NODE; +} + +static int +xfs_iext_node_nr_entries( + struct xfs_iext_node *node, + int start) +{ + int i; + + for (i = start; i < KEYS_PER_NODE; i++) { + if (node->keys[i] == XFS_IEXT_KEY_INVALID) + break; + } + + return i; +} + +static int +xfs_iext_leaf_nr_entries( + struct xfs_ifork *ifp, + struct xfs_iext_leaf *leaf, + int start) +{ + int i; + + for (i = start; i < xfs_iext_max_recs(ifp); i++) { + if (xfs_iext_rec_is_empty(&leaf->recs[i])) + break; + } + + return i; +} + +static inline uint64_t +xfs_iext_leaf_key( + struct xfs_iext_leaf *leaf, + int n) +{ + return leaf->recs[n].lo & XFS_IEXT_STARTOFF_MASK; +} + +static void +xfs_iext_grow( + struct xfs_ifork *ifp) +{ + struct xfs_iext_node *node = kmem_zalloc(NODE_SIZE, KM_NOFS); + int i; + + if (ifp->if_height == 1) { + struct xfs_iext_leaf *prev = ifp->if_u1.if_root; + + node->keys[0] = xfs_iext_leaf_key(prev, 0); + node->ptrs[0] = prev; + } else { + struct xfs_iext_node *prev = ifp->if_u1.if_root; + + ASSERT(ifp->if_height > 1); + + node->keys[0] = prev->keys[0]; + node->ptrs[0] = prev; + } + + for (i = 1; i < KEYS_PER_NODE; i++) + node->keys[i] = XFS_IEXT_KEY_INVALID; + + ifp->if_u1.if_root = node; + ifp->if_height++; +} + +static void +xfs_iext_update_node( + struct xfs_ifork *ifp, + xfs_fileoff_t old_offset, + xfs_fileoff_t new_offset, + int level, + void *ptr) +{ + struct xfs_iext_node *node = ifp->if_u1.if_root; + int height, i; + + for (height = ifp->if_height; height > level; height--) { + for (i = 0; i < KEYS_PER_NODE; i++) { + if (i > 0 && xfs_iext_key_cmp(node, i, old_offset) > 0) + break; + if (node->keys[i] == old_offset) + node->keys[i] = new_offset; + } + node = node->ptrs[i - 1]; + ASSERT(node); + } + + ASSERT(node == ptr); +} + +static struct xfs_iext_node * +xfs_iext_split_node( + struct xfs_iext_node **nodep, + int *pos, + int *nr_entries) +{ + struct xfs_iext_node *node = *nodep; + struct xfs_iext_node *new = kmem_zalloc(NODE_SIZE, KM_NOFS); + const int nr_move = KEYS_PER_NODE / 2; + int nr_keep = nr_move + (KEYS_PER_NODE & 1); + int i = 0; + + /* for sequential append operations just spill over into the new node */ + if (*pos == KEYS_PER_NODE) { + *nodep = new; + *pos = 0; + *nr_entries = 0; + goto done; + } + + + for (i = 0; i < nr_move; i++) { + new->keys[i] = node->keys[nr_keep + i]; + new->ptrs[i] = node->ptrs[nr_keep + i]; + + node->keys[nr_keep + i] = XFS_IEXT_KEY_INVALID; + node->ptrs[nr_keep + i] = NULL; + } + + if (*pos >= nr_keep) { + *nodep = new; + *pos -= nr_keep; + *nr_entries = nr_move; + } else { + *nr_entries = nr_keep; + } +done: + for (; i < KEYS_PER_NODE; i++) + new->keys[i] = XFS_IEXT_KEY_INVALID; + return new; +} + +static void +xfs_iext_insert_node( + struct xfs_ifork *ifp, + uint64_t offset, + void *ptr, + int level) +{ + struct xfs_iext_node *node, *new; + int i, pos, nr_entries; + +again: + if (ifp->if_height < level) + xfs_iext_grow(ifp); + + new = NULL; + node = xfs_iext_find_level(ifp, offset, level); + pos = xfs_iext_node_insert_pos(node, offset); + nr_entries = xfs_iext_node_nr_entries(node, pos); + + ASSERT(pos >= nr_entries || xfs_iext_key_cmp(node, pos, offset) != 0); + ASSERT(nr_entries <= KEYS_PER_NODE); + + if (nr_entries == KEYS_PER_NODE) + new = xfs_iext_split_node(&node, &pos, &nr_entries); + + /* + * Update the pointers in higher levels if the first entry changes + * in an existing node. + */ + if (node != new && pos == 0 && nr_entries > 0) + xfs_iext_update_node(ifp, node->keys[0], offset, level, node); + + for (i = nr_entries; i > pos; i--) { + node->keys[i] = node->keys[i - 1]; + node->ptrs[i] = node->ptrs[i - 1]; + } + node->keys[pos] = offset; + node->ptrs[pos] = ptr; + + if (new) { + offset = new->keys[0]; + ptr = new; + level++; + goto again; + } +} + +static struct xfs_iext_leaf * +xfs_iext_split_leaf( + struct xfs_iext_cursor *cur, + int *nr_entries) +{ + struct xfs_iext_leaf *leaf = cur->leaf; + struct xfs_iext_leaf *new = kmem_zalloc(NODE_SIZE, KM_NOFS); + const int nr_move = RECS_PER_LEAF / 2; + int nr_keep = nr_move + (RECS_PER_LEAF & 1); + int i; + + /* for sequential append operations just spill over into the new node */ + if (cur->pos == RECS_PER_LEAF) { + cur->leaf = new; + cur->pos = 0; + *nr_entries = 0; + goto done; + } + + for (i = 0; i < nr_move; i++) { + new->recs[i] = leaf->recs[nr_keep + i]; + xfs_iext_rec_clear(&leaf->recs[nr_keep + i]); + } + + if (cur->pos >= nr_keep) { + cur->leaf = new; + cur->pos -= nr_keep; + *nr_entries = nr_move; + } else { + *nr_entries = nr_keep; + } +done: + if (leaf->next) + leaf->next->prev = new; + new->next = leaf->next; + new->prev = leaf; + leaf->next = new; + return new; +} + +static void +xfs_iext_alloc_root( + struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur) +{ + ASSERT(ifp->if_bytes == 0); + + ifp->if_u1.if_root = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS); + ifp->if_height = 1; + + /* now that we have a node step into it */ + cur->leaf = ifp->if_u1.if_root; + cur->pos = 0; +} + +static void +xfs_iext_realloc_root( + struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur) +{ + size_t new_size = ifp->if_bytes + sizeof(struct xfs_iext_rec); + void *new; + + /* account for the prev/next pointers */ + if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF) + new_size = NODE_SIZE; + + new = kmem_realloc(ifp->if_u1.if_root, new_size, KM_NOFS); + memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes); + ifp->if_u1.if_root = new; + cur->leaf = new; +} + +void +xfs_iext_insert( + struct xfs_inode *ip, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *irec, + int state) +{ + struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); + xfs_fileoff_t offset = irec->br_startoff; + struct xfs_iext_leaf *new = NULL; + int nr_entries, i; + + trace_xfs_iext_insert(ip, cur, state, _RET_IP_); + + if (ifp->if_height == 0) + xfs_iext_alloc_root(ifp, cur); + else if (ifp->if_height == 1) + xfs_iext_realloc_root(ifp, cur); + + nr_entries = xfs_iext_leaf_nr_entries(ifp, cur->leaf, cur->pos); + ASSERT(nr_entries <= RECS_PER_LEAF); + ASSERT(cur->pos >= nr_entries || + xfs_iext_rec_cmp(cur_rec(cur), irec->br_startoff) != 0); + + if (nr_entries == RECS_PER_LEAF) + new = xfs_iext_split_leaf(cur, &nr_entries); + + /* + * Update the pointers in higher levels if the first entry changes + * in an existing node. + */ + if (cur->leaf != new && cur->pos == 0 && nr_entries > 0) { + xfs_iext_update_node(ifp, xfs_iext_leaf_key(cur->leaf, 0), + offset, 1, cur->leaf); + } + + for (i = nr_entries; i > cur->pos; i--) + cur->leaf->recs[i] = cur->leaf->recs[i - 1]; + xfs_iext_set(cur_rec(cur), irec); + ifp->if_bytes += sizeof(struct xfs_iext_rec); + + if (new) + xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2); +} + +static struct xfs_iext_node * +xfs_iext_rebalance_node( + struct xfs_iext_node *parent, + int *pos, + struct xfs_iext_node *node, + int nr_entries) +{ + /* + * If the neighbouring nodes are completely full, or have different + * parents, we might never be able to merge our node, and will only + * delete it once the number of entries hits zero. + */ + if (nr_entries == 0) + return node; + + if (*pos > 0) { + struct xfs_iext_node *prev = parent->ptrs[*pos - 1]; + int nr_prev = xfs_iext_node_nr_entries(prev, 0), i; + + if (nr_prev + nr_entries <= KEYS_PER_NODE) { + for (i = 0; i < nr_entries; i++) { + prev->keys[nr_prev + i] = node->keys[i]; + prev->ptrs[nr_prev + i] = node->ptrs[i]; + } + return node; + } + } + + if (*pos + 1 < xfs_iext_node_nr_entries(parent, *pos)) { + struct xfs_iext_node *next = parent->ptrs[*pos + 1]; + int nr_next = xfs_iext_node_nr_entries(next, 0), i; + + if (nr_entries + nr_next <= KEYS_PER_NODE) { + /* + * Merge the next node into this node so that we don't + * have to do an additional update of the keys in the + * higher levels. + */ + for (i = 0; i < nr_next; i++) { + node->keys[nr_entries + i] = next->keys[i]; + node->ptrs[nr_entries + i] = next->ptrs[i]; + } + + ++*pos; + return next; + } + } + + return NULL; +} + +static void +xfs_iext_remove_node( + struct xfs_ifork *ifp, + xfs_fileoff_t offset, + void *victim) +{ + struct xfs_iext_node *node, *parent; + int level = 2, pos, nr_entries, i; + + ASSERT(level <= ifp->if_height); + node = xfs_iext_find_level(ifp, offset, level); + pos = xfs_iext_node_pos(node, offset); +again: + ASSERT(node->ptrs[pos]); + ASSERT(node->ptrs[pos] == victim); + kmem_free(victim); + + nr_entries = xfs_iext_node_nr_entries(node, pos) - 1; + offset = node->keys[0]; + for (i = pos; i < nr_entries; i++) { + node->keys[i] = node->keys[i + 1]; + node->ptrs[i] = node->ptrs[i + 1]; + } + node->keys[nr_entries] = XFS_IEXT_KEY_INVALID; + node->ptrs[nr_entries] = NULL; + + if (pos == 0 && nr_entries > 0) { + xfs_iext_update_node(ifp, offset, node->keys[0], level, node); + offset = node->keys[0]; + } + + if (nr_entries >= KEYS_PER_NODE / 2) + return; + + if (level < ifp->if_height) { + /* + * If we aren't at the root yet try to find a neighbour node to + * merge with (or delete the node if it is empty), and then + * recurse up to the next level. + */ + level++; + parent = xfs_iext_find_level(ifp, offset, level); + pos = xfs_iext_node_pos(parent, offset); + + ASSERT(pos != KEYS_PER_NODE); + ASSERT(parent->ptrs[pos] == node); + + node = xfs_iext_rebalance_node(parent, &pos, node, nr_entries); + if (node) { + victim = node; + node = parent; + goto again; + } + } else if (nr_entries == 1) { + /* + * If we are at the root and only one entry is left we can just + * free this node and update the root pointer. + */ + ASSERT(node == ifp->if_u1.if_root); + ifp->if_u1.if_root = node->ptrs[0]; + ifp->if_height--; + kmem_free(node); + } +} + +static void +xfs_iext_rebalance_leaf( + struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, + struct xfs_iext_leaf *leaf, + xfs_fileoff_t offset, + int nr_entries) +{ + /* + * If the neighbouring nodes are completely full we might never be able + * to merge our node, and will only delete it once the number of + * entries hits zero. + */ + if (nr_entries == 0) + goto remove_node; + + if (leaf->prev) { + int nr_prev = xfs_iext_leaf_nr_entries(ifp, leaf->prev, 0), i; + + if (nr_prev + nr_entries <= RECS_PER_LEAF) { + for (i = 0; i < nr_entries; i++) + leaf->prev->recs[nr_prev + i] = leaf->recs[i]; + + if (cur->leaf == leaf) { + cur->leaf = leaf->prev; + cur->pos += nr_prev; + } + goto remove_node; + } + } + + if (leaf->next) { + int nr_next = xfs_iext_leaf_nr_entries(ifp, leaf->next, 0), i; + + if (nr_entries + nr_next <= RECS_PER_LEAF) { + /* + * Merge the next node into this node so that we don't + * have to do an additional update of the keys in the + * higher levels. + */ + for (i = 0; i < nr_next; i++) { + leaf->recs[nr_entries + i] = + leaf->next->recs[i]; + } + + if (cur->leaf == leaf->next) { + cur->leaf = leaf; + cur->pos += nr_entries; + } + + offset = xfs_iext_leaf_key(leaf->next, 0); + leaf = leaf->next; + goto remove_node; + } + } + + return; +remove_node: + if (leaf->prev) + leaf->prev->next = leaf->next; + if (leaf->next) + leaf->next->prev = leaf->prev; + xfs_iext_remove_node(ifp, offset, leaf); +} + +static void +xfs_iext_free_last_leaf( + struct xfs_ifork *ifp) +{ + ifp->if_u1.if_root = NULL; + ifp->if_height--; + kmem_free(ifp->if_u1.if_root); +} + +void +xfs_iext_remove( + struct xfs_inode *ip, + struct xfs_iext_cursor *cur, + int state) +{ + struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); + struct xfs_iext_leaf *leaf = cur->leaf; + xfs_fileoff_t offset = xfs_iext_leaf_key(leaf, 0); + int i, nr_entries; + + trace_xfs_iext_remove(ip, cur, state, _RET_IP_); + + ASSERT(ifp->if_height > 0); + ASSERT(ifp->if_u1.if_root != NULL); + ASSERT(xfs_iext_valid(ifp, cur)); + + nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1; + for (i = cur->pos; i < nr_entries; i++) + leaf->recs[i] = leaf->recs[i + 1]; + xfs_iext_rec_clear(&leaf->recs[nr_entries]); + ifp->if_bytes -= sizeof(struct xfs_iext_rec); + + if (cur->pos == 0 && nr_entries > 0) { + xfs_iext_update_node(ifp, offset, xfs_iext_leaf_key(leaf, 0), 1, + leaf); + offset = xfs_iext_leaf_key(leaf, 0); + } else if (cur->pos == nr_entries) { + if (ifp->if_height > 1 && leaf->next) + cur->leaf = leaf->next; + else + cur->leaf = NULL; + cur->pos = 0; + } + + if (nr_entries >= RECS_PER_LEAF / 2) + return; + + if (ifp->if_height > 1) + xfs_iext_rebalance_leaf(ifp, cur, leaf, offset, nr_entries); + else if (nr_entries == 0) + xfs_iext_free_last_leaf(ifp); +} + +/* + * Lookup the extent covering bno. + * + * If there is an extent covering bno return the extent index, and store the + * expanded extent structure in *gotp, and the extent cursor in *cur. + * If there is no extent covering bno, but there is an extent after it (e.g. + * it lies in a hole) return that extent in *gotp and its cursor in *cur + * instead. + * If bno is beyond the last extent return false, and return an invalid + * cursor value. + */ +bool +xfs_iext_lookup_extent( + struct xfs_inode *ip, + struct xfs_ifork *ifp, + xfs_fileoff_t offset, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *gotp) +{ + XFS_STATS_INC(ip->i_mount, xs_look_exlist); + + cur->leaf = xfs_iext_find_level(ifp, offset, 1); + if (!cur->leaf) { + cur->pos = 0; + return false; + } + + for (cur->pos = 0; cur->pos < xfs_iext_max_recs(ifp); cur->pos++) { + struct xfs_iext_rec *rec = cur_rec(cur); + + if (xfs_iext_rec_is_empty(rec)) + break; + if (xfs_iext_rec_cmp(rec, offset) >= 0) + goto found; + } + + /* Try looking in the next node for an entry > offset */ + if (ifp->if_height == 1 || !cur->leaf->next) + return false; + cur->leaf = cur->leaf->next; + cur->pos = 0; + if (!xfs_iext_valid(ifp, cur)) + return false; +found: + xfs_iext_get(gotp, cur_rec(cur)); + return true; +} + +/* + * Returns the last extent before end, and if this extent doesn't cover + * end, update end to the end of the extent. + */ +bool +xfs_iext_lookup_extent_before( + struct xfs_inode *ip, + struct xfs_ifork *ifp, + xfs_fileoff_t *end, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *gotp) +{ + /* could be optimized to not even look up the next on a match.. */ + if (xfs_iext_lookup_extent(ip, ifp, *end - 1, cur, gotp) && + gotp->br_startoff <= *end - 1) + return true; + if (!xfs_iext_prev_extent(ifp, cur, gotp)) + return false; + *end = gotp->br_startoff + gotp->br_blockcount; + return true; +} + +void +xfs_iext_update_extent( + struct xfs_inode *ip, + int state, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *new) +{ + struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); + + if (cur->pos == 0) { + struct xfs_bmbt_irec old; + + xfs_iext_get(&old, cur_rec(cur)); + if (new->br_startoff != old.br_startoff) { + xfs_iext_update_node(ifp, old.br_startoff, + new->br_startoff, 1, cur->leaf); + } + } + + trace_xfs_bmap_pre_update(ip, cur, state, _RET_IP_); + xfs_iext_set(cur_rec(cur), new); + trace_xfs_bmap_post_update(ip, cur, state, _RET_IP_); +} + +/* + * Return true if the cursor points at an extent and return the extent structure + * in gotp. Else return false. + */ +bool +xfs_iext_get_extent( + struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *gotp) +{ + if (!xfs_iext_valid(ifp, cur)) + return false; + xfs_iext_get(gotp, cur_rec(cur)); + return true; +} + +/* + * This is a recursive function, because of that we need to be extremely + * careful with stack usage. + */ +static void +xfs_iext_destroy_node( + struct xfs_iext_node *node, + int level) +{ + int i; + + if (level > 1) { + for (i = 0; i < KEYS_PER_NODE; i++) { + if (node->keys[i] == XFS_IEXT_KEY_INVALID) + break; + xfs_iext_destroy_node(node->ptrs[i], level - 1); + } + } + + kmem_free(node); +} + +void +xfs_iext_destroy( + struct xfs_ifork *ifp) +{ + xfs_iext_destroy_node(ifp->if_u1.if_root, ifp->if_height); + + ifp->if_bytes = 0; + ifp->if_height = 0; + ifp->if_u1.if_root = NULL; +} diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 378f8fbc91a7..6b7989038d75 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -24,6 +24,7 @@ #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_inode.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_cksum.h" #include "xfs_icache.h" diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 31840ca24018..1c90ec41e9df 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -42,21 +42,27 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); +static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev) +{ + return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev)); +} + /* - * Move inode type and inode format specific information from the - * on-disk inode to the in-core inode. For fifos, devs, and sockets - * this means set if_rdev to the proper value. For files, directories, - * and symlinks this means to bring in the in-line data or extent - * pointers. For a file in B-tree format, only the root is immediately - * brought in-core. The rest will be in-lined in if_extents when it - * is first referenced (see xfs_iread_extents()). + * Copy inode type and data and attr format specific information from the + * on-disk inode to the in-core inode and fork structures. For fifos, devices, + * and sockets this means set i_rdev to the proper value. For files, + * directories, and symlinks this means to bring in the in-line data or extent + * pointers as well as the attribute fork. For a fork in B-tree format, only + * the root is immediately brought in-core. The rest will be read in later when + * first referenced (see xfs_iread_extents()). */ int xfs_iformat_fork( - xfs_inode_t *ip, - xfs_dinode_t *dip) + struct xfs_inode *ip, + struct xfs_dinode *dip) { - xfs_attr_shortform_t *atp; + struct inode *inode = VFS_I(ip); + struct xfs_attr_shortform *atp; int size; int error = 0; xfs_fsize_t di_size; @@ -95,8 +101,7 @@ xfs_iformat_fork( return -EFSCORRUPTED; } - if (unlikely(xfs_is_reflink_inode(ip) && - (VFS_I(ip)->i_mode & S_IFMT) != S_IFREG)) { + if (unlikely(xfs_is_reflink_inode(ip) && !S_ISREG(inode->i_mode))) { xfs_warn(ip->i_mount, "corrupt dinode %llu, wrong file type for reflink.", ip->i_ino); @@ -115,7 +120,7 @@ xfs_iformat_fork( return -EFSCORRUPTED; } - switch (VFS_I(ip)->i_mode & S_IFMT) { + switch (inode->i_mode & S_IFMT) { case S_IFIFO: case S_IFCHR: case S_IFBLK: @@ -126,7 +131,7 @@ xfs_iformat_fork( return -EFSCORRUPTED; } ip->i_d.di_size = 0; - ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); + inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip)); break; case S_IFREG: @@ -184,8 +189,7 @@ xfs_iformat_fork( return error; /* Check inline dir contents. */ - if (S_ISDIR(VFS_I(ip)->i_mode) && - dip->di_format == XFS_DINODE_FMT_LOCAL) { + if (S_ISDIR(inode->i_mode) && dip->di_format == XFS_DINODE_FMT_LOCAL) { error = xfs_dir2_sf_verify(ip); if (error) { xfs_idestroy_fork(ip, XFS_DATA_FORK); @@ -265,19 +269,14 @@ xfs_init_local_fork( if (zero_terminate) mem_size++; - if (size == 0) - ifp->if_u1.if_data = NULL; - else if (mem_size <= sizeof(ifp->if_u2.if_inline_data)) - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - else { + if (size) { real_size = roundup(mem_size, 4); ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); - } - - if (size) { memcpy(ifp->if_u1.if_data, data, size); if (zero_terminate) ifp->if_u1.if_data[size] = '\0'; + } else { + ifp->if_u1.if_data = NULL; } ifp->if_bytes = size; @@ -288,13 +287,6 @@ xfs_init_local_fork( /* * The file is in-lined in the on-disk inode. - * If it fits into if_inline_data, then copy - * it there, otherwise allocate a buffer for it - * and copy the data there. Either way, set - * if_data to point at the data. - * If we allocate a buffer for the data, make - * sure that its size is a multiple of 4 and - * record the real size in i_real_bytes. */ STATIC int xfs_iformat_local( @@ -324,9 +316,7 @@ xfs_iformat_local( /* * The file consists of a set of extents all of which fit into the on-disk - * inode. If there are few enough extents to fit into the if_inline_ext, then - * copy them there. Otherwise allocate a buffer for them and copy them into it. - * Either way, set if_extents to point at the extents. + * inode. */ STATIC int xfs_iformat_extents( @@ -336,9 +326,12 @@ xfs_iformat_extents( { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int state = xfs_bmap_fork_to_state(whichfork); int nex = XFS_DFORK_NEXTENTS(dip, whichfork); int size = nex * sizeof(xfs_bmbt_rec_t); + struct xfs_iext_cursor icur; struct xfs_bmbt_rec *dp; + struct xfs_bmbt_irec new; int i; /* @@ -354,27 +347,25 @@ xfs_iformat_extents( } ifp->if_real_bytes = 0; - if (nex == 0) - ifp->if_u1.if_extents = NULL; - else if (nex <= XFS_INLINE_EXTS) - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; - else - xfs_iext_add(ifp, 0, nex); - - ifp->if_bytes = size; + ifp->if_bytes = 0; + ifp->if_u1.if_root = NULL; + ifp->if_height = 0; if (size) { dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); + + xfs_iext_first(ifp, &icur); for (i = 0; i < nex; i++, dp++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); - ep->l0 = get_unaligned_be64(&dp->l0); - ep->l1 = get_unaligned_be64(&dp->l1); - if (!xfs_bmbt_validate_extent(mp, whichfork, ep)) { + xfs_bmbt_disk_get_all(dp, &new); + if (!xfs_bmbt_validate_extent(mp, whichfork, &new)) { XFS_ERROR_REPORT("xfs_iformat_extents(2)", XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } + + xfs_iext_insert(ip, &icur, &new, state); + trace_xfs_read_extent(ip, &icur, state, _THIS_IP_); + xfs_iext_next(ifp, &icur); } - XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); } ifp->if_flags |= XFS_IFEXTENTS; return 0; @@ -440,47 +431,14 @@ xfs_iformat_btree( ifp->if_flags &= ~XFS_IFEXTENTS; ifp->if_flags |= XFS_IFBROOT; + ifp->if_real_bytes = 0; + ifp->if_bytes = 0; + ifp->if_u1.if_root = NULL; + ifp->if_height = 0; return 0; } /* - * Read in extents from a btree-format inode. - * Allocate and fill in if_extents. Real work is done in xfs_bmap.c. - */ -int -xfs_iread_extents( - xfs_trans_t *tp, - xfs_inode_t *ip, - int whichfork) -{ - int error; - xfs_ifork_t *ifp; - xfs_extnum_t nextents; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { - XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, - ip->i_mount); - return -EFSCORRUPTED; - } - nextents = XFS_IFORK_NEXTENTS(ip, whichfork); - ifp = XFS_IFORK_PTR(ip, whichfork); - - /* - * We know that the size is valid (it's checked in iformat_btree) - */ - ifp->if_bytes = ifp->if_real_bytes = 0; - xfs_iext_add(ifp, 0, nextents); - error = xfs_bmap_read_extents(tp, ip, whichfork); - if (error) { - xfs_iext_destroy(ifp); - return error; - } - ifp->if_flags |= XFS_IFEXTENTS; - return 0; -} -/* * Reallocate the space for if_broot based on the number of records * being added or deleted as indicated in rec_diff. Move the records * and pointers in if_broot to fit the new size. When shrinking this @@ -644,26 +602,9 @@ xfs_idata_realloc( ASSERT(new_size >= 0); if (new_size == 0) { - if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { - kmem_free(ifp->if_u1.if_data); - } + kmem_free(ifp->if_u1.if_data); ifp->if_u1.if_data = NULL; real_size = 0; - } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { - /* - * If the valid extents/data can fit in if_inline_ext/data, - * copy them from the malloc'd vector and free it. - */ - if (ifp->if_u1.if_data == NULL) { - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { - ASSERT(ifp->if_real_bytes != 0); - memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, - new_size); - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - } - real_size = 0; } else { /* * Stuck with malloc/realloc. @@ -677,7 +618,7 @@ xfs_idata_realloc( ASSERT(ifp->if_real_bytes == 0); ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); - } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + } else { /* * Only do the realloc if the underlying size * is really changing. @@ -688,12 +629,6 @@ xfs_idata_realloc( real_size, KM_SLEEP | KM_NOFS); } - } else { - ASSERT(ifp->if_real_bytes == 0); - ifp->if_u1.if_data = kmem_alloc(real_size, - KM_SLEEP | KM_NOFS); - memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, - ifp->if_bytes); } } ifp->if_real_bytes = real_size; @@ -721,23 +656,18 @@ xfs_idestroy_fork( * so check and free it up if we do. */ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && - (ifp->if_u1.if_data != NULL)) { + if (ifp->if_u1.if_data != NULL) { ASSERT(ifp->if_real_bytes != 0); kmem_free(ifp->if_u1.if_data); ifp->if_u1.if_data = NULL; ifp->if_real_bytes = 0; } - } else if ((ifp->if_flags & XFS_IFEXTENTS) && - ((ifp->if_flags & XFS_IFEXTIREC) || - ((ifp->if_u1.if_extents != NULL) && - (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) { - ASSERT(ifp->if_real_bytes != 0); + } else if ((ifp->if_flags & XFS_IFEXTENTS) && ifp->if_height) { xfs_iext_destroy(ifp); } - ASSERT(ifp->if_u1.if_extents == NULL || - ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); + ASSERT(ifp->if_real_bytes == 0); + if (whichfork == XFS_ATTR_FORK) { kmem_zone_free(xfs_ifork_zone, ip->i_afp); ip->i_afp = NULL; @@ -747,19 +677,9 @@ xfs_idestroy_fork( } } -/* Count number of incore extents based on if_bytes */ -xfs_extnum_t -xfs_iext_count(struct xfs_ifork *ifp) -{ - return ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); -} - /* * Convert in-core extents to on-disk form * - * For either the data or attr fork in extent format, we need to endian convert - * the in-core extent as we place them into the on-disk inode. - * * In the case of the data fork, the in-core and on-disk fork sizes can be * different due to delayed allocation extents. We only copy on-disk extents * here, so callers must always use the physical fork size to determine the @@ -768,53 +688,32 @@ xfs_iext_count(struct xfs_ifork *ifp) */ int xfs_iextents_copy( - xfs_inode_t *ip, - xfs_bmbt_rec_t *dp, + struct xfs_inode *ip, + struct xfs_bmbt_rec *dp, int whichfork) { - int copied; - int i; - xfs_ifork_t *ifp; - int nrecs; - xfs_fsblock_t start_block; + int state = xfs_bmap_fork_to_state(whichfork); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec rec; + int copied = 0; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); ASSERT(ifp->if_bytes > 0); - nrecs = xfs_iext_count(ifp); - XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); - ASSERT(nrecs > 0); - - /* - * There are some delayed allocation extents in the - * inode, so copy the extents one at a time and skip - * the delayed ones. There must be at least one - * non-delayed extent. - */ - copied = 0; - for (i = 0; i < nrecs; i++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); - - ASSERT(xfs_bmbt_validate_extent(ip->i_mount, whichfork, ep)); - - start_block = xfs_bmbt_get_startblock(ep); - if (isnullstartblock(start_block)) { - /* - * It's a delayed allocation extent, so skip it. - */ + for_each_xfs_iext(ifp, &icur, &rec) { + if (isnullstartblock(rec.br_startblock)) continue; - } - - /* Translate to on disk format */ - put_unaligned_be64(ep->l0, &dp->l0); - put_unaligned_be64(ep->l1, &dp->l1); + ASSERT(xfs_bmbt_validate_extent(ip->i_mount, whichfork, &rec)); + xfs_bmbt_disk_set_all(dp, &rec); + trace_xfs_write_extent(ip, &icur, state, _RET_IP_); + copied += sizeof(struct xfs_bmbt_rec); dp++; - copied++; } - ASSERT(copied != 0); - return (copied * (uint)sizeof(xfs_bmbt_rec_t)); + ASSERT(copied > 0); + ASSERT(copied <= ifp->if_bytes); + return copied; } /* @@ -872,7 +771,6 @@ xfs_iflush_fork( !(iip->ili_fields & extflag[whichfork])); if ((iip->ili_fields & extflag[whichfork]) && (ifp->if_bytes > 0)) { - ASSERT(xfs_iext_get_ext(ifp, 0)); ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, whichfork); @@ -894,16 +792,7 @@ xfs_iflush_fork( case XFS_DINODE_FMT_DEV: if (iip->ili_fields & XFS_ILOG_DEV) { ASSERT(whichfork == XFS_DATA_FORK); - xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); - } - break; - - case XFS_DINODE_FMT_UUID: - if (iip->ili_fields & XFS_ILOG_UUID) { - ASSERT(whichfork == XFS_DATA_FORK); - memcpy(XFS_DFORK_DPTR(dip), - &ip->i_df.if_u2.if_uuid, - sizeof(uuid_t)); + xfs_dinode_put_rdev(dip, sysv_encode_dev(VFS_I(ip)->i_rdev)); } break; @@ -913,33 +802,6 @@ xfs_iflush_fork( } } -/* - * Return a pointer to the extent record at file index idx. - */ -xfs_bmbt_rec_host_t * -xfs_iext_get_ext( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx) /* index of target extent */ -{ - ASSERT(idx >= 0); - ASSERT(idx < xfs_iext_count(ifp)); - - if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { - return ifp->if_u1.if_ext_irec->er_extbuf; - } else if (ifp->if_flags & XFS_IFEXTIREC) { - xfs_ext_irec_t *erp; /* irec pointer */ - int erp_idx = 0; /* irec index */ - xfs_extnum_t page_idx = idx; /* ext index in target list */ - - erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); - return &erp->er_extbuf[page_idx]; - } else if (ifp->if_bytes) { - return &ifp->if_u1.if_extents[idx]; - } else { - return NULL; - } -} - /* Convert bmap state flags to an inode fork. */ struct xfs_ifork * xfs_iext_state_to_fork( @@ -954,1011 +816,6 @@ xfs_iext_state_to_fork( } /* - * Insert new item(s) into the extent records for incore inode - * fork 'ifp'. 'count' new items are inserted at index 'idx'. - */ -void -xfs_iext_insert( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* starting index of new items */ - xfs_extnum_t count, /* number of inserted items */ - xfs_bmbt_irec_t *new, /* items to insert */ - int state) /* type of extent conversion */ -{ - xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state); - xfs_extnum_t i; /* extent record index */ - - trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); - - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - xfs_iext_add(ifp, idx, count); - for (i = idx; i < idx + count; i++, new++) - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new); -} - -/* - * This is called when the amount of space required for incore file - * extents needs to be increased. The ext_diff parameter stores the - * number of new extents being added and the idx parameter contains - * the extent index where the new extents will be added. If the new - * extents are being appended, then we just need to (re)allocate and - * initialize the space. Otherwise, if the new extents are being - * inserted into the middle of the existing entries, a bit more work - * is required to make room for the new extents to be inserted. The - * caller is responsible for filling in the new extent entries upon - * return. - */ -void -xfs_iext_add( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin adding exts */ - int ext_diff) /* number of extents to add */ -{ - int byte_diff; /* new bytes being added */ - int new_size; /* size of extents after adding */ - xfs_extnum_t nextents; /* number of extents in file */ - - nextents = xfs_iext_count(ifp); - ASSERT((idx >= 0) && (idx <= nextents)); - byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); - new_size = ifp->if_bytes + byte_diff; - /* - * If the new number of extents (nextents + ext_diff) - * fits inside the inode, then continue to use the inline - * extent buffer. - */ - if (nextents + ext_diff <= XFS_INLINE_EXTS) { - if (idx < nextents) { - memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff], - &ifp->if_u2.if_inline_ext[idx], - (nextents - idx) * sizeof(xfs_bmbt_rec_t)); - memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff); - } - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; - ifp->if_real_bytes = 0; - } - /* - * Otherwise use a linear (direct) extent list. - * If the extents are currently inside the inode, - * xfs_iext_realloc_direct will switch us from - * inline to direct extent allocation mode. - */ - else if (nextents + ext_diff <= XFS_LINEAR_EXTS) { - xfs_iext_realloc_direct(ifp, new_size); - if (idx < nextents) { - memmove(&ifp->if_u1.if_extents[idx + ext_diff], - &ifp->if_u1.if_extents[idx], - (nextents - idx) * sizeof(xfs_bmbt_rec_t)); - memset(&ifp->if_u1.if_extents[idx], 0, byte_diff); - } - } - /* Indirection array */ - else { - xfs_ext_irec_t *erp; - int erp_idx = 0; - int page_idx = idx; - - ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS); - if (ifp->if_flags & XFS_IFEXTIREC) { - erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1); - } else { - xfs_iext_irec_init(ifp); - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - erp = ifp->if_u1.if_ext_irec; - } - /* Extents fit in target extent page */ - if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) { - if (page_idx < erp->er_extcount) { - memmove(&erp->er_extbuf[page_idx + ext_diff], - &erp->er_extbuf[page_idx], - (erp->er_extcount - page_idx) * - sizeof(xfs_bmbt_rec_t)); - memset(&erp->er_extbuf[page_idx], 0, byte_diff); - } - erp->er_extcount += ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); - } - /* Insert a new extent page */ - else if (erp) { - xfs_iext_add_indirect_multi(ifp, - erp_idx, page_idx, ext_diff); - } - /* - * If extent(s) are being appended to the last page in - * the indirection array and the new extent(s) don't fit - * in the page, then erp is NULL and erp_idx is set to - * the next index needed in the indirection array. - */ - else { - uint count = ext_diff; - - while (count) { - erp = xfs_iext_irec_new(ifp, erp_idx); - erp->er_extcount = min(count, XFS_LINEAR_EXTS); - count -= erp->er_extcount; - if (count) - erp_idx++; - } - } - } - ifp->if_bytes = new_size; -} - -/* - * This is called when incore extents are being added to the indirection - * array and the new extents do not fit in the target extent list. The - * erp_idx parameter contains the irec index for the target extent list - * in the indirection array, and the idx parameter contains the extent - * index within the list. The number of extents being added is stored - * in the count parameter. - * - * |-------| |-------| - * | | | | idx - number of extents before idx - * | idx | | count | - * | | | | count - number of extents being inserted at idx - * |-------| |-------| - * | count | | nex2 | nex2 - number of extents after idx + count - * |-------| |-------| - */ -void -xfs_iext_add_indirect_multi( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx, /* target extent irec index */ - xfs_extnum_t idx, /* index within target list */ - int count) /* new extents being added */ -{ - int byte_diff; /* new bytes being added */ - xfs_ext_irec_t *erp; /* pointer to irec entry */ - xfs_extnum_t ext_diff; /* number of extents to add */ - xfs_extnum_t ext_cnt; /* new extents still needed */ - xfs_extnum_t nex2; /* extents after idx + count */ - xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */ - int nlists; /* number of irec's (lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - nex2 = erp->er_extcount - idx; - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - - /* - * Save second part of target extent list - * (all extents past */ - if (nex2) { - byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); - nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS); - memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); - erp->er_extcount -= nex2; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); - memset(&erp->er_extbuf[idx], 0, byte_diff); - } - - /* - * Add the new extents to the end of the target - * list, then allocate new irec record(s) and - * extent buffer(s) as needed to store the rest - * of the new extents. - */ - ext_cnt = count; - ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount); - if (ext_diff) { - erp->er_extcount += ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); - ext_cnt -= ext_diff; - } - while (ext_cnt) { - erp_idx++; - erp = xfs_iext_irec_new(ifp, erp_idx); - ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS); - erp->er_extcount = ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); - ext_cnt -= ext_diff; - } - - /* Add nex2 extents back to indirection array */ - if (nex2) { - xfs_extnum_t ext_avail; - int i; - - byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); - ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; - i = 0; - /* - * If nex2 extents fit in the current page, append - * nex2_ep after the new extents. - */ - if (nex2 <= ext_avail) { - i = erp->er_extcount; - } - /* - * Otherwise, check if space is available in the - * next page. - */ - else if ((erp_idx < nlists - 1) && - (nex2 <= (ext_avail = XFS_LINEAR_EXTS - - ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) { - erp_idx++; - erp++; - /* Create a hole for nex2 extents */ - memmove(&erp->er_extbuf[nex2], erp->er_extbuf, - erp->er_extcount * sizeof(xfs_bmbt_rec_t)); - } - /* - * Final choice, create a new extent page for - * nex2 extents. - */ - else { - erp_idx++; - erp = xfs_iext_irec_new(ifp, erp_idx); - } - memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); - kmem_free(nex2_ep); - erp->er_extcount += nex2; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); - } -} - -/* - * This is called when the amount of space required for incore file - * extents needs to be decreased. The ext_diff parameter stores the - * number of extents to be removed and the idx parameter contains - * the extent index where the extents will be removed from. - * - * If the amount of space needed has decreased below the linear - * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous - * extent array. Otherwise, use kmem_realloc() to adjust the - * size to what is needed. - */ -void -xfs_iext_remove( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index to begin removing exts */ - int ext_diff, /* number of extents to remove */ - int state) /* type of extent conversion */ -{ - xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state); - xfs_extnum_t nextents; /* number of extents in file */ - int new_size; /* size of extents after removal */ - - trace_xfs_iext_remove(ip, idx, state, _RET_IP_); - - ASSERT(ext_diff > 0); - nextents = xfs_iext_count(ifp); - new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); - - if (new_size == 0) { - xfs_iext_destroy(ifp); - } else if (ifp->if_flags & XFS_IFEXTIREC) { - xfs_iext_remove_indirect(ifp, idx, ext_diff); - } else if (ifp->if_real_bytes) { - xfs_iext_remove_direct(ifp, idx, ext_diff); - } else { - xfs_iext_remove_inline(ifp, idx, ext_diff); - } - ifp->if_bytes = new_size; -} - -/* - * This removes ext_diff extents from the inline buffer, beginning - * at extent index idx. - */ -void -xfs_iext_remove_inline( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin removing exts */ - int ext_diff) /* number of extents to remove */ -{ - int nextents; /* number of extents in file */ - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - ASSERT(idx < XFS_INLINE_EXTS); - nextents = xfs_iext_count(ifp); - ASSERT(((nextents - ext_diff) > 0) && - (nextents - ext_diff) < XFS_INLINE_EXTS); - - if (idx + ext_diff < nextents) { - memmove(&ifp->if_u2.if_inline_ext[idx], - &ifp->if_u2.if_inline_ext[idx + ext_diff], - (nextents - (idx + ext_diff)) * - sizeof(xfs_bmbt_rec_t)); - memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff], - 0, ext_diff * sizeof(xfs_bmbt_rec_t)); - } else { - memset(&ifp->if_u2.if_inline_ext[idx], 0, - ext_diff * sizeof(xfs_bmbt_rec_t)); - } -} - -/* - * This removes ext_diff extents from a linear (direct) extent list, - * beginning at extent index idx. If the extents are being removed - * from the end of the list (ie. truncate) then we just need to re- - * allocate the list to remove the extra space. Otherwise, if the - * extents are being removed from the middle of the existing extent - * entries, then we first need to move the extent records beginning - * at idx + ext_diff up in the list to overwrite the records being - * removed, then remove the extra space via kmem_realloc. - */ -void -xfs_iext_remove_direct( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin removing exts */ - int ext_diff) /* number of extents to remove */ -{ - xfs_extnum_t nextents; /* number of extents in file */ - int new_size; /* size of extents after removal */ - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - new_size = ifp->if_bytes - - (ext_diff * sizeof(xfs_bmbt_rec_t)); - nextents = xfs_iext_count(ifp); - - if (new_size == 0) { - xfs_iext_destroy(ifp); - return; - } - /* Move extents up in the list (if needed) */ - if (idx + ext_diff < nextents) { - memmove(&ifp->if_u1.if_extents[idx], - &ifp->if_u1.if_extents[idx + ext_diff], - (nextents - (idx + ext_diff)) * - sizeof(xfs_bmbt_rec_t)); - } - memset(&ifp->if_u1.if_extents[nextents - ext_diff], - 0, ext_diff * sizeof(xfs_bmbt_rec_t)); - /* - * Reallocate the direct extent list. If the extents - * will fit inside the inode then xfs_iext_realloc_direct - * will switch from direct to inline extent allocation - * mode for us. - */ - xfs_iext_realloc_direct(ifp, new_size); - ifp->if_bytes = new_size; -} - -/* - * This is called when incore extents are being removed from the - * indirection array and the extents being removed span multiple extent - * buffers. The idx parameter contains the file extent index where we - * want to begin removing extents, and the count parameter contains - * how many extents need to be removed. - * - * |-------| |-------| - * | nex1 | | | nex1 - number of extents before idx - * |-------| | count | - * | | | | count - number of extents being removed at idx - * | count | |-------| - * | | | nex2 | nex2 - number of extents after idx + count - * |-------| |-------| - */ -void -xfs_iext_remove_indirect( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin removing extents */ - int count) /* number of extents to remove */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - int erp_idx = 0; /* indirection array index */ - xfs_extnum_t ext_cnt; /* extents left to remove */ - xfs_extnum_t ext_diff; /* extents to remove in current list */ - xfs_extnum_t nex1; /* number of extents before idx */ - xfs_extnum_t nex2; /* extents after idx + count */ - int page_idx = idx; /* index in target extent list */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); - ASSERT(erp != NULL); - nex1 = page_idx; - ext_cnt = count; - while (ext_cnt) { - nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0); - ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1)); - /* - * Check for deletion of entire list; - * xfs_iext_irec_remove() updates extent offsets. - */ - if (ext_diff == erp->er_extcount) { - xfs_iext_irec_remove(ifp, erp_idx); - ext_cnt -= ext_diff; - nex1 = 0; - if (ext_cnt) { - ASSERT(erp_idx < ifp->if_real_bytes / - XFS_IEXT_BUFSZ); - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - nex1 = 0; - continue; - } else { - break; - } - } - /* Move extents up (if needed) */ - if (nex2) { - memmove(&erp->er_extbuf[nex1], - &erp->er_extbuf[nex1 + ext_diff], - nex2 * sizeof(xfs_bmbt_rec_t)); - } - /* Zero out rest of page */ - memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ - - ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t)))); - /* Update remaining counters */ - erp->er_extcount -= ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff); - ext_cnt -= ext_diff; - nex1 = 0; - erp_idx++; - erp++; - } - ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t); - xfs_iext_irec_compact(ifp); -} - -/* - * Create, destroy, or resize a linear (direct) block of extents. - */ -void -xfs_iext_realloc_direct( - xfs_ifork_t *ifp, /* inode fork pointer */ - int new_size) /* new size of extents after adding */ -{ - int rnew_size; /* real new size of extents */ - - rnew_size = new_size; - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) || - ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) && - (new_size != ifp->if_real_bytes))); - - /* Free extent records */ - if (new_size == 0) { - xfs_iext_destroy(ifp); - } - /* Resize direct extent list and zero any new bytes */ - else if (ifp->if_real_bytes) { - /* Check if extents will fit inside the inode */ - if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) { - xfs_iext_direct_to_inline(ifp, new_size / - (uint)sizeof(xfs_bmbt_rec_t)); - ifp->if_bytes = new_size; - return; - } - if (!is_power_of_2(new_size)){ - rnew_size = roundup_pow_of_two(new_size); - } - if (rnew_size != ifp->if_real_bytes) { - ifp->if_u1.if_extents = - kmem_realloc(ifp->if_u1.if_extents, - rnew_size, KM_NOFS); - } - if (rnew_size > ifp->if_real_bytes) { - memset(&ifp->if_u1.if_extents[ifp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t)], 0, - rnew_size - ifp->if_real_bytes); - } - } - /* Switch from the inline extent buffer to a direct extent list */ - else { - if (!is_power_of_2(new_size)) { - rnew_size = roundup_pow_of_two(new_size); - } - xfs_iext_inline_to_direct(ifp, rnew_size); - } - ifp->if_real_bytes = rnew_size; - ifp->if_bytes = new_size; -} - -/* - * Switch from linear (direct) extent records to inline buffer. - */ -void -xfs_iext_direct_to_inline( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t nextents) /* number of extents in file */ -{ - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ASSERT(nextents <= XFS_INLINE_EXTS); - /* - * The inline buffer was zeroed when we switched - * from inline to direct extent allocation mode, - * so we don't need to clear it here. - */ - memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, - nextents * sizeof(xfs_bmbt_rec_t)); - kmem_free(ifp->if_u1.if_extents); - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; - ifp->if_real_bytes = 0; -} - -/* - * Switch from inline buffer to linear (direct) extent records. - * new_size should already be rounded up to the next power of 2 - * by the caller (when appropriate), so use new_size as it is. - * However, since new_size may be rounded up, we can't update - * if_bytes here. It is the caller's responsibility to update - * if_bytes upon return. - */ -void -xfs_iext_inline_to_direct( - xfs_ifork_t *ifp, /* inode fork pointer */ - int new_size) /* number of extents in file */ -{ - ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); - memset(ifp->if_u1.if_extents, 0, new_size); - if (ifp->if_bytes) { - memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, - ifp->if_bytes); - memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * - sizeof(xfs_bmbt_rec_t)); - } - ifp->if_real_bytes = new_size; -} - -/* - * Resize an extent indirection array to new_size bytes. - */ -STATIC void -xfs_iext_realloc_indirect( - xfs_ifork_t *ifp, /* inode fork pointer */ - int new_size) /* new indirection array size */ -{ - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - ASSERT(ifp->if_real_bytes); - ASSERT((new_size >= 0) && - (new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) * - sizeof(xfs_ext_irec_t)))); - if (new_size == 0) { - xfs_iext_destroy(ifp); - } else { - ifp->if_u1.if_ext_irec = - kmem_realloc(ifp->if_u1.if_ext_irec, new_size, KM_NOFS); - } -} - -/* - * Switch from indirection array to linear (direct) extent allocations. - */ -STATIC void -xfs_iext_indirect_to_direct( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - xfs_extnum_t nextents; /* number of extents in file */ - int size; /* size of file extents */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nextents = xfs_iext_count(ifp); - ASSERT(nextents <= XFS_LINEAR_EXTS); - size = nextents * sizeof(xfs_bmbt_rec_t); - - xfs_iext_irec_compact_pages(ifp); - ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); - - ep = ifp->if_u1.if_ext_irec->er_extbuf; - kmem_free(ifp->if_u1.if_ext_irec); - ifp->if_flags &= ~XFS_IFEXTIREC; - ifp->if_u1.if_extents = ep; - ifp->if_bytes = size; - if (nextents < XFS_LINEAR_EXTS) { - xfs_iext_realloc_direct(ifp, size); - } -} - -/* - * Remove all records from the indirection array. - */ -STATIC void -xfs_iext_irec_remove_all( - struct xfs_ifork *ifp) -{ - int nlists; - int i; - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - for (i = 0; i < nlists; i++) - kmem_free(ifp->if_u1.if_ext_irec[i].er_extbuf); - kmem_free(ifp->if_u1.if_ext_irec); - ifp->if_flags &= ~XFS_IFEXTIREC; -} - -/* - * Free incore file extents. - */ -void -xfs_iext_destroy( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - if (ifp->if_flags & XFS_IFEXTIREC) { - xfs_iext_irec_remove_all(ifp); - } else if (ifp->if_real_bytes) { - kmem_free(ifp->if_u1.if_extents); - } else if (ifp->if_bytes) { - memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * - sizeof(xfs_bmbt_rec_t)); - } - ifp->if_u1.if_extents = NULL; - ifp->if_real_bytes = 0; - ifp->if_bytes = 0; -} - -/* - * Return a pointer to the extent record for file system block bno. - */ -xfs_bmbt_rec_host_t * /* pointer to found extent record */ -xfs_iext_bno_to_ext( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number to search for */ - xfs_extnum_t *idxp) /* index of target extent */ -{ - xfs_bmbt_rec_host_t *base; /* pointer to first extent */ - xfs_filblks_t blockcount = 0; /* number of blocks in extent */ - xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */ - xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ - int high; /* upper boundary in search */ - xfs_extnum_t idx = 0; /* index of target extent */ - int low; /* lower boundary in search */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_fileoff_t startoff = 0; /* start offset of extent */ - - nextents = xfs_iext_count(ifp); - if (nextents == 0) { - *idxp = 0; - return NULL; - } - low = 0; - if (ifp->if_flags & XFS_IFEXTIREC) { - /* Find target extent list */ - int erp_idx = 0; - erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx); - base = erp->er_extbuf; - high = erp->er_extcount - 1; - } else { - base = ifp->if_u1.if_extents; - high = nextents - 1; - } - /* Binary search extent records */ - while (low <= high) { - idx = (low + high) >> 1; - ep = base + idx; - startoff = xfs_bmbt_get_startoff(ep); - blockcount = xfs_bmbt_get_blockcount(ep); - if (bno < startoff) { - high = idx - 1; - } else if (bno >= startoff + blockcount) { - low = idx + 1; - } else { - /* Convert back to file-based extent index */ - if (ifp->if_flags & XFS_IFEXTIREC) { - idx += erp->er_extoff; - } - *idxp = idx; - return ep; - } - } - /* Convert back to file-based extent index */ - if (ifp->if_flags & XFS_IFEXTIREC) { - idx += erp->er_extoff; - } - if (bno >= startoff + blockcount) { - if (++idx == nextents) { - ep = NULL; - } else { - ep = xfs_iext_get_ext(ifp, idx); - } - } - *idxp = idx; - return ep; -} - -/* - * Return a pointer to the indirection array entry containing the - * extent record for filesystem block bno. Store the index of the - * target irec in *erp_idxp. - */ -xfs_ext_irec_t * /* pointer to found extent record */ -xfs_iext_bno_to_irec( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number to search for */ - int *erp_idxp) /* irec index of target ext list */ -{ - xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ - xfs_ext_irec_t *erp_next; /* next indirection array entry */ - int erp_idx; /* indirection array index */ - int nlists; /* number of extent irec's (lists) */ - int high; /* binary search upper limit */ - int low; /* binary search lower limit */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp_idx = 0; - low = 0; - high = nlists - 1; - while (low <= high) { - erp_idx = (low + high) >> 1; - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL; - if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) { - high = erp_idx - 1; - } else if (erp_next && bno >= - xfs_bmbt_get_startoff(erp_next->er_extbuf)) { - low = erp_idx + 1; - } else { - break; - } - } - *erp_idxp = erp_idx; - return erp; -} - -/* - * Return a pointer to the indirection array entry containing the - * extent record at file extent index *idxp. Store the index of the - * target irec in *erp_idxp and store the page index of the target - * extent record in *idxp. - */ -xfs_ext_irec_t * -xfs_iext_idx_to_irec( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t *idxp, /* extent index (file -> page) */ - int *erp_idxp, /* pointer to target irec */ - int realloc) /* new bytes were just added */ -{ - xfs_ext_irec_t *prev; /* pointer to previous irec */ - xfs_ext_irec_t *erp = NULL; /* pointer to current irec */ - int erp_idx; /* indirection array index */ - int nlists; /* number of irec's (ex lists) */ - int high; /* binary search upper limit */ - int low; /* binary search lower limit */ - xfs_extnum_t page_idx = *idxp; /* extent index in target list */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - ASSERT(page_idx >= 0); - ASSERT(page_idx <= xfs_iext_count(ifp)); - ASSERT(page_idx < xfs_iext_count(ifp) || realloc); - - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp_idx = 0; - low = 0; - high = nlists - 1; - - /* Binary search extent irec's */ - while (low <= high) { - erp_idx = (low + high) >> 1; - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - prev = erp_idx > 0 ? erp - 1 : NULL; - if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff && - realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) { - high = erp_idx - 1; - } else if (page_idx > erp->er_extoff + erp->er_extcount || - (page_idx == erp->er_extoff + erp->er_extcount && - !realloc)) { - low = erp_idx + 1; - } else if (page_idx == erp->er_extoff + erp->er_extcount && - erp->er_extcount == XFS_LINEAR_EXTS) { - ASSERT(realloc); - page_idx = 0; - erp_idx++; - erp = erp_idx < nlists ? erp + 1 : NULL; - break; - } else { - page_idx -= erp->er_extoff; - break; - } - } - *idxp = page_idx; - *erp_idxp = erp_idx; - return erp; -} - -/* - * Allocate and initialize an indirection array once the space needed - * for incore extents increases above XFS_IEXT_BUFSZ. - */ -void -xfs_iext_irec_init( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - xfs_extnum_t nextents; /* number of extents in file */ - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - nextents = xfs_iext_count(ifp); - ASSERT(nextents <= XFS_LINEAR_EXTS); - - erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); - - if (nextents == 0) { - ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); - } else if (!ifp->if_real_bytes) { - xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); - } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { - xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ); - } - erp->er_extbuf = ifp->if_u1.if_extents; - erp->er_extcount = nextents; - erp->er_extoff = 0; - - ifp->if_flags |= XFS_IFEXTIREC; - ifp->if_real_bytes = XFS_IEXT_BUFSZ; - ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t); - ifp->if_u1.if_ext_irec = erp; - - return; -} - -/* - * Allocate and initialize a new entry in the indirection array. - */ -xfs_ext_irec_t * -xfs_iext_irec_new( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx) /* index for new irec */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - int i; /* loop counter */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - - /* Resize indirection array */ - xfs_iext_realloc_indirect(ifp, ++nlists * - sizeof(xfs_ext_irec_t)); - /* - * Move records down in the array so the - * new page can use erp_idx. - */ - erp = ifp->if_u1.if_ext_irec; - for (i = nlists - 1; i > erp_idx; i--) { - memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t)); - } - ASSERT(i == erp_idx); - - /* Initialize new extent record */ - erp = ifp->if_u1.if_ext_irec; - erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); - ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; - memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); - erp[erp_idx].er_extcount = 0; - erp[erp_idx].er_extoff = erp_idx > 0 ? - erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0; - return (&erp[erp_idx]); -} - -/* - * Remove a record from the indirection array. - */ -void -xfs_iext_irec_remove( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx) /* irec index to remove */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - int i; /* loop counter */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - if (erp->er_extbuf) { - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, - -erp->er_extcount); - kmem_free(erp->er_extbuf); - } - /* Compact extent records */ - erp = ifp->if_u1.if_ext_irec; - for (i = erp_idx; i < nlists - 1; i++) { - memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t)); - } - /* - * Manually free the last extent record from the indirection - * array. A call to xfs_iext_realloc_indirect() with a size - * of zero would result in a call to xfs_iext_destroy() which - * would in turn call this function again, creating a nasty - * infinite loop. - */ - if (--nlists) { - xfs_iext_realloc_indirect(ifp, - nlists * sizeof(xfs_ext_irec_t)); - } else { - kmem_free(ifp->if_u1.if_ext_irec); - } - ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; -} - -/* - * This is called to clean up large amounts of unused memory allocated - * by the indirection array. Before compacting anything though, verify - * that the indirection array is still needed and switch back to the - * linear extent list (or even the inline buffer) if possible. The - * compaction policy is as follows: - * - * Full Compaction: Extents fit into a single page (or inline buffer) - * Partial Compaction: Extents occupy less than 50% of allocated space - * No Compaction: Extents occupy at least 50% of allocated space - */ -void -xfs_iext_irec_compact( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_extnum_t nextents; /* number of extents in file */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - nextents = xfs_iext_count(ifp); - - if (nextents == 0) { - xfs_iext_destroy(ifp); - } else if (nextents <= XFS_INLINE_EXTS) { - xfs_iext_indirect_to_direct(ifp); - xfs_iext_direct_to_inline(ifp, nextents); - } else if (nextents <= XFS_LINEAR_EXTS) { - xfs_iext_indirect_to_direct(ifp); - } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { - xfs_iext_irec_compact_pages(ifp); - } -} - -/* - * Combine extents from neighboring extent pages. - */ -void -xfs_iext_irec_compact_pages( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */ - int erp_idx = 0; /* indirection array index */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - while (erp_idx < nlists - 1) { - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - erp_next = erp + 1; - if (erp_next->er_extcount <= - (XFS_LINEAR_EXTS - erp->er_extcount)) { - memcpy(&erp->er_extbuf[erp->er_extcount], - erp_next->er_extbuf, erp_next->er_extcount * - sizeof(xfs_bmbt_rec_t)); - erp->er_extcount += erp_next->er_extcount; - /* - * Free page before removing extent record - * so er_extoffs don't get modified in - * xfs_iext_irec_remove. - */ - kmem_free(erp_next->er_extbuf); - erp_next->er_extbuf = NULL; - xfs_iext_irec_remove(ifp, erp_idx + 1); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - } else { - erp_idx++; - } - } -} - -/* - * This is called to update the er_extoff field in the indirection - * array when extents have been added or removed from one of the - * extent lists. erp_idx contains the irec index to begin updating - * at and ext_diff contains the number of extents that were added - * or removed. - */ -void -xfs_iext_irec_update_extoffs( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx, /* irec index to update */ - int ext_diff) /* number of new extents */ -{ - int i; /* loop counter */ - int nlists; /* number of irec's (ex lists */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - for (i = erp_idx; i < nlists; i++) { - ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; - } -} - -/* * Initialize an inode's copy-on-write fork. */ void @@ -1974,61 +831,3 @@ xfs_ifork_init_cow( ip->i_cformat = XFS_DINODE_FMT_EXTENTS; ip->i_cnextents = 0; } - -/* - * Lookup the extent covering bno. - * - * If there is an extent covering bno return the extent index, and store the - * expanded extent structure in *gotp, and the extent index in *idx. - * If there is no extent covering bno, but there is an extent after it (e.g. - * it lies in a hole) return that extent in *gotp and its index in *idx - * instead. - * If bno is beyond the last extent return false, and return the index after - * the last valid index in *idxp. - */ -bool -xfs_iext_lookup_extent( - struct xfs_inode *ip, - struct xfs_ifork *ifp, - xfs_fileoff_t bno, - xfs_extnum_t *idxp, - struct xfs_bmbt_irec *gotp) -{ - struct xfs_bmbt_rec_host *ep; - - XFS_STATS_INC(ip->i_mount, xs_look_exlist); - - ep = xfs_iext_bno_to_ext(ifp, bno, idxp); - if (!ep) - return false; - xfs_bmbt_get_all(ep, gotp); - return true; -} - -/* - * Return true if there is an extent at index idx, and return the expanded - * extent structure at idx in that case. Else return false. - */ -bool -xfs_iext_get_extent( - struct xfs_ifork *ifp, - xfs_extnum_t idx, - struct xfs_bmbt_irec *gotp) -{ - if (idx < 0 || idx >= xfs_iext_count(ifp)) - return false; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), gotp); - return true; -} - -void -xfs_iext_update_extent( - struct xfs_ifork *ifp, - xfs_extnum_t idx, - struct xfs_bmbt_irec *gotp) -{ - ASSERT(idx >= 0); - ASSERT(idx < xfs_iext_count(ifp)); - - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, idx), gotp); -} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 11af705219f6..b9f0098e33b8 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -22,56 +22,19 @@ struct xfs_inode_log_item; struct xfs_dinode; /* - * The following xfs_ext_irec_t struct introduces a second (top) level - * to the in-core extent allocation scheme. These structs are allocated - * in a contiguous block, creating an indirection array where each entry - * (irec) contains a pointer to a buffer of in-core extent records which - * it manages. Each extent buffer is 4k in size, since 4k is the system - * page size on Linux i386 and systems with larger page sizes don't seem - * to gain much, if anything, by using their native page size as the - * extent buffer size. Also, using 4k extent buffers everywhere provides - * a consistent interface for CXFS across different platforms. - * - * There is currently no limit on the number of irec's (extent lists) - * allowed, so heavily fragmented files may require an indirection array - * which spans multiple system pages of memory. The number of extents - * which would require this amount of contiguous memory is very large - * and should not cause problems in the foreseeable future. However, - * if the memory needed for the contiguous array ever becomes a problem, - * it is possible that a third level of indirection may be required. - */ -typedef struct xfs_ext_irec { - xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */ - xfs_extnum_t er_extoff; /* extent offset in file */ - xfs_extnum_t er_extcount; /* number of extents in page/block */ -} xfs_ext_irec_t; - -/* * File incore extent information, present for each of data & attr forks. */ -#define XFS_IEXT_BUFSZ 4096 -#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t)) -#define XFS_INLINE_EXTS 2 -#define XFS_INLINE_DATA 32 typedef struct xfs_ifork { int if_bytes; /* bytes in if_u1 */ int if_real_bytes; /* bytes allocated in if_u1 */ struct xfs_btree_block *if_broot; /* file's incore btree root */ short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ + int if_height; /* height of the extent tree */ union { - xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ - xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ + void *if_root; /* extent tree root */ char *if_data; /* inline file data */ } if_u1; - union { - xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS]; - /* very small file extents */ - char if_inline_data[XFS_INLINE_DATA]; - /* very small file data */ - xfs_dev_t if_rdev; /* dev number if special */ - uuid_t if_uuid; /* mount point value */ - } if_u2; } xfs_ifork_t; /* @@ -80,7 +43,6 @@ typedef struct xfs_ifork { #define XFS_IFINLINE 0x01 /* Inline data is read in */ #define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */ #define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */ -#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */ /* * Fork handling. @@ -150,45 +112,75 @@ int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, int); void xfs_init_local_fork(struct xfs_inode *, int, const void *, int); -struct xfs_bmbt_rec_host * - xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t); -xfs_extnum_t xfs_iext_count(struct xfs_ifork *); -void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t, - struct xfs_bmbt_irec *, int); -void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int); -void xfs_iext_add_indirect_multi(struct xfs_ifork *, int, - xfs_extnum_t, int); -void xfs_iext_remove(struct xfs_inode *, xfs_extnum_t, int, int); -void xfs_iext_remove_inline(struct xfs_ifork *, xfs_extnum_t, int); -void xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int); -void xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int); -void xfs_iext_realloc_direct(struct xfs_ifork *, int); -void xfs_iext_direct_to_inline(struct xfs_ifork *, xfs_extnum_t); -void xfs_iext_inline_to_direct(struct xfs_ifork *, int); +xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp); +void xfs_iext_insert(struct xfs_inode *, struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *, int); +void xfs_iext_remove(struct xfs_inode *, struct xfs_iext_cursor *, + int); void xfs_iext_destroy(struct xfs_ifork *); -struct xfs_bmbt_rec_host * - xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *); -struct xfs_ext_irec * - xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int *); -struct xfs_ext_irec * - xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *, - int); -void xfs_iext_irec_init(struct xfs_ifork *); -struct xfs_ext_irec * - xfs_iext_irec_new(struct xfs_ifork *, int); -void xfs_iext_irec_remove(struct xfs_ifork *, int); -void xfs_iext_irec_compact(struct xfs_ifork *); -void xfs_iext_irec_compact_pages(struct xfs_ifork *); -void xfs_iext_irec_compact_full(struct xfs_ifork *); -void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int); bool xfs_iext_lookup_extent(struct xfs_inode *ip, struct xfs_ifork *ifp, xfs_fileoff_t bno, - xfs_extnum_t *idxp, struct xfs_bmbt_irec *gotp); -bool xfs_iext_get_extent(struct xfs_ifork *ifp, xfs_extnum_t idx, + struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp); -void xfs_iext_update_extent(struct xfs_ifork *ifp, xfs_extnum_t idx, +bool xfs_iext_lookup_extent_before(struct xfs_inode *ip, + struct xfs_ifork *ifp, xfs_fileoff_t *end, + struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp); +bool xfs_iext_get_extent(struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *gotp); +void xfs_iext_update_extent(struct xfs_inode *ip, int state, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *gotp); + +void xfs_iext_first(struct xfs_ifork *, struct xfs_iext_cursor *); +void xfs_iext_last(struct xfs_ifork *, struct xfs_iext_cursor *); +void xfs_iext_next(struct xfs_ifork *, struct xfs_iext_cursor *); +void xfs_iext_prev(struct xfs_ifork *, struct xfs_iext_cursor *); + +static inline bool xfs_iext_next_extent(struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp) +{ + xfs_iext_next(ifp, cur); + return xfs_iext_get_extent(ifp, cur, gotp); +} + +static inline bool xfs_iext_prev_extent(struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp) +{ + xfs_iext_prev(ifp, cur); + return xfs_iext_get_extent(ifp, cur, gotp); +} + +/* + * Return the extent after cur in gotp without updating the cursor. + */ +static inline bool xfs_iext_peek_next_extent(struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp) +{ + struct xfs_iext_cursor ncur = *cur; + + xfs_iext_next(ifp, &ncur); + return xfs_iext_get_extent(ifp, &ncur, gotp); +} + +/* + * Return the extent before cur in gotp without updating the cursor. + */ +static inline bool xfs_iext_peek_prev_extent(struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp) +{ + struct xfs_iext_cursor ncur = *cur; + + xfs_iext_prev(ifp, &ncur); + return xfs_iext_get_extent(ifp, &ncur, gotp); +} + +#define for_each_xfs_iext(ifp, ext, got) \ + for (xfs_iext_first((ifp), (ext)); \ + xfs_iext_get_extent((ifp), (ext), (got)); \ + xfs_iext_next((ifp), (ext))) extern struct kmem_zone *xfs_ifork_zone; diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 8372e9bcd7b6..996f035ee205 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -264,54 +264,43 @@ typedef struct xfs_trans_header { * (if any) is indicated in the ilf_dsize field. Changes to this structure * must be added on to the end. */ -typedef struct xfs_inode_log_format { - uint16_t ilf_type; /* inode log item type */ - uint16_t ilf_size; /* size of this item */ - uint32_t ilf_fields; /* flags for fields logged */ - uint16_t ilf_asize; /* size of attr d/ext/root */ - uint16_t ilf_dsize; /* size of data/ext/root */ - uint64_t ilf_ino; /* inode number */ - union { - uint32_t ilfu_rdev; /* rdev value for dev inode*/ - uuid_t ilfu_uuid; /* mount point value */ - } ilf_u; - int64_t ilf_blkno; /* blkno of inode buffer */ - int32_t ilf_len; /* len of inode buffer */ - int32_t ilf_boffset; /* off of inode in buffer */ -} xfs_inode_log_format_t; - -typedef struct xfs_inode_log_format_32 { +struct xfs_inode_log_format { uint16_t ilf_type; /* inode log item type */ uint16_t ilf_size; /* size of this item */ uint32_t ilf_fields; /* flags for fields logged */ uint16_t ilf_asize; /* size of attr d/ext/root */ uint16_t ilf_dsize; /* size of data/ext/root */ + uint32_t ilf_pad; /* pad for 64 bit boundary */ uint64_t ilf_ino; /* inode number */ union { uint32_t ilfu_rdev; /* rdev value for dev inode*/ - uuid_t ilfu_uuid; /* mount point value */ + u8 __pad[16]; /* unused */ } ilf_u; int64_t ilf_blkno; /* blkno of inode buffer */ int32_t ilf_len; /* len of inode buffer */ int32_t ilf_boffset; /* off of inode in buffer */ -} __attribute__((packed)) xfs_inode_log_format_32_t; +}; -typedef struct xfs_inode_log_format_64 { +/* + * Old 32 bit systems will log in this format without the 64 bit + * alignment padding. Recovery will detect this and convert it to the + * correct format. + */ +struct xfs_inode_log_format_32 { uint16_t ilf_type; /* inode log item type */ uint16_t ilf_size; /* size of this item */ uint32_t ilf_fields; /* flags for fields logged */ uint16_t ilf_asize; /* size of attr d/ext/root */ uint16_t ilf_dsize; /* size of data/ext/root */ - uint32_t ilf_pad; /* pad for 64 bit boundary */ uint64_t ilf_ino; /* inode number */ union { uint32_t ilfu_rdev; /* rdev value for dev inode*/ - uuid_t ilfu_uuid; /* mount point value */ + u8 __pad[16]; /* unused */ } ilf_u; int64_t ilf_blkno; /* blkno of inode buffer */ int32_t ilf_len; /* len of inode buffer */ int32_t ilf_boffset; /* off of inode in buffer */ -} xfs_inode_log_format_64_t; +} __attribute__((packed)); /* @@ -322,7 +311,7 @@ typedef struct xfs_inode_log_format_64 { #define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */ #define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */ #define XFS_ILOG_DEV 0x010 /* log the dev field */ -#define XFS_ILOG_UUID 0x020 /* log the uuid field */ +#define XFS_ILOG_UUID 0x020 /* added long ago, but never used */ #define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */ #define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ #define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ @@ -340,9 +329,9 @@ typedef struct xfs_inode_log_format_64 { #define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ - XFS_ILOG_UUID | XFS_ILOG_ADATA | \ - XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \ - XFS_ILOG_DOWNER | XFS_ILOG_AOWNER) + XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ + XFS_ILOG_ABROOT | XFS_ILOG_DOWNER | \ + XFS_ILOG_AOWNER) #define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT) @@ -352,10 +341,10 @@ typedef struct xfs_inode_log_format_64 { #define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \ XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ - XFS_ILOG_DEV | XFS_ILOG_UUID | \ - XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ - XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \ - XFS_ILOG_DOWNER | XFS_ILOG_AOWNER) + XFS_ILOG_DEV | XFS_ILOG_ADATA | \ + XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \ + XFS_ILOG_TIMESTAMP | XFS_ILOG_DOWNER | \ + XFS_ILOG_AOWNER) static inline int xfs_ilog_fbroot(int w) { diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 9d5406b4f663..585b35d34142 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -30,6 +30,7 @@ #include "xfs_bmap.h" #include "xfs_refcount_btree.h" #include "xfs_alloc.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_cksum.h" diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 55c88a732690..dd019cee1b3b 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -34,6 +34,7 @@ #include "xfs_rmap_btree.h" #include "xfs_trans_space.h" #include "xfs_trace.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_extent_busy.h" #include "xfs_bmap.h" diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 5d4e43ef4eea..3fb29a5ea915 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -672,7 +672,6 @@ xfs_rtmodify_range( /* * Compute a mask of relevant bits. */ - bit = 0; mask = ((xfs_rtword_t)1 << lastbit) - 1; /* * Set/clear the active bits. @@ -1086,3 +1085,15 @@ xfs_rtalloc_query_all( return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv); } + +/* + * Verify that an realtime block number pointer doesn't point off the + * end of the realtime device. + */ +bool +xfs_verify_rtbno( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return rtbno < mp->m_sb.sb_rblocks; +} diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 0220159bd463..3c560695c546 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -48,6 +48,12 @@ typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ typedef int64_t xfs_sfiloff_t; /* signed block number in a file */ /* + * New verifiers will return the instruction address of the failing check. + * NULL means everything is ok. + */ +typedef void * xfs_failaddr_t; + +/* * Null values for the types. */ #define NULLFSBLOCK ((xfs_fsblock_t)-1) @@ -136,5 +142,21 @@ typedef uint32_t xfs_dqid_t; #define XFS_NBWORD (1 << XFS_NBWORDLOG) #define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1) +struct xfs_iext_cursor { + struct xfs_iext_leaf *leaf; + int pos; +}; + +typedef enum { + XFS_EXT_NORM, XFS_EXT_UNWRITTEN, +} xfs_exntst_t; + +typedef struct xfs_bmbt_irec +{ + xfs_fileoff_t br_startoff; /* starting file offset */ + xfs_fsblock_t br_startblock; /* starting block number */ + xfs_filblks_t br_blockcount; /* number of blocks */ + xfs_exntst_t br_state; /* extent state */ +} xfs_bmbt_irec_t; #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c new file mode 100644 index 000000000000..2a9b4f9e93c6 --- /dev/null +++ b/fs/xfs/scrub/agheader.c @@ -0,0 +1,658 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* + * Set up scrub to check all the static metadata in each AG. + * This means the SB, AGF, AGI, and AGFL headers. + */ +int +xfs_scrub_setup_ag_header( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = sc->mp; + + if (sc->sm->sm_agno >= mp->m_sb.sb_agcount || + sc->sm->sm_ino || sc->sm->sm_gen) + return -EINVAL; + return xfs_scrub_setup_fs(sc, ip); +} + +/* Walk all the blocks in the AGFL. */ +int +xfs_scrub_walk_agfl( + struct xfs_scrub_context *sc, + int (*fn)(struct xfs_scrub_context *, + xfs_agblock_t bno, void *), + void *priv) +{ + struct xfs_agf *agf; + __be32 *agfl_bno; + struct xfs_mount *mp = sc->mp; + unsigned int flfirst; + unsigned int fllast; + int i; + int error; + + agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, sc->sa.agfl_bp); + flfirst = be32_to_cpu(agf->agf_flfirst); + fllast = be32_to_cpu(agf->agf_fllast); + + /* Nothing to walk in an empty AGFL. */ + if (agf->agf_flcount == cpu_to_be32(0)) + return 0; + + /* first to last is a consecutive list. */ + if (fllast >= flfirst) { + for (i = flfirst; i <= fllast; i++) { + error = fn(sc, be32_to_cpu(agfl_bno[i]), priv); + if (error) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return error; + } + + return 0; + } + + /* first to the end */ + for (i = flfirst; i < XFS_AGFL_SIZE(mp); i++) { + error = fn(sc, be32_to_cpu(agfl_bno[i]), priv); + if (error) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return error; + } + + /* the start to last. */ + for (i = 0; i <= fllast; i++) { + error = fn(sc, be32_to_cpu(agfl_bno[i]), priv); + if (error) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return error; + } + + return 0; +} + +/* Superblock */ + +/* + * Scrub the filesystem superblock. + * + * Note: We do /not/ attempt to check AG 0's superblock. Mount is + * responsible for validating all the geometry information in sb 0, so + * if the filesystem is capable of initiating online scrub, then clearly + * sb 0 is ok and we can use its information to check everything else. + */ +int +xfs_scrub_superblock( + struct xfs_scrub_context *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp; + struct xfs_dsb *sb; + xfs_agnumber_t agno; + uint32_t v2_ok; + __be32 features_mask; + int error; + __be16 vernum_mask; + + agno = sc->sm->sm_agno; + if (agno == 0) + return 0; + + error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops); + if (!xfs_scrub_process_error(sc, agno, XFS_SB_BLOCK(mp), &error)) + return error; + + sb = XFS_BUF_TO_SBP(bp); + + /* + * Verify the geometries match. Fields that are permanently + * set by mkfs are checked; fields that can be updated later + * (and are not propagated to backup superblocks) are preen + * checked. + */ + if (sb->sb_blocksize != cpu_to_be32(mp->m_sb.sb_blocksize)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_dblocks != cpu_to_be64(mp->m_sb.sb_dblocks)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_rblocks != cpu_to_be64(mp->m_sb.sb_rblocks)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_rextents != cpu_to_be64(mp->m_sb.sb_rextents)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (!uuid_equal(&sb->sb_uuid, &mp->m_sb.sb_uuid)) + xfs_scrub_block_set_preen(sc, bp); + + if (sb->sb_logstart != cpu_to_be64(mp->m_sb.sb_logstart)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_rootino != cpu_to_be64(mp->m_sb.sb_rootino)) + xfs_scrub_block_set_preen(sc, bp); + + if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino)) + xfs_scrub_block_set_preen(sc, bp); + + if (sb->sb_rsumino != cpu_to_be64(mp->m_sb.sb_rsumino)) + xfs_scrub_block_set_preen(sc, bp); + + if (sb->sb_rextsize != cpu_to_be32(mp->m_sb.sb_rextsize)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_agblocks != cpu_to_be32(mp->m_sb.sb_agblocks)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_agcount != cpu_to_be32(mp->m_sb.sb_agcount)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_rbmblocks != cpu_to_be32(mp->m_sb.sb_rbmblocks)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_logblocks != cpu_to_be32(mp->m_sb.sb_logblocks)) + xfs_scrub_block_set_corrupt(sc, bp); + + /* Check sb_versionnum bits that are set at mkfs time. */ + vernum_mask = cpu_to_be16(~XFS_SB_VERSION_OKBITS | + XFS_SB_VERSION_NUMBITS | + XFS_SB_VERSION_ALIGNBIT | + XFS_SB_VERSION_DALIGNBIT | + XFS_SB_VERSION_SHAREDBIT | + XFS_SB_VERSION_LOGV2BIT | + XFS_SB_VERSION_SECTORBIT | + XFS_SB_VERSION_EXTFLGBIT | + XFS_SB_VERSION_DIRV2BIT); + if ((sb->sb_versionnum & vernum_mask) != + (cpu_to_be16(mp->m_sb.sb_versionnum) & vernum_mask)) + xfs_scrub_block_set_corrupt(sc, bp); + + /* Check sb_versionnum bits that can be set after mkfs time. */ + vernum_mask = cpu_to_be16(XFS_SB_VERSION_ATTRBIT | + XFS_SB_VERSION_NLINKBIT | + XFS_SB_VERSION_QUOTABIT); + if ((sb->sb_versionnum & vernum_mask) != + (cpu_to_be16(mp->m_sb.sb_versionnum) & vernum_mask)) + xfs_scrub_block_set_preen(sc, bp); + + if (sb->sb_sectsize != cpu_to_be16(mp->m_sb.sb_sectsize)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_inodesize != cpu_to_be16(mp->m_sb.sb_inodesize)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_inopblock != cpu_to_be16(mp->m_sb.sb_inopblock)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (memcmp(sb->sb_fname, mp->m_sb.sb_fname, sizeof(sb->sb_fname))) + xfs_scrub_block_set_preen(sc, bp); + + if (sb->sb_blocklog != mp->m_sb.sb_blocklog) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_sectlog != mp->m_sb.sb_sectlog) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_inodelog != mp->m_sb.sb_inodelog) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_inopblog != mp->m_sb.sb_inopblog) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_agblklog != mp->m_sb.sb_agblklog) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_rextslog != mp->m_sb.sb_rextslog) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_imax_pct != mp->m_sb.sb_imax_pct) + xfs_scrub_block_set_preen(sc, bp); + + /* + * Skip the summary counters since we track them in memory anyway. + * sb_icount, sb_ifree, sb_fdblocks, sb_frexents + */ + + if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino)) + xfs_scrub_block_set_preen(sc, bp); + + if (sb->sb_gquotino != cpu_to_be64(mp->m_sb.sb_gquotino)) + xfs_scrub_block_set_preen(sc, bp); + + /* + * Skip the quota flags since repair will force quotacheck. + * sb_qflags + */ + + if (sb->sb_flags != mp->m_sb.sb_flags) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_shared_vn != mp->m_sb.sb_shared_vn) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_inoalignmt != cpu_to_be32(mp->m_sb.sb_inoalignmt)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_unit != cpu_to_be32(mp->m_sb.sb_unit)) + xfs_scrub_block_set_preen(sc, bp); + + if (sb->sb_width != cpu_to_be32(mp->m_sb.sb_width)) + xfs_scrub_block_set_preen(sc, bp); + + if (sb->sb_dirblklog != mp->m_sb.sb_dirblklog) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_logsectlog != mp->m_sb.sb_logsectlog) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_logsectsize != cpu_to_be16(mp->m_sb.sb_logsectsize)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_logsunit != cpu_to_be32(mp->m_sb.sb_logsunit)) + xfs_scrub_block_set_corrupt(sc, bp); + + /* Do we see any invalid bits in sb_features2? */ + if (!xfs_sb_version_hasmorebits(&mp->m_sb)) { + if (sb->sb_features2 != 0) + xfs_scrub_block_set_corrupt(sc, bp); + } else { + v2_ok = XFS_SB_VERSION2_OKBITS; + if (XFS_SB_VERSION_NUM(&mp->m_sb) >= XFS_SB_VERSION_5) + v2_ok |= XFS_SB_VERSION2_CRCBIT; + + if (!!(sb->sb_features2 & cpu_to_be32(~v2_ok))) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_features2 != sb->sb_bad_features2) + xfs_scrub_block_set_preen(sc, bp); + } + + /* Check sb_features2 flags that are set at mkfs time. */ + features_mask = cpu_to_be32(XFS_SB_VERSION2_LAZYSBCOUNTBIT | + XFS_SB_VERSION2_PROJID32BIT | + XFS_SB_VERSION2_CRCBIT | + XFS_SB_VERSION2_FTYPE); + if ((sb->sb_features2 & features_mask) != + (cpu_to_be32(mp->m_sb.sb_features2) & features_mask)) + xfs_scrub_block_set_corrupt(sc, bp); + + /* Check sb_features2 flags that can be set after mkfs time. */ + features_mask = cpu_to_be32(XFS_SB_VERSION2_ATTR2BIT); + if ((sb->sb_features2 & features_mask) != + (cpu_to_be32(mp->m_sb.sb_features2) & features_mask)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (!xfs_sb_version_hascrc(&mp->m_sb)) { + /* all v5 fields must be zero */ + if (memchr_inv(&sb->sb_features_compat, 0, + sizeof(struct xfs_dsb) - + offsetof(struct xfs_dsb, sb_features_compat))) + xfs_scrub_block_set_corrupt(sc, bp); + } else { + /* Check compat flags; all are set at mkfs time. */ + features_mask = cpu_to_be32(XFS_SB_FEAT_COMPAT_UNKNOWN); + if ((sb->sb_features_compat & features_mask) != + (cpu_to_be32(mp->m_sb.sb_features_compat) & features_mask)) + xfs_scrub_block_set_corrupt(sc, bp); + + /* Check ro compat flags; all are set at mkfs time. */ + features_mask = cpu_to_be32(XFS_SB_FEAT_RO_COMPAT_UNKNOWN | + XFS_SB_FEAT_RO_COMPAT_FINOBT | + XFS_SB_FEAT_RO_COMPAT_RMAPBT | + XFS_SB_FEAT_RO_COMPAT_REFLINK); + if ((sb->sb_features_ro_compat & features_mask) != + (cpu_to_be32(mp->m_sb.sb_features_ro_compat) & + features_mask)) + xfs_scrub_block_set_corrupt(sc, bp); + + /* Check incompat flags; all are set at mkfs time. */ + features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_UNKNOWN | + XFS_SB_FEAT_INCOMPAT_FTYPE | + XFS_SB_FEAT_INCOMPAT_SPINODES | + XFS_SB_FEAT_INCOMPAT_META_UUID); + if ((sb->sb_features_incompat & features_mask) != + (cpu_to_be32(mp->m_sb.sb_features_incompat) & + features_mask)) + xfs_scrub_block_set_corrupt(sc, bp); + + /* Check log incompat flags; all are set at mkfs time. */ + features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN); + if ((sb->sb_features_log_incompat & features_mask) != + (cpu_to_be32(mp->m_sb.sb_features_log_incompat) & + features_mask)) + xfs_scrub_block_set_corrupt(sc, bp); + + /* Don't care about sb_crc */ + + if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align)) + xfs_scrub_block_set_corrupt(sc, bp); + + if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino)) + xfs_scrub_block_set_preen(sc, bp); + + /* Don't care about sb_lsn */ + } + + if (xfs_sb_version_hasmetauuid(&mp->m_sb)) { + /* The metadata UUID must be the same for all supers */ + if (!uuid_equal(&sb->sb_meta_uuid, &mp->m_sb.sb_meta_uuid)) + xfs_scrub_block_set_corrupt(sc, bp); + } + + /* Everything else must be zero. */ + if (memchr_inv(sb + 1, 0, + BBTOB(bp->b_length) - sizeof(struct xfs_dsb))) + xfs_scrub_block_set_corrupt(sc, bp); + + return error; +} + +/* AGF */ + +/* Scrub the AGF. */ +int +xfs_scrub_agf( + struct xfs_scrub_context *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_agf *agf; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_agblock_t eoag; + xfs_agblock_t agfl_first; + xfs_agblock_t agfl_last; + xfs_agblock_t agfl_count; + xfs_agblock_t fl_count; + int level; + int error = 0; + + agno = sc->sa.agno = sc->sm->sm_agno; + error = xfs_scrub_ag_read_headers(sc, agno, &sc->sa.agi_bp, + &sc->sa.agf_bp, &sc->sa.agfl_bp); + if (!xfs_scrub_process_error(sc, agno, XFS_AGF_BLOCK(sc->mp), &error)) + goto out; + + agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + + /* Check the AG length */ + eoag = be32_to_cpu(agf->agf_length); + if (eoag != xfs_ag_block_count(mp, agno)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + + /* Check the AGF btree roots and levels */ + agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]); + if (!xfs_verify_agbno(mp, agno, agbno)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + + agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]); + if (!xfs_verify_agbno(mp, agno, agbno)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + + level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); + if (level <= 0 || level > XFS_BTREE_MAXLEVELS) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + + level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); + if (level <= 0 || level > XFS_BTREE_MAXLEVELS) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]); + if (!xfs_verify_agbno(mp, agno, agbno)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + + level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + if (level <= 0 || level > XFS_BTREE_MAXLEVELS) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + } + + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + agbno = be32_to_cpu(agf->agf_refcount_root); + if (!xfs_verify_agbno(mp, agno, agbno)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + + level = be32_to_cpu(agf->agf_refcount_level); + if (level <= 0 || level > XFS_BTREE_MAXLEVELS) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + } + + /* Check the AGFL counters */ + agfl_first = be32_to_cpu(agf->agf_flfirst); + agfl_last = be32_to_cpu(agf->agf_fllast); + agfl_count = be32_to_cpu(agf->agf_flcount); + if (agfl_last > agfl_first) + fl_count = agfl_last - agfl_first + 1; + else + fl_count = XFS_AGFL_SIZE(mp) - agfl_first + agfl_last + 1; + if (agfl_count != 0 && fl_count != agfl_count) + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + +out: + return error; +} + +/* AGFL */ + +struct xfs_scrub_agfl_info { + unsigned int sz_entries; + unsigned int nr_entries; + xfs_agblock_t *entries; +}; + +/* Scrub an AGFL block. */ +STATIC int +xfs_scrub_agfl_block( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno, + void *priv) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_scrub_agfl_info *sai = priv; + xfs_agnumber_t agno = sc->sa.agno; + + if (xfs_verify_agbno(mp, agno, agbno) && + sai->nr_entries < sai->sz_entries) + sai->entries[sai->nr_entries++] = agbno; + else + xfs_scrub_block_set_corrupt(sc, sc->sa.agfl_bp); + + return 0; +} + +static int +xfs_scrub_agblock_cmp( + const void *pa, + const void *pb) +{ + const xfs_agblock_t *a = pa; + const xfs_agblock_t *b = pb; + + return (int)*a - (int)*b; +} + +/* Scrub the AGFL. */ +int +xfs_scrub_agfl( + struct xfs_scrub_context *sc) +{ + struct xfs_scrub_agfl_info sai = { 0 }; + struct xfs_agf *agf; + xfs_agnumber_t agno; + unsigned int agflcount; + unsigned int i; + int error; + + agno = sc->sa.agno = sc->sm->sm_agno; + error = xfs_scrub_ag_read_headers(sc, agno, &sc->sa.agi_bp, + &sc->sa.agf_bp, &sc->sa.agfl_bp); + if (!xfs_scrub_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error)) + goto out; + if (!sc->sa.agf_bp) + return -EFSCORRUPTED; + + /* Allocate buffer to ensure uniqueness of AGFL entries. */ + agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + agflcount = be32_to_cpu(agf->agf_flcount); + if (agflcount > XFS_AGFL_SIZE(sc->mp)) { + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + goto out; + } + sai.sz_entries = agflcount; + sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, KM_NOFS); + if (!sai.entries) { + error = -ENOMEM; + goto out; + } + + /* Check the blocks in the AGFL. */ + error = xfs_scrub_walk_agfl(sc, xfs_scrub_agfl_block, &sai); + if (error) + goto out_free; + + if (agflcount != sai.nr_entries) { + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + goto out_free; + } + + /* Sort entries, check for duplicates. */ + sort(sai.entries, sai.nr_entries, sizeof(sai.entries[0]), + xfs_scrub_agblock_cmp, NULL); + for (i = 1; i < sai.nr_entries; i++) { + if (sai.entries[i] == sai.entries[i - 1]) { + xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); + break; + } + } + +out_free: + kmem_free(sai.entries); +out: + return error; +} + +/* AGI */ + +/* Scrub the AGI. */ +int +xfs_scrub_agi( + struct xfs_scrub_context *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_agi *agi; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_agblock_t eoag; + xfs_agino_t agino; + xfs_agino_t first_agino; + xfs_agino_t last_agino; + xfs_agino_t icount; + int i; + int level; + int error = 0; + + agno = sc->sa.agno = sc->sm->sm_agno; + error = xfs_scrub_ag_read_headers(sc, agno, &sc->sa.agi_bp, + &sc->sa.agf_bp, &sc->sa.agfl_bp); + if (!xfs_scrub_process_error(sc, agno, XFS_AGI_BLOCK(sc->mp), &error)) + goto out; + + agi = XFS_BUF_TO_AGI(sc->sa.agi_bp); + + /* Check the AG length */ + eoag = be32_to_cpu(agi->agi_length); + if (eoag != xfs_ag_block_count(mp, agno)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + + /* Check btree roots and levels */ + agbno = be32_to_cpu(agi->agi_root); + if (!xfs_verify_agbno(mp, agno, agbno)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + + level = be32_to_cpu(agi->agi_level); + if (level <= 0 || level > XFS_BTREE_MAXLEVELS) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + agbno = be32_to_cpu(agi->agi_free_root); + if (!xfs_verify_agbno(mp, agno, agbno)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + + level = be32_to_cpu(agi->agi_free_level); + if (level <= 0 || level > XFS_BTREE_MAXLEVELS) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + } + + /* Check inode counters */ + xfs_ialloc_agino_range(mp, agno, &first_agino, &last_agino); + icount = be32_to_cpu(agi->agi_count); + if (icount > last_agino - first_agino + 1 || + icount < be32_to_cpu(agi->agi_freecount)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + + /* Check inode pointers */ + agino = be32_to_cpu(agi->agi_newino); + if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + + agino = be32_to_cpu(agi->agi_dirino); + if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + + /* Check unlinked inode buckets */ + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { + agino = be32_to_cpu(agi->agi_unlinked[i]); + if (agino == NULLAGINO) + continue; + if (!xfs_verify_agino(mp, agno, agino)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + } + + if (agi->agi_pad32 != cpu_to_be32(0)) + xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp); + +out: + return error; +} diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c new file mode 100644 index 000000000000..059663e13414 --- /dev/null +++ b/fs/xfs/scrub/alloc.c @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" + +/* + * Set us up to scrub free space btrees. + */ +int +xfs_scrub_setup_ag_allocbt( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + return xfs_scrub_setup_ag_btree(sc, ip, false); +} + +/* Free space btree scrubber. */ + +/* Scrub a bnobt/cntbt record. */ +STATIC int +xfs_scrub_allocbt_rec( + struct xfs_scrub_btree *bs, + union xfs_btree_rec *rec) +{ + struct xfs_mount *mp = bs->cur->bc_mp; + xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agblock_t bno; + xfs_extlen_t len; + int error = 0; + + bno = be32_to_cpu(rec->alloc.ar_startblock); + len = be32_to_cpu(rec->alloc.ar_blockcount); + + if (bno + len <= bno || + !xfs_verify_agbno(mp, agno, bno) || + !xfs_verify_agbno(mp, agno, bno + len - 1)) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + return error; +} + +/* Scrub the freespace btrees for some AG. */ +STATIC int +xfs_scrub_allocbt( + struct xfs_scrub_context *sc, + xfs_btnum_t which) +{ + struct xfs_owner_info oinfo; + struct xfs_btree_cur *cur; + + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); + cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur; + return xfs_scrub_btree(sc, cur, xfs_scrub_allocbt_rec, &oinfo, NULL); +} + +int +xfs_scrub_bnobt( + struct xfs_scrub_context *sc) +{ + return xfs_scrub_allocbt(sc, XFS_BTNUM_BNO); +} + +int +xfs_scrub_cntbt( + struct xfs_scrub_context *sc) +{ + return xfs_scrub_allocbt(sc, XFS_BTNUM_CNT); +} diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c new file mode 100644 index 000000000000..4ed80474f545 --- /dev/null +++ b/fs/xfs/scrub/attr.c @@ -0,0 +1,471 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/dabtree.h" +#include "scrub/trace.h" + +#include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> + +/* Set us up to scrub an inode's extended attributes. */ +int +xfs_scrub_setup_xattr( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + size_t sz; + + /* + * Allocate the buffer without the inode lock held. We need enough + * space to read every xattr value in the file or enough space to + * hold three copies of the xattr free space bitmap. (Not both at + * the same time.) + */ + sz = max_t(size_t, XATTR_SIZE_MAX, 3 * sizeof(long) * + BITS_TO_LONGS(sc->mp->m_attr_geo->blksize)); + sc->buf = kmem_zalloc_large(sz, KM_SLEEP); + if (!sc->buf) + return -ENOMEM; + + return xfs_scrub_setup_inode_contents(sc, ip, 0); +} + +/* Extended Attributes */ + +struct xfs_scrub_xattr { + struct xfs_attr_list_context context; + struct xfs_scrub_context *sc; +}; + +/* + * Check that an extended attribute key can be looked up by hash. + * + * We use the XFS attribute list iterator (i.e. xfs_attr_list_int_ilocked) + * to call this function for every attribute key in an inode. Once + * we're here, we load the attribute value to see if any errors happen, + * or if we get more or less data than we expected. + */ +static void +xfs_scrub_xattr_listent( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + int valuelen) +{ + struct xfs_scrub_xattr *sx; + struct xfs_da_args args = { NULL }; + int error = 0; + + sx = container_of(context, struct xfs_scrub_xattr, context); + + if (flags & XFS_ATTR_INCOMPLETE) { + /* Incomplete attr key, just mark the inode for preening. */ + xfs_scrub_ino_set_preen(sx->sc, context->dp->i_ino, NULL); + return; + } + + args.flags = ATTR_KERNOTIME; + if (flags & XFS_ATTR_ROOT) + args.flags |= ATTR_ROOT; + else if (flags & XFS_ATTR_SECURE) + args.flags |= ATTR_SECURE; + args.geo = context->dp->i_mount->m_attr_geo; + args.whichfork = XFS_ATTR_FORK; + args.dp = context->dp; + args.name = name; + args.namelen = namelen; + args.hashval = xfs_da_hashname(args.name, args.namelen); + args.trans = context->tp; + args.value = sx->sc->buf; + args.valuelen = XATTR_SIZE_MAX; + + error = xfs_attr_get_ilocked(context->dp, &args); + if (error == -EEXIST) + error = 0; + if (!xfs_scrub_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, + &error)) + goto fail_xref; + if (args.valuelen != valuelen) + xfs_scrub_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, + args.blkno); + +fail_xref: + return; +} + +/* + * Mark a range [start, start+len) in this map. Returns true if the + * region was free, and false if there's a conflict or a problem. + * + * Within a char, the lowest bit of the char represents the byte with + * the smallest address + */ +STATIC bool +xfs_scrub_xattr_set_map( + struct xfs_scrub_context *sc, + unsigned long *map, + unsigned int start, + unsigned int len) +{ + unsigned int mapsize = sc->mp->m_attr_geo->blksize; + bool ret = true; + + if (start >= mapsize) + return false; + if (start + len > mapsize) { + len = mapsize - start; + ret = false; + } + + if (find_next_bit(map, mapsize, start) < start + len) + ret = false; + bitmap_set(map, start, len); + + return ret; +} + +/* + * Check the leaf freemap from the usage bitmap. Returns false if the + * attr freemap has problems or points to used space. + */ +STATIC bool +xfs_scrub_xattr_check_freemap( + struct xfs_scrub_context *sc, + unsigned long *map, + struct xfs_attr3_icleaf_hdr *leafhdr) +{ + unsigned long *freemap; + unsigned long *dstmap; + unsigned int mapsize = sc->mp->m_attr_geo->blksize; + int i; + + /* Construct bitmap of freemap contents. */ + freemap = (unsigned long *)sc->buf + BITS_TO_LONGS(mapsize); + bitmap_zero(freemap, mapsize); + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + if (!xfs_scrub_xattr_set_map(sc, freemap, + leafhdr->freemap[i].base, + leafhdr->freemap[i].size)) + return false; + } + + /* Look for bits that are set in freemap and are marked in use. */ + dstmap = freemap + BITS_TO_LONGS(mapsize); + return bitmap_and(dstmap, freemap, map, mapsize) == 0; +} + +/* + * Check this leaf entry's relations to everything else. + * Returns the number of bytes used for the name/value data. + */ +STATIC void +xfs_scrub_xattr_entry( + struct xfs_scrub_da_btree *ds, + int level, + char *buf_end, + struct xfs_attr_leafblock *leaf, + struct xfs_attr3_icleaf_hdr *leafhdr, + unsigned long *usedmap, + struct xfs_attr_leaf_entry *ent, + int idx, + unsigned int *usedbytes, + __u32 *last_hashval) +{ + struct xfs_mount *mp = ds->state->mp; + char *name_end; + struct xfs_attr_leaf_name_local *lentry; + struct xfs_attr_leaf_name_remote *rentry; + unsigned int nameidx; + unsigned int namesize; + + if (ent->pad2 != 0) + xfs_scrub_da_set_corrupt(ds, level); + + /* Hash values in order? */ + if (be32_to_cpu(ent->hashval) < *last_hashval) + xfs_scrub_da_set_corrupt(ds, level); + *last_hashval = be32_to_cpu(ent->hashval); + + nameidx = be16_to_cpu(ent->nameidx); + if (nameidx < leafhdr->firstused || + nameidx >= mp->m_attr_geo->blksize) { + xfs_scrub_da_set_corrupt(ds, level); + return; + } + + /* Check the name information. */ + if (ent->flags & XFS_ATTR_LOCAL) { + lentry = xfs_attr3_leaf_name_local(leaf, idx); + namesize = xfs_attr_leaf_entsize_local(lentry->namelen, + be16_to_cpu(lentry->valuelen)); + name_end = (char *)lentry + namesize; + if (lentry->namelen == 0) + xfs_scrub_da_set_corrupt(ds, level); + } else { + rentry = xfs_attr3_leaf_name_remote(leaf, idx); + namesize = xfs_attr_leaf_entsize_remote(rentry->namelen); + name_end = (char *)rentry + namesize; + if (rentry->namelen == 0 || rentry->valueblk == 0) + xfs_scrub_da_set_corrupt(ds, level); + } + if (name_end > buf_end) + xfs_scrub_da_set_corrupt(ds, level); + + if (!xfs_scrub_xattr_set_map(ds->sc, usedmap, nameidx, namesize)) + xfs_scrub_da_set_corrupt(ds, level); + if (!(ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + *usedbytes += namesize; +} + +/* Scrub an attribute leaf. */ +STATIC int +xfs_scrub_xattr_block( + struct xfs_scrub_da_btree *ds, + int level) +{ + struct xfs_attr3_icleaf_hdr leafhdr; + struct xfs_mount *mp = ds->state->mp; + struct xfs_da_state_blk *blk = &ds->state->path.blk[level]; + struct xfs_buf *bp = blk->bp; + xfs_dablk_t *last_checked = ds->private; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_attr_leaf_entry *ent; + struct xfs_attr_leaf_entry *entries; + unsigned long *usedmap = ds->sc->buf; + char *buf_end; + size_t off; + __u32 last_hashval = 0; + unsigned int usedbytes = 0; + unsigned int hdrsize; + int i; + + if (*last_checked == blk->blkno) + return 0; + *last_checked = blk->blkno; + bitmap_zero(usedmap, mp->m_attr_geo->blksize); + + /* Check all the padding. */ + if (xfs_sb_version_hascrc(&ds->sc->mp->m_sb)) { + struct xfs_attr3_leafblock *leaf = bp->b_addr; + + if (leaf->hdr.pad1 != 0 || leaf->hdr.pad2 != 0 || + leaf->hdr.info.hdr.pad != 0) + xfs_scrub_da_set_corrupt(ds, level); + } else { + if (leaf->hdr.pad1 != 0 || leaf->hdr.info.pad != 0) + xfs_scrub_da_set_corrupt(ds, level); + } + + /* Check the leaf header */ + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); + hdrsize = xfs_attr3_leaf_hdr_size(leaf); + + if (leafhdr.usedbytes > mp->m_attr_geo->blksize) + xfs_scrub_da_set_corrupt(ds, level); + if (leafhdr.firstused > mp->m_attr_geo->blksize) + xfs_scrub_da_set_corrupt(ds, level); + if (leafhdr.firstused < hdrsize) + xfs_scrub_da_set_corrupt(ds, level); + if (!xfs_scrub_xattr_set_map(ds->sc, usedmap, 0, hdrsize)) + xfs_scrub_da_set_corrupt(ds, level); + + if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + entries = xfs_attr3_leaf_entryp(leaf); + if ((char *)&entries[leafhdr.count] > (char *)leaf + leafhdr.firstused) + xfs_scrub_da_set_corrupt(ds, level); + + buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize; + for (i = 0, ent = entries; i < leafhdr.count; ent++, i++) { + /* Mark the leaf entry itself. */ + off = (char *)ent - (char *)leaf; + if (!xfs_scrub_xattr_set_map(ds->sc, usedmap, off, + sizeof(xfs_attr_leaf_entry_t))) { + xfs_scrub_da_set_corrupt(ds, level); + goto out; + } + + /* Check the entry and nameval. */ + xfs_scrub_xattr_entry(ds, level, buf_end, leaf, &leafhdr, + usedmap, ent, i, &usedbytes, &last_hashval); + + if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + } + + if (!xfs_scrub_xattr_check_freemap(ds->sc, usedmap, &leafhdr)) + xfs_scrub_da_set_corrupt(ds, level); + + if (leafhdr.usedbytes != usedbytes) + xfs_scrub_da_set_corrupt(ds, level); + +out: + return 0; +} + +/* Scrub a attribute btree record. */ +STATIC int +xfs_scrub_xattr_rec( + struct xfs_scrub_da_btree *ds, + int level, + void *rec) +{ + struct xfs_mount *mp = ds->state->mp; + struct xfs_attr_leaf_entry *ent = rec; + struct xfs_da_state_blk *blk; + struct xfs_attr_leaf_name_local *lentry; + struct xfs_attr_leaf_name_remote *rentry; + struct xfs_buf *bp; + xfs_dahash_t calc_hash; + xfs_dahash_t hash; + int nameidx; + int hdrsize; + unsigned int badflags; + int error; + + blk = &ds->state->path.blk[level]; + + /* Check the whole block, if necessary. */ + error = xfs_scrub_xattr_block(ds, level); + if (error) + goto out; + if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* Check the hash of the entry. */ + error = xfs_scrub_da_btree_hash(ds, level, &ent->hashval); + if (error) + goto out; + + /* Find the attr entry's location. */ + bp = blk->bp; + hdrsize = xfs_attr3_leaf_hdr_size(bp->b_addr); + nameidx = be16_to_cpu(ent->nameidx); + if (nameidx < hdrsize || nameidx >= mp->m_attr_geo->blksize) { + xfs_scrub_da_set_corrupt(ds, level); + goto out; + } + + /* Retrieve the entry and check it. */ + hash = be32_to_cpu(ent->hashval); + badflags = ~(XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_SECURE | + XFS_ATTR_INCOMPLETE); + if ((ent->flags & badflags) != 0) + xfs_scrub_da_set_corrupt(ds, level); + if (ent->flags & XFS_ATTR_LOCAL) { + lentry = (struct xfs_attr_leaf_name_local *) + (((char *)bp->b_addr) + nameidx); + if (lentry->namelen <= 0) { + xfs_scrub_da_set_corrupt(ds, level); + goto out; + } + calc_hash = xfs_da_hashname(lentry->nameval, lentry->namelen); + } else { + rentry = (struct xfs_attr_leaf_name_remote *) + (((char *)bp->b_addr) + nameidx); + if (rentry->namelen <= 0) { + xfs_scrub_da_set_corrupt(ds, level); + goto out; + } + calc_hash = xfs_da_hashname(rentry->name, rentry->namelen); + } + if (calc_hash != hash) + xfs_scrub_da_set_corrupt(ds, level); + +out: + return error; +} + +/* Scrub the extended attribute metadata. */ +int +xfs_scrub_xattr( + struct xfs_scrub_context *sc) +{ + struct xfs_scrub_xattr sx; + struct attrlist_cursor_kern cursor = { 0 }; + xfs_dablk_t last_checked = -1U; + int error = 0; + + if (!xfs_inode_hasattr(sc->ip)) + return -ENOENT; + + memset(&sx, 0, sizeof(sx)); + /* Check attribute tree structure */ + error = xfs_scrub_da_btree(sc, XFS_ATTR_FORK, xfs_scrub_xattr_rec, + &last_checked); + if (error) + goto out; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* Check that every attr key can also be looked up by hash. */ + sx.context.dp = sc->ip; + sx.context.cursor = &cursor; + sx.context.resynch = 1; + sx.context.put_listent = xfs_scrub_xattr_listent; + sx.context.tp = sc->tp; + sx.context.flags = ATTR_INCOMPLETE; + sx.sc = sc; + + /* + * Look up every xattr in this file by name. + * + * Use the backend implementation of xfs_attr_list to call + * xfs_scrub_xattr_listent on every attribute key in this inode. + * In other words, we use the same iterator/callback mechanism + * that listattr uses to scrub extended attributes, though in our + * _listent function, we check the value of the attribute. + * + * The VFS only locks i_rwsem when modifying attrs, so keep all + * three locks held because that's the only way to ensure we're + * the only thread poking into the da btree. We traverse the da + * btree while holding a leaf buffer locked for the xattr name + * iteration, which doesn't really follow the usual buffer + * locking order. + */ + error = xfs_attr_list_int_ilocked(&sx.context); + if (!xfs_scrub_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error)) + goto out; +out: + return error; +} diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c new file mode 100644 index 000000000000..42fec0bcd9e1 --- /dev/null +++ b/fs/xfs/scrub/bmap.c @@ -0,0 +1,363 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_rmap.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" + +/* Set us up with an inode's bmap. */ +int +xfs_scrub_setup_inode_bmap( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = sc->mp; + int error; + + error = xfs_scrub_get_inode(sc, ip); + if (error) + goto out; + + sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + xfs_ilock(sc->ip, sc->ilock_flags); + + /* + * We don't want any ephemeral data fork updates sitting around + * while we inspect block mappings, so wait for directio to finish + * and flush dirty data if we have delalloc reservations. + */ + if (S_ISREG(VFS_I(sc->ip)->i_mode) && + sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) { + inode_dio_wait(VFS_I(sc->ip)); + error = filemap_write_and_wait(VFS_I(sc->ip)->i_mapping); + if (error) + goto out; + } + + /* Got the inode, lock it and we're ready to go. */ + error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); + if (error) + goto out; + sc->ilock_flags |= XFS_ILOCK_EXCL; + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + +out: + /* scrub teardown will unlock and release the inode */ + return error; +} + +/* + * Inode fork block mapping (BMBT) scrubber. + * More complex than the others because we have to scrub + * all the extents regardless of whether or not the fork + * is in btree format. + */ + +struct xfs_scrub_bmap_info { + struct xfs_scrub_context *sc; + xfs_fileoff_t lastoff; + bool is_rt; + bool is_shared; + int whichfork; +}; + +/* Scrub a single extent record. */ +STATIC int +xfs_scrub_bmap_extent( + struct xfs_inode *ip, + struct xfs_btree_cur *cur, + struct xfs_scrub_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = info->sc->mp; + struct xfs_buf *bp = NULL; + int error = 0; + + if (cur) + xfs_btree_get_block(cur, 0, &bp); + + /* + * Check for out-of-order extents. This record could have come + * from the incore list, for which there is no ordering check. + */ + if (irec->br_startoff < info->lastoff) + xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* There should never be a "hole" extent in either extent list. */ + if (irec->br_startblock == HOLESTARTBLOCK) + xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* + * Check for delalloc extents. We never iterate the ones in the + * in-core extent scan, and we should never see these in the bmbt. + */ + if (isnullstartblock(irec->br_startblock)) + xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* Make sure the extent points to a valid place. */ + if (irec->br_startblock + irec->br_blockcount <= irec->br_startblock) + xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + if (info->is_rt && + (!xfs_verify_rtbno(mp, irec->br_startblock) || + !xfs_verify_rtbno(mp, irec->br_startblock + + irec->br_blockcount - 1))) + xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + if (!info->is_rt && + (!xfs_verify_fsbno(mp, irec->br_startblock) || + !xfs_verify_fsbno(mp, irec->br_startblock + + irec->br_blockcount - 1))) + xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* We don't allow unwritten extents on attr forks. */ + if (irec->br_state == XFS_EXT_UNWRITTEN && + info->whichfork == XFS_ATTR_FORK) + xfs_scrub_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + info->lastoff = irec->br_startoff + irec->br_blockcount; + return error; +} + +/* Scrub a bmbt record. */ +STATIC int +xfs_scrub_bmapbt_rec( + struct xfs_scrub_btree *bs, + union xfs_btree_rec *rec) +{ + struct xfs_bmbt_irec irec; + struct xfs_scrub_bmap_info *info = bs->private; + struct xfs_inode *ip = bs->cur->bc_private.b.ip; + struct xfs_buf *bp = NULL; + struct xfs_btree_block *block; + uint64_t owner; + int i; + + /* + * Check the owners of the btree blocks up to the level below + * the root since the verifiers don't do that. + */ + if (xfs_sb_version_hascrc(&bs->cur->bc_mp->m_sb) && + bs->cur->bc_ptrs[0] == 1) { + for (i = 0; i < bs->cur->bc_nlevels - 1; i++) { + block = xfs_btree_get_block(bs->cur, i, &bp); + owner = be64_to_cpu(block->bb_u.l.bb_owner); + if (owner != ip->i_ino) + xfs_scrub_fblock_set_corrupt(bs->sc, + info->whichfork, 0); + } + } + + /* Set up the in-core record and scrub it. */ + xfs_bmbt_disk_get_all(&rec->bmbt, &irec); + return xfs_scrub_bmap_extent(ip, bs->cur, info, &irec); +} + +/* Scan the btree records. */ +STATIC int +xfs_scrub_bmap_btree( + struct xfs_scrub_context *sc, + int whichfork, + struct xfs_scrub_bmap_info *info) +{ + struct xfs_owner_info oinfo; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + struct xfs_btree_cur *cur; + int error; + + cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork); + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); + error = xfs_scrub_btree(sc, cur, xfs_scrub_bmapbt_rec, &oinfo, info); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : + XFS_BTREE_NOERROR); + return error; +} + +/* + * Scrub an inode fork's block mappings. + * + * First we scan every record in every btree block, if applicable. + * Then we unconditionally scan the incore extent cache. + */ +STATIC int +xfs_scrub_bmap( + struct xfs_scrub_context *sc, + int whichfork) +{ + struct xfs_bmbt_irec irec; + struct xfs_scrub_bmap_info info = { NULL }; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + struct xfs_ifork *ifp; + xfs_fileoff_t endoff; + struct xfs_iext_cursor icur; + bool found; + int error = 0; + + ifp = XFS_IFORK_PTR(ip, whichfork); + + info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip); + info.whichfork = whichfork; + info.is_shared = whichfork == XFS_DATA_FORK && xfs_is_reflink_inode(ip); + info.sc = sc; + + switch (whichfork) { + case XFS_COW_FORK: + /* Non-existent CoW forks are ignorable. */ + if (!ifp) + goto out; + /* No CoW forks on non-reflink inodes/filesystems. */ + if (!xfs_is_reflink_inode(ip)) { + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL); + goto out; + } + break; + case XFS_ATTR_FORK: + if (!ifp) + goto out; + if (!xfs_sb_version_hasattr(&mp->m_sb) && + !xfs_sb_version_hasattr2(&mp->m_sb)) + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL); + break; + default: + ASSERT(whichfork == XFS_DATA_FORK); + break; + } + + /* Check the fork values */ + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_UUID: + case XFS_DINODE_FMT_DEV: + case XFS_DINODE_FMT_LOCAL: + /* No mappings to check. */ + goto out; + case XFS_DINODE_FMT_EXTENTS: + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + xfs_scrub_fblock_set_corrupt(sc, whichfork, 0); + goto out; + } + break; + case XFS_DINODE_FMT_BTREE: + if (whichfork == XFS_COW_FORK) { + xfs_scrub_fblock_set_corrupt(sc, whichfork, 0); + goto out; + } + + error = xfs_scrub_bmap_btree(sc, whichfork, &info); + if (error) + goto out; + break; + default: + xfs_scrub_fblock_set_corrupt(sc, whichfork, 0); + goto out; + } + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* Now try to scrub the in-memory extent list. */ + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(sc->tp, ip, whichfork); + if (!xfs_scrub_fblock_process_error(sc, whichfork, 0, &error)) + goto out; + } + + /* Find the offset of the last extent in the mapping. */ + error = xfs_bmap_last_offset(ip, &endoff, whichfork); + if (!xfs_scrub_fblock_process_error(sc, whichfork, 0, &error)) + goto out; + + /* Scrub extent records. */ + info.lastoff = 0; + ifp = XFS_IFORK_PTR(ip, whichfork); + for (found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &irec); + found != 0; + found = xfs_iext_next_extent(ifp, &icur, &irec)) { + if (xfs_scrub_should_terminate(sc, &error)) + break; + if (isnullstartblock(irec.br_startblock)) + continue; + if (irec.br_startoff >= endoff) { + xfs_scrub_fblock_set_corrupt(sc, whichfork, + irec.br_startoff); + goto out; + } + error = xfs_scrub_bmap_extent(ip, NULL, &info, &irec); + if (error) + goto out; + } + +out: + return error; +} + +/* Scrub an inode's data fork. */ +int +xfs_scrub_bmap_data( + struct xfs_scrub_context *sc) +{ + return xfs_scrub_bmap(sc, XFS_DATA_FORK); +} + +/* Scrub an inode's attr fork. */ +int +xfs_scrub_bmap_attr( + struct xfs_scrub_context *sc) +{ + return xfs_scrub_bmap(sc, XFS_ATTR_FORK); +} + +/* Scrub an inode's CoW fork. */ +int +xfs_scrub_bmap_cow( + struct xfs_scrub_context *sc) +{ + if (!xfs_is_reflink_inode(sc->ip)) + return -ENOENT; + + return xfs_scrub_bmap(sc, XFS_COW_FORK); +} diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c new file mode 100644 index 000000000000..df0766132ace --- /dev/null +++ b/fs/xfs/scrub/btree.c @@ -0,0 +1,516 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" + +/* btree scrubbing */ + +/* + * Check for btree operation errors. See the section about handling + * operational errors in common.c. + */ +bool +xfs_scrub_btree_process_error( + struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, + int level, + int *error) +{ + if (*error == 0) + return true; + + switch (*error) { + case -EDEADLOCK: + /* Used to restart an op with deadlock avoidance. */ + trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error); + break; + case -EFSBADCRC: + case -EFSCORRUPTED: + /* Note the badness but don't abort. */ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + *error = 0; + /* fall through */ + default: + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + trace_xfs_scrub_ifork_btree_op_error(sc, cur, level, + *error, __return_address); + else + trace_xfs_scrub_btree_op_error(sc, cur, level, + *error, __return_address); + break; + } + return false; +} + +/* Record btree block corruption. */ +void +xfs_scrub_btree_set_corrupt( + struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, + int level) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + trace_xfs_scrub_ifork_btree_error(sc, cur, level, + __return_address); + else + trace_xfs_scrub_btree_error(sc, cur, level, + __return_address); +} + +/* + * Make sure this record is in order and doesn't stray outside of the parent + * keys. + */ +STATIC void +xfs_scrub_btree_rec( + struct xfs_scrub_btree *bs) +{ + struct xfs_btree_cur *cur = bs->cur; + union xfs_btree_rec *rec; + union xfs_btree_key key; + union xfs_btree_key hkey; + union xfs_btree_key *keyp; + struct xfs_btree_block *block; + struct xfs_btree_block *keyblock; + struct xfs_buf *bp; + + block = xfs_btree_get_block(cur, 0, &bp); + rec = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block); + + trace_xfs_scrub_btree_rec(bs->sc, cur, 0); + + /* If this isn't the first record, are they in order? */ + if (!bs->firstrec && !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec)) + xfs_scrub_btree_set_corrupt(bs->sc, cur, 0); + bs->firstrec = false; + memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len); + + if (cur->bc_nlevels == 1) + return; + + /* Is this at least as large as the parent low key? */ + cur->bc_ops->init_key_from_rec(&key, rec); + keyblock = xfs_btree_get_block(cur, 1, &bp); + keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[1], keyblock); + if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0) + xfs_scrub_btree_set_corrupt(bs->sc, cur, 1); + + if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + return; + + /* Is this no larger than the parent high key? */ + cur->bc_ops->init_high_key_from_rec(&hkey, rec); + keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[1], keyblock); + if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0) + xfs_scrub_btree_set_corrupt(bs->sc, cur, 1); +} + +/* + * Make sure this key is in order and doesn't stray outside of the parent + * keys. + */ +STATIC void +xfs_scrub_btree_key( + struct xfs_scrub_btree *bs, + int level) +{ + struct xfs_btree_cur *cur = bs->cur; + union xfs_btree_key *key; + union xfs_btree_key *keyp; + struct xfs_btree_block *block; + struct xfs_btree_block *keyblock; + struct xfs_buf *bp; + + block = xfs_btree_get_block(cur, level, &bp); + key = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block); + + trace_xfs_scrub_btree_key(bs->sc, cur, level); + + /* If this isn't the first key, are they in order? */ + if (!bs->firstkey[level] && + !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level], key)) + xfs_scrub_btree_set_corrupt(bs->sc, cur, level); + bs->firstkey[level] = false; + memcpy(&bs->lastkey[level], key, cur->bc_ops->key_len); + + if (level + 1 >= cur->bc_nlevels) + return; + + /* Is this at least as large as the parent low key? */ + keyblock = xfs_btree_get_block(cur, level + 1, &bp); + keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1], keyblock); + if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0) + xfs_scrub_btree_set_corrupt(bs->sc, cur, level); + + if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + return; + + /* Is this no larger than the parent high key? */ + key = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block); + keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1], keyblock); + if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0) + xfs_scrub_btree_set_corrupt(bs->sc, cur, level); +} + +/* + * Check a btree pointer. Returns true if it's ok to use this pointer. + * Callers do not need to set the corrupt flag. + */ +static bool +xfs_scrub_btree_ptr_ok( + struct xfs_scrub_btree *bs, + int level, + union xfs_btree_ptr *ptr) +{ + bool res; + + /* A btree rooted in an inode has no block pointer to the root. */ + if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == bs->cur->bc_nlevels) + return true; + + /* Otherwise, check the pointers. */ + if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS) + res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level); + else + res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level); + if (!res) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, level); + + return res; +} + +/* Check that a btree block's sibling matches what we expect it. */ +STATIC int +xfs_scrub_btree_block_check_sibling( + struct xfs_scrub_btree *bs, + int level, + int direction, + union xfs_btree_ptr *sibling) +{ + struct xfs_btree_cur *cur = bs->cur; + struct xfs_btree_block *pblock; + struct xfs_buf *pbp; + struct xfs_btree_cur *ncur = NULL; + union xfs_btree_ptr *pp; + int success; + int error; + + error = xfs_btree_dup_cursor(cur, &ncur); + if (!xfs_scrub_btree_process_error(bs->sc, cur, level + 1, &error) || + !ncur) + return error; + + /* + * If the pointer is null, we shouldn't be able to move the upper + * level pointer anywhere. + */ + if (xfs_btree_ptr_is_null(cur, sibling)) { + if (direction > 0) + error = xfs_btree_increment(ncur, level + 1, &success); + else + error = xfs_btree_decrement(ncur, level + 1, &success); + if (error == 0 && success) + xfs_scrub_btree_set_corrupt(bs->sc, cur, level); + error = 0; + goto out; + } + + /* Increment upper level pointer. */ + if (direction > 0) + error = xfs_btree_increment(ncur, level + 1, &success); + else + error = xfs_btree_decrement(ncur, level + 1, &success); + if (!xfs_scrub_btree_process_error(bs->sc, cur, level + 1, &error)) + goto out; + if (!success) { + xfs_scrub_btree_set_corrupt(bs->sc, cur, level + 1); + goto out; + } + + /* Compare upper level pointer to sibling pointer. */ + pblock = xfs_btree_get_block(ncur, level + 1, &pbp); + pp = xfs_btree_ptr_addr(ncur, ncur->bc_ptrs[level + 1], pblock); + if (!xfs_scrub_btree_ptr_ok(bs, level + 1, pp)) + goto out; + + if (xfs_btree_diff_two_ptrs(cur, pp, sibling)) + xfs_scrub_btree_set_corrupt(bs->sc, cur, level); +out: + xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR); + return error; +} + +/* Check the siblings of a btree block. */ +STATIC int +xfs_scrub_btree_block_check_siblings( + struct xfs_scrub_btree *bs, + struct xfs_btree_block *block) +{ + struct xfs_btree_cur *cur = bs->cur; + union xfs_btree_ptr leftsib; + union xfs_btree_ptr rightsib; + int level; + int error = 0; + + xfs_btree_get_sibling(cur, block, &leftsib, XFS_BB_LEFTSIB); + xfs_btree_get_sibling(cur, block, &rightsib, XFS_BB_RIGHTSIB); + level = xfs_btree_get_level(block); + + /* Root block should never have siblings. */ + if (level == cur->bc_nlevels - 1) { + if (!xfs_btree_ptr_is_null(cur, &leftsib) || + !xfs_btree_ptr_is_null(cur, &rightsib)) + xfs_scrub_btree_set_corrupt(bs->sc, cur, level); + goto out; + } + + /* + * Does the left & right sibling pointers match the adjacent + * parent level pointers? + * (These function absorbs error codes for us.) + */ + error = xfs_scrub_btree_block_check_sibling(bs, level, -1, &leftsib); + if (error) + return error; + error = xfs_scrub_btree_block_check_sibling(bs, level, 1, &rightsib); + if (error) + return error; +out: + return error; +} + +/* + * Grab and scrub a btree block given a btree pointer. Returns block + * and buffer pointers (if applicable) if they're ok to use. + */ +STATIC int +xfs_scrub_btree_get_block( + struct xfs_scrub_btree *bs, + int level, + union xfs_btree_ptr *pp, + struct xfs_btree_block **pblock, + struct xfs_buf **pbp) +{ + void *failed_at; + int error; + + *pblock = NULL; + *pbp = NULL; + + error = xfs_btree_lookup_get_block(bs->cur, level, pp, pblock); + if (!xfs_scrub_btree_process_error(bs->sc, bs->cur, level, &error) || + !*pblock) + return error; + + xfs_btree_get_block(bs->cur, level, pbp); + if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS) + failed_at = __xfs_btree_check_lblock(bs->cur, *pblock, + level, *pbp); + else + failed_at = __xfs_btree_check_sblock(bs->cur, *pblock, + level, *pbp); + if (failed_at) { + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, level); + return 0; + } + + /* + * Check the block's siblings; this function absorbs error codes + * for us. + */ + return xfs_scrub_btree_block_check_siblings(bs, *pblock); +} + +/* + * Check that the low and high keys of this block match the keys stored + * in the parent block. + */ +STATIC void +xfs_scrub_btree_block_keys( + struct xfs_scrub_btree *bs, + int level, + struct xfs_btree_block *block) +{ + union xfs_btree_key block_keys; + struct xfs_btree_cur *cur = bs->cur; + union xfs_btree_key *high_bk; + union xfs_btree_key *parent_keys; + union xfs_btree_key *high_pk; + struct xfs_btree_block *parent_block; + struct xfs_buf *bp; + + if (level >= cur->bc_nlevels - 1) + return; + + /* Calculate the keys for this block. */ + xfs_btree_get_keys(cur, block, &block_keys); + + /* Obtain the parent's copy of the keys for this block. */ + parent_block = xfs_btree_get_block(cur, level + 1, &bp); + parent_keys = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1], + parent_block); + + if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0) + xfs_scrub_btree_set_corrupt(bs->sc, cur, 1); + + if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + return; + + /* Get high keys */ + high_bk = xfs_btree_high_key_from_key(cur, &block_keys); + high_pk = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1], + parent_block); + + if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0) + xfs_scrub_btree_set_corrupt(bs->sc, cur, 1); +} + +/* + * Visit all nodes and leaves of a btree. Check that all pointers and + * records are in order, that the keys reflect the records, and use a callback + * so that the caller can verify individual records. + */ +int +xfs_scrub_btree( + struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, + xfs_scrub_btree_rec_fn scrub_fn, + struct xfs_owner_info *oinfo, + void *private) +{ + struct xfs_scrub_btree bs = { NULL }; + union xfs_btree_ptr ptr; + union xfs_btree_ptr *pp; + union xfs_btree_rec *recp; + struct xfs_btree_block *block; + int level; + struct xfs_buf *bp; + int i; + int error = 0; + + /* Initialize scrub state */ + bs.cur = cur; + bs.scrub_rec = scrub_fn; + bs.oinfo = oinfo; + bs.firstrec = true; + bs.private = private; + bs.sc = sc; + for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) + bs.firstkey[i] = true; + INIT_LIST_HEAD(&bs.to_check); + + /* Don't try to check a tree with a height we can't handle. */ + if (cur->bc_nlevels > XFS_BTREE_MAXLEVELS) { + xfs_scrub_btree_set_corrupt(sc, cur, 0); + goto out; + } + + /* + * Load the root of the btree. The helper function absorbs + * error codes for us. + */ + level = cur->bc_nlevels - 1; + cur->bc_ops->init_ptr_from_cur(cur, &ptr); + if (!xfs_scrub_btree_ptr_ok(&bs, cur->bc_nlevels, &ptr)) + goto out; + error = xfs_scrub_btree_get_block(&bs, level, &ptr, &block, &bp); + if (error || !block) + goto out; + + cur->bc_ptrs[level] = 1; + + while (level < cur->bc_nlevels) { + block = xfs_btree_get_block(cur, level, &bp); + + if (level == 0) { + /* End of leaf, pop back towards the root. */ + if (cur->bc_ptrs[level] > + be16_to_cpu(block->bb_numrecs)) { + xfs_scrub_btree_block_keys(&bs, level, block); + if (level < cur->bc_nlevels - 1) + cur->bc_ptrs[level + 1]++; + level++; + continue; + } + + /* Records in order for scrub? */ + xfs_scrub_btree_rec(&bs); + + /* Call out to the record checker. */ + recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block); + error = bs.scrub_rec(&bs, recp); + if (error) + break; + if (xfs_scrub_should_terminate(sc, &error) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + break; + + cur->bc_ptrs[level]++; + continue; + } + + /* End of node, pop back towards the root. */ + if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) { + xfs_scrub_btree_block_keys(&bs, level, block); + if (level < cur->bc_nlevels - 1) + cur->bc_ptrs[level + 1]++; + level++; + continue; + } + + /* Keys in order for scrub? */ + xfs_scrub_btree_key(&bs, level); + + /* Drill another level deeper. */ + pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block); + if (!xfs_scrub_btree_ptr_ok(&bs, level, pp)) { + cur->bc_ptrs[level]++; + continue; + } + level--; + error = xfs_scrub_btree_get_block(&bs, level, pp, &block, &bp); + if (error || !block) + goto out; + + cur->bc_ptrs[level] = 1; + } + +out: + return error; +} diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h new file mode 100644 index 000000000000..4de825a626d1 --- /dev/null +++ b/fs/xfs/scrub/btree.h @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_SCRUB_BTREE_H__ +#define __XFS_SCRUB_BTREE_H__ + +/* btree scrub */ + +/* Check for btree operation errors. */ +bool xfs_scrub_btree_process_error(struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, int level, int *error); + +/* Check for btree corruption. */ +void xfs_scrub_btree_set_corrupt(struct xfs_scrub_context *sc, + struct xfs_btree_cur *cur, int level); + +struct xfs_scrub_btree; +typedef int (*xfs_scrub_btree_rec_fn)( + struct xfs_scrub_btree *bs, + union xfs_btree_rec *rec); + +struct xfs_scrub_btree { + /* caller-provided scrub state */ + struct xfs_scrub_context *sc; + struct xfs_btree_cur *cur; + xfs_scrub_btree_rec_fn scrub_rec; + struct xfs_owner_info *oinfo; + void *private; + + /* internal scrub state */ + union xfs_btree_rec lastrec; + bool firstrec; + union xfs_btree_key lastkey[XFS_BTREE_MAXLEVELS]; + bool firstkey[XFS_BTREE_MAXLEVELS]; + struct list_head to_check; +}; +int xfs_scrub_btree(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, + xfs_scrub_btree_rec_fn scrub_fn, + struct xfs_owner_info *oinfo, void *private); + +#endif /* __XFS_SCRUB_BTREE_H__ */ diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c new file mode 100644 index 000000000000..ac95fe911d96 --- /dev/null +++ b/fs/xfs/scrub/common.c @@ -0,0 +1,574 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_itable.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_log.h" +#include "xfs_trans_priv.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/btree.h" + +/* Common code for the metadata scrubbers. */ + +/* + * Handling operational errors. + * + * The *_process_error() family of functions are used to process error return + * codes from functions called as part of a scrub operation. + * + * If there's no error, we return true to tell the caller that it's ok + * to move on to the next check in its list. + * + * For non-verifier errors (e.g. ENOMEM) we return false to tell the + * caller that something bad happened, and we preserve *error so that + * the caller can return the *error up the stack to userspace. + * + * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting + * OFLAG_CORRUPT in sm_flags and the *error is cleared. In other words, + * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT, + * not via return codes. We return false to tell the caller that + * something bad happened. Since the error has been cleared, the caller + * will (presumably) return that zero and scrubbing will move on to + * whatever's next. + * + * ftrace can be used to record the precise metadata location and the + * approximate code location of the failed operation. + */ + +/* Check for operational errors. */ +bool +xfs_scrub_process_error( + struct xfs_scrub_context *sc, + xfs_agnumber_t agno, + xfs_agblock_t bno, + int *error) +{ + switch (*error) { + case 0: + return true; + case -EDEADLOCK: + /* Used to restart an op with deadlock avoidance. */ + trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error); + break; + case -EFSBADCRC: + case -EFSCORRUPTED: + /* Note the badness but don't abort. */ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + *error = 0; + /* fall through */ + default: + trace_xfs_scrub_op_error(sc, agno, bno, *error, + __return_address); + break; + } + return false; +} + +/* Check for operational errors for a file offset. */ +bool +xfs_scrub_fblock_process_error( + struct xfs_scrub_context *sc, + int whichfork, + xfs_fileoff_t offset, + int *error) +{ + switch (*error) { + case 0: + return true; + case -EDEADLOCK: + /* Used to restart an op with deadlock avoidance. */ + trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error); + break; + case -EFSBADCRC: + case -EFSCORRUPTED: + /* Note the badness but don't abort. */ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + *error = 0; + /* fall through */ + default: + trace_xfs_scrub_file_op_error(sc, whichfork, offset, *error, + __return_address); + break; + } + return false; +} + +/* + * Handling scrub corruption/optimization/warning checks. + * + * The *_set_{corrupt,preen,warning}() family of functions are used to + * record the presence of metadata that is incorrect (corrupt), could be + * optimized somehow (preen), or should be flagged for administrative + * review but is not incorrect (warn). + * + * ftrace can be used to record the precise metadata location and + * approximate code location of the failed check. + */ + +/* Record a block which could be optimized. */ +void +xfs_scrub_block_set_preen( + struct xfs_scrub_context *sc, + struct xfs_buf *bp) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN; + trace_xfs_scrub_block_preen(sc, bp->b_bn, __return_address); +} + +/* + * Record an inode which could be optimized. The trace data will + * include the block given by bp if bp is given; otherwise it will use + * the block location of the inode record itself. + */ +void +xfs_scrub_ino_set_preen( + struct xfs_scrub_context *sc, + xfs_ino_t ino, + struct xfs_buf *bp) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN; + trace_xfs_scrub_ino_preen(sc, ino, bp ? bp->b_bn : 0, + __return_address); +} + +/* Record a corrupt block. */ +void +xfs_scrub_block_set_corrupt( + struct xfs_scrub_context *sc, + struct xfs_buf *bp) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xfs_scrub_block_error(sc, bp->b_bn, __return_address); +} + +/* + * Record a corrupt inode. The trace data will include the block given + * by bp if bp is given; otherwise it will use the block location of the + * inode record itself. + */ +void +xfs_scrub_ino_set_corrupt( + struct xfs_scrub_context *sc, + xfs_ino_t ino, + struct xfs_buf *bp) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xfs_scrub_ino_error(sc, ino, bp ? bp->b_bn : 0, __return_address); +} + +/* Record corruption in a block indexed by a file fork. */ +void +xfs_scrub_fblock_set_corrupt( + struct xfs_scrub_context *sc, + int whichfork, + xfs_fileoff_t offset) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xfs_scrub_fblock_error(sc, whichfork, offset, __return_address); +} + +/* + * Warn about inodes that need administrative review but is not + * incorrect. + */ +void +xfs_scrub_ino_set_warning( + struct xfs_scrub_context *sc, + xfs_ino_t ino, + struct xfs_buf *bp) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING; + trace_xfs_scrub_ino_warning(sc, ino, bp ? bp->b_bn : 0, + __return_address); +} + +/* Warn about a block indexed by a file fork that needs review. */ +void +xfs_scrub_fblock_set_warning( + struct xfs_scrub_context *sc, + int whichfork, + xfs_fileoff_t offset) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING; + trace_xfs_scrub_fblock_warning(sc, whichfork, offset, __return_address); +} + +/* Signal an incomplete scrub. */ +void +xfs_scrub_set_incomplete( + struct xfs_scrub_context *sc) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE; + trace_xfs_scrub_incomplete(sc, __return_address); +} + +/* + * AG scrubbing + * + * These helpers facilitate locking an allocation group's header + * buffers, setting up cursors for all btrees that are present, and + * cleaning everything up once we're through. + */ + +/* Decide if we want to return an AG header read failure. */ +static inline bool +want_ag_read_header_failure( + struct xfs_scrub_context *sc, + unsigned int type) +{ + /* Return all AG header read failures when scanning btrees. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF && + sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL && + sc->sm->sm_type != XFS_SCRUB_TYPE_AGI) + return true; + /* + * If we're scanning a given type of AG header, we only want to + * see read failures from that specific header. We'd like the + * other headers to cross-check them, but this isn't required. + */ + if (sc->sm->sm_type == type) + return true; + return false; +} + +/* + * Grab all the headers for an AG. + * + * The headers should be released by xfs_scrub_ag_free, but as a fail + * safe we attach all the buffers we grab to the scrub transaction so + * they'll all be freed when we cancel it. + */ +int +xfs_scrub_ag_read_headers( + struct xfs_scrub_context *sc, + xfs_agnumber_t agno, + struct xfs_buf **agi, + struct xfs_buf **agf, + struct xfs_buf **agfl) +{ + struct xfs_mount *mp = sc->mp; + int error; + + error = xfs_ialloc_read_agi(mp, sc->tp, agno, agi); + if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI)) + goto out; + + error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, agf); + if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF)) + goto out; + + error = xfs_alloc_read_agfl(mp, sc->tp, agno, agfl); + if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL)) + goto out; + +out: + return error; +} + +/* Release all the AG btree cursors. */ +void +xfs_scrub_ag_btcur_free( + struct xfs_scrub_ag *sa) +{ + if (sa->refc_cur) + xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR); + if (sa->rmap_cur) + xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR); + if (sa->fino_cur) + xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR); + if (sa->ino_cur) + xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR); + if (sa->cnt_cur) + xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR); + if (sa->bno_cur) + xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR); + + sa->refc_cur = NULL; + sa->rmap_cur = NULL; + sa->fino_cur = NULL; + sa->ino_cur = NULL; + sa->bno_cur = NULL; + sa->cnt_cur = NULL; +} + +/* Initialize all the btree cursors for an AG. */ +int +xfs_scrub_ag_btcur_init( + struct xfs_scrub_context *sc, + struct xfs_scrub_ag *sa) +{ + struct xfs_mount *mp = sc->mp; + xfs_agnumber_t agno = sa->agno; + + if (sa->agf_bp) { + /* Set up a bnobt cursor for cross-referencing. */ + sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, + agno, XFS_BTNUM_BNO); + if (!sa->bno_cur) + goto err; + + /* Set up a cntbt cursor for cross-referencing. */ + sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, + agno, XFS_BTNUM_CNT); + if (!sa->cnt_cur) + goto err; + } + + /* Set up a inobt cursor for cross-referencing. */ + if (sa->agi_bp) { + sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, + agno, XFS_BTNUM_INO); + if (!sa->ino_cur) + goto err; + } + + /* Set up a finobt cursor for cross-referencing. */ + if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb)) { + sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, + agno, XFS_BTNUM_FINO); + if (!sa->fino_cur) + goto err; + } + + /* Set up a rmapbt cursor for cross-referencing. */ + if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb)) { + sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, + agno); + if (!sa->rmap_cur) + goto err; + } + + /* Set up a refcountbt cursor for cross-referencing. */ + if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb)) { + sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, + sa->agf_bp, agno, NULL); + if (!sa->refc_cur) + goto err; + } + + return 0; +err: + return -ENOMEM; +} + +/* Release the AG header context and btree cursors. */ +void +xfs_scrub_ag_free( + struct xfs_scrub_context *sc, + struct xfs_scrub_ag *sa) +{ + xfs_scrub_ag_btcur_free(sa); + if (sa->agfl_bp) { + xfs_trans_brelse(sc->tp, sa->agfl_bp); + sa->agfl_bp = NULL; + } + if (sa->agf_bp) { + xfs_trans_brelse(sc->tp, sa->agf_bp); + sa->agf_bp = NULL; + } + if (sa->agi_bp) { + xfs_trans_brelse(sc->tp, sa->agi_bp); + sa->agi_bp = NULL; + } + sa->agno = NULLAGNUMBER; +} + +/* + * For scrub, grab the AGI and the AGF headers, in that order. Locking + * order requires us to get the AGI before the AGF. We use the + * transaction to avoid deadlocking on crosslinked metadata buffers; + * either the caller passes one in (bmap scrub) or we have to create a + * transaction ourselves. + */ +int +xfs_scrub_ag_init( + struct xfs_scrub_context *sc, + xfs_agnumber_t agno, + struct xfs_scrub_ag *sa) +{ + int error; + + sa->agno = agno; + error = xfs_scrub_ag_read_headers(sc, agno, &sa->agi_bp, + &sa->agf_bp, &sa->agfl_bp); + if (error) + return error; + + return xfs_scrub_ag_btcur_init(sc, sa); +} + +/* Per-scrubber setup functions */ + +/* Set us up with a transaction and an empty context. */ +int +xfs_scrub_setup_fs( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + return xfs_scrub_trans_alloc(sc->sm, sc->mp, &sc->tp); +} + +/* Set us up with AG headers and btree cursors. */ +int +xfs_scrub_setup_ag_btree( + struct xfs_scrub_context *sc, + struct xfs_inode *ip, + bool force_log) +{ + struct xfs_mount *mp = sc->mp; + int error; + + /* + * If the caller asks us to checkpont the log, do so. This + * expensive operation should be performed infrequently and only + * as a last resort. Any caller that sets force_log should + * document why they need to do so. + */ + if (force_log) { + error = xfs_scrub_checkpoint_log(mp); + if (error) + return error; + } + + error = xfs_scrub_setup_ag_header(sc, ip); + if (error) + return error; + + return xfs_scrub_ag_init(sc, sc->sm->sm_agno, &sc->sa); +} + +/* Push everything out of the log onto disk. */ +int +xfs_scrub_checkpoint_log( + struct xfs_mount *mp) +{ + int error; + + error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + if (error) + return error; + xfs_ail_push_all_sync(mp->m_ail); + return 0; +} + +/* + * Given an inode and the scrub control structure, grab either the + * inode referenced in the control structure or the inode passed in. + * The inode is not locked. + */ +int +xfs_scrub_get_inode( + struct xfs_scrub_context *sc, + struct xfs_inode *ip_in) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = NULL; + int error; + + /* + * If userspace passed us an AG number or a generation number + * without an inode number, they haven't got a clue so bail out + * immediately. + */ + if (sc->sm->sm_agno || (sc->sm->sm_gen && !sc->sm->sm_ino)) + return -EINVAL; + + /* We want to scan the inode we already had opened. */ + if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { + sc->ip = ip_in; + return 0; + } + + /* Look up the inode, see if the generation number matches. */ + if (xfs_internal_inum(mp, sc->sm->sm_ino)) + return -ENOENT; + error = xfs_iget(mp, NULL, sc->sm->sm_ino, + XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, &ip); + if (error == -ENOENT || error == -EINVAL) { + /* inode doesn't exist... */ + return -ENOENT; + } else if (error) { + trace_xfs_scrub_op_error(sc, + XFS_INO_TO_AGNO(mp, sc->sm->sm_ino), + XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), + error, __return_address); + return error; + } + if (VFS_I(ip)->i_generation != sc->sm->sm_gen) { + iput(VFS_I(ip)); + return -ENOENT; + } + + sc->ip = ip; + return 0; +} + +/* Set us up to scrub a file's contents. */ +int +xfs_scrub_setup_inode_contents( + struct xfs_scrub_context *sc, + struct xfs_inode *ip, + unsigned int resblks) +{ + struct xfs_mount *mp = sc->mp; + int error; + + error = xfs_scrub_get_inode(sc, ip); + if (error) + return error; + + /* Got the inode, lock it and we're ready to go. */ + sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + xfs_ilock(sc->ip, sc->ilock_flags); + error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); + if (error) + goto out; + sc->ilock_flags |= XFS_ILOCK_EXCL; + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + +out: + /* scrub teardown will unlock and release the inode for us */ + return error; +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h new file mode 100644 index 000000000000..5c043855570e --- /dev/null +++ b/fs/xfs/scrub/common.h @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_SCRUB_COMMON_H__ +#define __XFS_SCRUB_COMMON_H__ + +/* + * We /could/ terminate a scrub/repair operation early. If we're not + * in a good place to continue (fatal signal, etc.) then bail out. + * Note that we're careful not to make any judgements about *error. + */ +static inline bool +xfs_scrub_should_terminate( + struct xfs_scrub_context *sc, + int *error) +{ + if (fatal_signal_pending(current)) { + if (*error == 0) + *error = -EAGAIN; + return true; + } + return false; +} + +/* + * Grab an empty transaction so that we can re-grab locked buffers if + * one of our btrees turns out to be cyclic. + */ +static inline int +xfs_scrub_trans_alloc( + struct xfs_scrub_metadata *sm, + struct xfs_mount *mp, + struct xfs_trans **tpp) +{ + return xfs_trans_alloc_empty(mp, tpp); +} + +bool xfs_scrub_process_error(struct xfs_scrub_context *sc, xfs_agnumber_t agno, + xfs_agblock_t bno, int *error); +bool xfs_scrub_fblock_process_error(struct xfs_scrub_context *sc, int whichfork, + xfs_fileoff_t offset, int *error); + +void xfs_scrub_block_set_preen(struct xfs_scrub_context *sc, + struct xfs_buf *bp); +void xfs_scrub_ino_set_preen(struct xfs_scrub_context *sc, xfs_ino_t ino, + struct xfs_buf *bp); + +void xfs_scrub_block_set_corrupt(struct xfs_scrub_context *sc, + struct xfs_buf *bp); +void xfs_scrub_ino_set_corrupt(struct xfs_scrub_context *sc, xfs_ino_t ino, + struct xfs_buf *bp); +void xfs_scrub_fblock_set_corrupt(struct xfs_scrub_context *sc, int whichfork, + xfs_fileoff_t offset); + +void xfs_scrub_ino_set_warning(struct xfs_scrub_context *sc, xfs_ino_t ino, + struct xfs_buf *bp); +void xfs_scrub_fblock_set_warning(struct xfs_scrub_context *sc, int whichfork, + xfs_fileoff_t offset); + +void xfs_scrub_set_incomplete(struct xfs_scrub_context *sc); +int xfs_scrub_checkpoint_log(struct xfs_mount *mp); + +/* Setup functions */ +int xfs_scrub_setup_fs(struct xfs_scrub_context *sc, struct xfs_inode *ip); +int xfs_scrub_setup_ag_header(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_ag_allocbt(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_ag_iallocbt(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_ag_rmapbt(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_ag_refcountbt(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_inode(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_inode_bmap(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_inode_bmap_data(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_directory(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_xattr(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_symlink(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +int xfs_scrub_setup_parent(struct xfs_scrub_context *sc, + struct xfs_inode *ip); +#ifdef CONFIG_XFS_RT +int xfs_scrub_setup_rt(struct xfs_scrub_context *sc, struct xfs_inode *ip); +#else +static inline int +xfs_scrub_setup_rt(struct xfs_scrub_context *sc, struct xfs_inode *ip) +{ + return -ENOENT; +} +#endif +#ifdef CONFIG_XFS_QUOTA +int xfs_scrub_setup_quota(struct xfs_scrub_context *sc, struct xfs_inode *ip); +#else +static inline int +xfs_scrub_setup_quota(struct xfs_scrub_context *sc, struct xfs_inode *ip) +{ + return -ENOENT; +} +#endif + +void xfs_scrub_ag_free(struct xfs_scrub_context *sc, struct xfs_scrub_ag *sa); +int xfs_scrub_ag_init(struct xfs_scrub_context *sc, xfs_agnumber_t agno, + struct xfs_scrub_ag *sa); +int xfs_scrub_ag_read_headers(struct xfs_scrub_context *sc, xfs_agnumber_t agno, + struct xfs_buf **agi, struct xfs_buf **agf, + struct xfs_buf **agfl); +void xfs_scrub_ag_btcur_free(struct xfs_scrub_ag *sa); +int xfs_scrub_ag_btcur_init(struct xfs_scrub_context *sc, + struct xfs_scrub_ag *sa); +int xfs_scrub_walk_agfl(struct xfs_scrub_context *sc, + int (*fn)(struct xfs_scrub_context *, xfs_agblock_t bno, + void *), + void *priv); + +int xfs_scrub_setup_ag_btree(struct xfs_scrub_context *sc, + struct xfs_inode *ip, bool force_log); +int xfs_scrub_get_inode(struct xfs_scrub_context *sc, struct xfs_inode *ip_in); +int xfs_scrub_setup_inode_contents(struct xfs_scrub_context *sc, + struct xfs_inode *ip, unsigned int resblks); + +#endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c new file mode 100644 index 000000000000..d94edd93cba8 --- /dev/null +++ b/fs/xfs/scrub/dabtree.c @@ -0,0 +1,591 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_attr_leaf.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/dabtree.h" + +/* Directory/Attribute Btree */ + +/* + * Check for da btree operation errors. See the section about handling + * operational errors in common.c. + */ +bool +xfs_scrub_da_process_error( + struct xfs_scrub_da_btree *ds, + int level, + int *error) +{ + struct xfs_scrub_context *sc = ds->sc; + + if (*error == 0) + return true; + + switch (*error) { + case -EDEADLOCK: + /* Used to restart an op with deadlock avoidance. */ + trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error); + break; + case -EFSBADCRC: + case -EFSCORRUPTED: + /* Note the badness but don't abort. */ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + *error = 0; + /* fall through */ + default: + trace_xfs_scrub_file_op_error(sc, ds->dargs.whichfork, + xfs_dir2_da_to_db(ds->dargs.geo, + ds->state->path.blk[level].blkno), + *error, __return_address); + break; + } + return false; +} + +/* + * Check for da btree corruption. See the section about handling + * operational errors in common.c. + */ +void +xfs_scrub_da_set_corrupt( + struct xfs_scrub_da_btree *ds, + int level) +{ + struct xfs_scrub_context *sc = ds->sc; + + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + + trace_xfs_scrub_fblock_error(sc, ds->dargs.whichfork, + xfs_dir2_da_to_db(ds->dargs.geo, + ds->state->path.blk[level].blkno), + __return_address); +} + +/* Find an entry at a certain level in a da btree. */ +STATIC void * +xfs_scrub_da_btree_entry( + struct xfs_scrub_da_btree *ds, + int level, + int rec) +{ + char *ents; + struct xfs_da_state_blk *blk; + void *baddr; + + /* Dispatch the entry finding function. */ + blk = &ds->state->path.blk[level]; + baddr = blk->bp->b_addr; + switch (blk->magic) { + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + ents = (char *)xfs_attr3_leaf_entryp(baddr); + return ents + (rec * sizeof(struct xfs_attr_leaf_entry)); + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + ents = (char *)ds->dargs.dp->d_ops->leaf_ents_p(baddr); + return ents + (rec * sizeof(struct xfs_dir2_leaf_entry)); + case XFS_DIR2_LEAF1_MAGIC: + case XFS_DIR3_LEAF1_MAGIC: + ents = (char *)ds->dargs.dp->d_ops->leaf_ents_p(baddr); + return ents + (rec * sizeof(struct xfs_dir2_leaf_entry)); + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + ents = (char *)ds->dargs.dp->d_ops->node_tree_p(baddr); + return ents + (rec * sizeof(struct xfs_da_node_entry)); + } + + return NULL; +} + +/* Scrub a da btree hash (key). */ +int +xfs_scrub_da_btree_hash( + struct xfs_scrub_da_btree *ds, + int level, + __be32 *hashp) +{ + struct xfs_da_state_blk *blks; + struct xfs_da_node_entry *entry; + xfs_dahash_t hash; + xfs_dahash_t parent_hash; + + /* Is this hash in order? */ + hash = be32_to_cpu(*hashp); + if (hash < ds->hashes[level]) + xfs_scrub_da_set_corrupt(ds, level); + ds->hashes[level] = hash; + + if (level == 0) + return 0; + + /* Is this hash no larger than the parent hash? */ + blks = ds->state->path.blk; + entry = xfs_scrub_da_btree_entry(ds, level - 1, blks[level - 1].index); + parent_hash = be32_to_cpu(entry->hashval); + if (parent_hash < hash) + xfs_scrub_da_set_corrupt(ds, level); + + return 0; +} + +/* + * Check a da btree pointer. Returns true if it's ok to use this + * pointer. + */ +STATIC bool +xfs_scrub_da_btree_ptr_ok( + struct xfs_scrub_da_btree *ds, + int level, + xfs_dablk_t blkno) +{ + if (blkno < ds->lowest || (ds->highest != 0 && blkno >= ds->highest)) { + xfs_scrub_da_set_corrupt(ds, level); + return false; + } + + return true; +} + +/* + * The da btree scrubber can handle leaf1 blocks as a degenerate + * form of leafn blocks. Since the regular da code doesn't handle + * leaf1, we must multiplex the verifiers. + */ +static void +xfs_scrub_da_btree_read_verify( + struct xfs_buf *bp) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + + switch (be16_to_cpu(info->magic)) { + case XFS_DIR2_LEAF1_MAGIC: + case XFS_DIR3_LEAF1_MAGIC: + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + bp->b_ops->verify_read(bp); + return; + default: + /* + * xfs_da3_node_buf_ops already know how to handle + * DA*_NODE, ATTR*_LEAF, and DIR*_LEAFN blocks. + */ + bp->b_ops = &xfs_da3_node_buf_ops; + bp->b_ops->verify_read(bp); + return; + } +} +static void +xfs_scrub_da_btree_write_verify( + struct xfs_buf *bp) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + + switch (be16_to_cpu(info->magic)) { + case XFS_DIR2_LEAF1_MAGIC: + case XFS_DIR3_LEAF1_MAGIC: + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + bp->b_ops->verify_write(bp); + return; + default: + /* + * xfs_da3_node_buf_ops already know how to handle + * DA*_NODE, ATTR*_LEAF, and DIR*_LEAFN blocks. + */ + bp->b_ops = &xfs_da3_node_buf_ops; + bp->b_ops->verify_write(bp); + return; + } +} + +static const struct xfs_buf_ops xfs_scrub_da_btree_buf_ops = { + .name = "xfs_scrub_da_btree", + .verify_read = xfs_scrub_da_btree_read_verify, + .verify_write = xfs_scrub_da_btree_write_verify, +}; + +/* Check a block's sibling. */ +STATIC int +xfs_scrub_da_btree_block_check_sibling( + struct xfs_scrub_da_btree *ds, + int level, + int direction, + xfs_dablk_t sibling) +{ + int retval; + int error; + + memcpy(&ds->state->altpath, &ds->state->path, + sizeof(ds->state->altpath)); + + /* + * If the pointer is null, we shouldn't be able to move the upper + * level pointer anywhere. + */ + if (sibling == 0) { + error = xfs_da3_path_shift(ds->state, &ds->state->altpath, + direction, false, &retval); + if (error == 0 && retval == 0) + xfs_scrub_da_set_corrupt(ds, level); + error = 0; + goto out; + } + + /* Move the alternate cursor one block in the direction given. */ + error = xfs_da3_path_shift(ds->state, &ds->state->altpath, + direction, false, &retval); + if (!xfs_scrub_da_process_error(ds, level, &error)) + return error; + if (retval) { + xfs_scrub_da_set_corrupt(ds, level); + return error; + } + + /* Compare upper level pointer to sibling pointer. */ + if (ds->state->altpath.blk[level].blkno != sibling) + xfs_scrub_da_set_corrupt(ds, level); + xfs_trans_brelse(ds->dargs.trans, ds->state->altpath.blk[level].bp); +out: + return error; +} + +/* Check a block's sibling pointers. */ +STATIC int +xfs_scrub_da_btree_block_check_siblings( + struct xfs_scrub_da_btree *ds, + int level, + struct xfs_da_blkinfo *hdr) +{ + xfs_dablk_t forw; + xfs_dablk_t back; + int error = 0; + + forw = be32_to_cpu(hdr->forw); + back = be32_to_cpu(hdr->back); + + /* Top level blocks should not have sibling pointers. */ + if (level == 0) { + if (forw != 0 || back != 0) + xfs_scrub_da_set_corrupt(ds, level); + return 0; + } + + /* + * Check back (left) and forw (right) pointers. These functions + * absorb error codes for us. + */ + error = xfs_scrub_da_btree_block_check_sibling(ds, level, 0, back); + if (error) + goto out; + error = xfs_scrub_da_btree_block_check_sibling(ds, level, 1, forw); + +out: + memset(&ds->state->altpath, 0, sizeof(ds->state->altpath)); + return error; +} + +/* Load a dir/attribute block from a btree. */ +STATIC int +xfs_scrub_da_btree_block( + struct xfs_scrub_da_btree *ds, + int level, + xfs_dablk_t blkno) +{ + struct xfs_da_state_blk *blk; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_da3_blkinfo *hdr3; + struct xfs_da_args *dargs = &ds->dargs; + struct xfs_inode *ip = ds->dargs.dp; + xfs_ino_t owner; + int *pmaxrecs; + struct xfs_da3_icnode_hdr nodehdr; + int error = 0; + + blk = &ds->state->path.blk[level]; + ds->state->path.active = level + 1; + + /* Release old block. */ + if (blk->bp) { + xfs_trans_brelse(dargs->trans, blk->bp); + blk->bp = NULL; + } + + /* Check the pointer. */ + blk->blkno = blkno; + if (!xfs_scrub_da_btree_ptr_ok(ds, level, blkno)) + goto out_nobuf; + + /* Read the buffer. */ + error = xfs_da_read_buf(dargs->trans, dargs->dp, blk->blkno, -2, + &blk->bp, dargs->whichfork, + &xfs_scrub_da_btree_buf_ops); + if (!xfs_scrub_da_process_error(ds, level, &error)) + goto out_nobuf; + + /* + * We didn't find a dir btree root block, which means that + * there's no LEAF1/LEAFN tree (at least not where it's supposed + * to be), so jump out now. + */ + if (ds->dargs.whichfork == XFS_DATA_FORK && level == 0 && + blk->bp == NULL) + goto out_nobuf; + + /* It's /not/ ok for attr trees not to have a da btree. */ + if (blk->bp == NULL) { + xfs_scrub_da_set_corrupt(ds, level); + goto out_nobuf; + } + + hdr3 = blk->bp->b_addr; + blk->magic = be16_to_cpu(hdr3->hdr.magic); + pmaxrecs = &ds->maxrecs[level]; + + /* We only started zeroing the header on v5 filesystems. */ + if (xfs_sb_version_hascrc(&ds->sc->mp->m_sb) && hdr3->hdr.pad) + xfs_scrub_da_set_corrupt(ds, level); + + /* Check the owner. */ + if (xfs_sb_version_hascrc(&ip->i_mount->m_sb)) { + owner = be64_to_cpu(hdr3->owner); + if (owner != ip->i_ino) + xfs_scrub_da_set_corrupt(ds, level); + } + + /* Check the siblings. */ + error = xfs_scrub_da_btree_block_check_siblings(ds, level, &hdr3->hdr); + if (error) + goto out; + + /* Interpret the buffer. */ + switch (blk->magic) { + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + xfs_trans_buf_set_type(dargs->trans, blk->bp, + XFS_BLFT_ATTR_LEAF_BUF); + blk->magic = XFS_ATTR_LEAF_MAGIC; + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, pmaxrecs); + if (ds->tree_level != 0) + xfs_scrub_da_set_corrupt(ds, level); + break; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + xfs_trans_buf_set_type(dargs->trans, blk->bp, + XFS_BLFT_DIR_LEAFN_BUF); + blk->magic = XFS_DIR2_LEAFN_MAGIC; + blk->hashval = xfs_dir2_leaf_lasthash(ip, blk->bp, pmaxrecs); + if (ds->tree_level != 0) + xfs_scrub_da_set_corrupt(ds, level); + break; + case XFS_DIR2_LEAF1_MAGIC: + case XFS_DIR3_LEAF1_MAGIC: + xfs_trans_buf_set_type(dargs->trans, blk->bp, + XFS_BLFT_DIR_LEAF1_BUF); + blk->magic = XFS_DIR2_LEAF1_MAGIC; + blk->hashval = xfs_dir2_leaf_lasthash(ip, blk->bp, pmaxrecs); + if (ds->tree_level != 0) + xfs_scrub_da_set_corrupt(ds, level); + break; + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + xfs_trans_buf_set_type(dargs->trans, blk->bp, + XFS_BLFT_DA_NODE_BUF); + blk->magic = XFS_DA_NODE_MAGIC; + node = blk->bp->b_addr; + ip->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = ip->d_ops->node_tree_p(node); + *pmaxrecs = nodehdr.count; + blk->hashval = be32_to_cpu(btree[*pmaxrecs - 1].hashval); + if (level == 0) { + if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) { + xfs_scrub_da_set_corrupt(ds, level); + goto out_freebp; + } + ds->tree_level = nodehdr.level; + } else { + if (ds->tree_level != nodehdr.level) { + xfs_scrub_da_set_corrupt(ds, level); + goto out_freebp; + } + } + + /* XXX: Check hdr3.pad32 once we know how to fix it. */ + break; + default: + xfs_scrub_da_set_corrupt(ds, level); + goto out_freebp; + } + +out: + return error; +out_freebp: + xfs_trans_brelse(dargs->trans, blk->bp); + blk->bp = NULL; +out_nobuf: + blk->blkno = 0; + return error; +} + +/* Visit all nodes and leaves of a da btree. */ +int +xfs_scrub_da_btree( + struct xfs_scrub_context *sc, + int whichfork, + xfs_scrub_da_btree_rec_fn scrub_fn, + void *private) +{ + struct xfs_scrub_da_btree ds = {}; + struct xfs_mount *mp = sc->mp; + struct xfs_da_state_blk *blks; + struct xfs_da_node_entry *key; + void *rec; + xfs_dablk_t blkno; + int level; + int error; + + /* Skip short format data structures; no btree to scan. */ + if (XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_BTREE) + return 0; + + /* Set up initial da state. */ + ds.dargs.dp = sc->ip; + ds.dargs.whichfork = whichfork; + ds.dargs.trans = sc->tp; + ds.dargs.op_flags = XFS_DA_OP_OKNOENT; + ds.state = xfs_da_state_alloc(); + ds.state->args = &ds.dargs; + ds.state->mp = mp; + ds.sc = sc; + ds.private = private; + if (whichfork == XFS_ATTR_FORK) { + ds.dargs.geo = mp->m_attr_geo; + ds.lowest = 0; + ds.highest = 0; + } else { + ds.dargs.geo = mp->m_dir_geo; + ds.lowest = ds.dargs.geo->leafblk; + ds.highest = ds.dargs.geo->freeblk; + } + blkno = ds.lowest; + level = 0; + + /* Find the root of the da tree, if present. */ + blks = ds.state->path.blk; + error = xfs_scrub_da_btree_block(&ds, level, blkno); + if (error) + goto out_state; + /* + * We didn't find a block at ds.lowest, which means that there's + * no LEAF1/LEAFN tree (at least not where it's supposed to be), + * so jump out now. + */ + if (blks[level].bp == NULL) + goto out_state; + + blks[level].index = 0; + while (level >= 0 && level < XFS_DA_NODE_MAXDEPTH) { + /* Handle leaf block. */ + if (blks[level].magic != XFS_DA_NODE_MAGIC) { + /* End of leaf, pop back towards the root. */ + if (blks[level].index >= ds.maxrecs[level]) { + if (level > 0) + blks[level - 1].index++; + ds.tree_level++; + level--; + continue; + } + + /* Dispatch record scrubbing. */ + rec = xfs_scrub_da_btree_entry(&ds, level, + blks[level].index); + error = scrub_fn(&ds, level, rec); + if (error) + break; + if (xfs_scrub_should_terminate(sc, &error) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + break; + + blks[level].index++; + continue; + } + + + /* End of node, pop back towards the root. */ + if (blks[level].index >= ds.maxrecs[level]) { + if (level > 0) + blks[level - 1].index++; + ds.tree_level++; + level--; + continue; + } + + /* Hashes in order for scrub? */ + key = xfs_scrub_da_btree_entry(&ds, level, blks[level].index); + error = xfs_scrub_da_btree_hash(&ds, level, &key->hashval); + if (error) + goto out; + + /* Drill another level deeper. */ + blkno = be32_to_cpu(key->before); + level++; + ds.tree_level--; + error = xfs_scrub_da_btree_block(&ds, level, blkno); + if (error) + goto out; + if (blks[level].bp == NULL) + goto out; + + blks[level].index = 0; + } + +out: + /* Release all the buffers we're tracking. */ + for (level = 0; level < XFS_DA_NODE_MAXDEPTH; level++) { + if (blks[level].bp == NULL) + continue; + xfs_trans_brelse(sc->tp, blks[level].bp); + blks[level].bp = NULL; + } + +out_state: + xfs_da_state_free(ds.state); + return error; +} diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h new file mode 100644 index 000000000000..d31468d68cef --- /dev/null +++ b/fs/xfs/scrub/dabtree.h @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_SCRUB_DABTREE_H__ +#define __XFS_SCRUB_DABTREE_H__ + +/* dir/attr btree */ + +struct xfs_scrub_da_btree { + struct xfs_da_args dargs; + xfs_dahash_t hashes[XFS_DA_NODE_MAXDEPTH]; + int maxrecs[XFS_DA_NODE_MAXDEPTH]; + struct xfs_da_state *state; + struct xfs_scrub_context *sc; + void *private; + + /* + * Lowest and highest directory block address in which we expect + * to find dir/attr btree node blocks. For a directory this + * (presumably) means between LEAF_OFFSET and FREE_OFFSET; for + * attributes there is no limit. + */ + xfs_dablk_t lowest; + xfs_dablk_t highest; + + int tree_level; +}; + +typedef int (*xfs_scrub_da_btree_rec_fn)(struct xfs_scrub_da_btree *ds, + int level, void *rec); + +/* Check for da btree operation errors. */ +bool xfs_scrub_da_process_error(struct xfs_scrub_da_btree *ds, int level, int *error); + +/* Check for da btree corruption. */ +void xfs_scrub_da_set_corrupt(struct xfs_scrub_da_btree *ds, int level); + +int xfs_scrub_da_btree_hash(struct xfs_scrub_da_btree *ds, int level, + __be32 *hashp); +int xfs_scrub_da_btree(struct xfs_scrub_context *sc, int whichfork, + xfs_scrub_da_btree_rec_fn scrub_fn, void *private); + +#endif /* __XFS_SCRUB_DABTREE_H__ */ diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c new file mode 100644 index 000000000000..69e1efdd4019 --- /dev/null +++ b/fs/xfs/scrub/dir.c @@ -0,0 +1,816 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_itable.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_ialloc.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/dabtree.h" + +/* Set us up to scrub directories. */ +int +xfs_scrub_setup_directory( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + return xfs_scrub_setup_inode_contents(sc, ip, 0); +} + +/* Directories */ + +/* Scrub a directory entry. */ + +struct xfs_scrub_dir_ctx { + /* VFS fill-directory iterator */ + struct dir_context dir_iter; + + struct xfs_scrub_context *sc; +}; + +/* Check that an inode's mode matches a given DT_ type. */ +STATIC int +xfs_scrub_dir_check_ftype( + struct xfs_scrub_dir_ctx *sdc, + xfs_fileoff_t offset, + xfs_ino_t inum, + int dtype) +{ + struct xfs_mount *mp = sdc->sc->mp; + struct xfs_inode *ip; + int ino_dtype; + int error = 0; + + if (!xfs_sb_version_hasftype(&mp->m_sb)) { + if (dtype != DT_UNKNOWN && dtype != DT_DIR) + xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, + offset); + goto out; + } + + /* + * Grab the inode pointed to by the dirent. We release the + * inode before we cancel the scrub transaction. Since we're + * don't know a priori that releasing the inode won't trigger + * eofblocks cleanup (which allocates what would be a nested + * transaction), we can't use DONTCACHE here because DONTCACHE + * inodes can trigger immediate inactive cleanup of the inode. + */ + error = xfs_iget(mp, sdc->sc->tp, inum, 0, 0, &ip); + if (!xfs_scrub_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset, + &error)) + goto out; + + /* Convert mode to the DT_* values that dir_emit uses. */ + ino_dtype = xfs_dir3_get_dtype(mp, + xfs_mode_to_ftype(VFS_I(ip)->i_mode)); + if (ino_dtype != dtype) + xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); + iput(VFS_I(ip)); +out: + return error; +} + +/* + * Scrub a single directory entry. + * + * We use the VFS directory iterator (i.e. readdir) to call this + * function for every directory entry in a directory. Once we're here, + * we check the inode number to make sure it's sane, then we check that + * we can look up this filename. Finally, we check the ftype. + */ +STATIC int +xfs_scrub_dir_actor( + struct dir_context *dir_iter, + const char *name, + int namelen, + loff_t pos, + u64 ino, + unsigned type) +{ + struct xfs_mount *mp; + struct xfs_inode *ip; + struct xfs_scrub_dir_ctx *sdc; + struct xfs_name xname; + xfs_ino_t lookup_ino; + xfs_dablk_t offset; + int error = 0; + + sdc = container_of(dir_iter, struct xfs_scrub_dir_ctx, dir_iter); + ip = sdc->sc->ip; + mp = ip->i_mount; + offset = xfs_dir2_db_to_da(mp->m_dir_geo, + xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos)); + + /* Does this inode number make sense? */ + if (!xfs_verify_dir_ino(mp, ino)) { + xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); + goto out; + } + + if (!strncmp(".", name, namelen)) { + /* If this is "." then check that the inum matches the dir. */ + if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR) + xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, + offset); + if (ino != ip->i_ino) + xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, + offset); + } else if (!strncmp("..", name, namelen)) { + /* + * If this is ".." in the root inode, check that the inum + * matches this dir. + */ + if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR) + xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, + offset); + if (ip->i_ino == mp->m_sb.sb_rootino && ino != ip->i_ino) + xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, + offset); + } + + /* Verify that we can look up this name by hash. */ + xname.name = name; + xname.len = namelen; + xname.type = XFS_DIR3_FT_UNKNOWN; + + error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL); + if (!xfs_scrub_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset, + &error)) + goto fail_xref; + if (lookup_ino != ino) { + xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); + goto out; + } + + /* Verify the file type. This function absorbs error codes. */ + error = xfs_scrub_dir_check_ftype(sdc, offset, lookup_ino, type); + if (error) + goto out; +out: + return error; +fail_xref: + return error; +} + +/* Scrub a directory btree record. */ +STATIC int +xfs_scrub_dir_rec( + struct xfs_scrub_da_btree *ds, + int level, + void *rec) +{ + struct xfs_mount *mp = ds->state->mp; + struct xfs_dir2_leaf_entry *ent = rec; + struct xfs_inode *dp = ds->dargs.dp; + struct xfs_dir2_data_entry *dent; + struct xfs_buf *bp; + xfs_ino_t ino; + xfs_dablk_t rec_bno; + xfs_dir2_db_t db; + xfs_dir2_data_aoff_t off; + xfs_dir2_dataptr_t ptr; + xfs_dahash_t calc_hash; + xfs_dahash_t hash; + unsigned int tag; + int error; + + /* Check the hash of the entry. */ + error = xfs_scrub_da_btree_hash(ds, level, &ent->hashval); + if (error) + goto out; + + /* Valid hash pointer? */ + ptr = be32_to_cpu(ent->address); + if (ptr == 0) + return 0; + + /* Find the directory entry's location. */ + db = xfs_dir2_dataptr_to_db(mp->m_dir_geo, ptr); + off = xfs_dir2_dataptr_to_off(mp->m_dir_geo, ptr); + rec_bno = xfs_dir2_db_to_da(mp->m_dir_geo, db); + + if (rec_bno >= mp->m_dir_geo->leafblk) { + xfs_scrub_da_set_corrupt(ds, level); + goto out; + } + error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno, -2, &bp); + if (!xfs_scrub_fblock_process_error(ds->sc, XFS_DATA_FORK, rec_bno, + &error)) + goto out; + if (!bp) { + xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + goto out; + } + + /* Retrieve the entry, sanity check it, and compare hashes. */ + dent = (struct xfs_dir2_data_entry *)(((char *)bp->b_addr) + off); + ino = be64_to_cpu(dent->inumber); + hash = be32_to_cpu(ent->hashval); + tag = be16_to_cpup(dp->d_ops->data_entry_tag_p(dent)); + if (!xfs_verify_dir_ino(mp, ino) || tag != off) + xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + if (dent->namelen == 0) { + xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + goto out_relse; + } + calc_hash = xfs_da_hashname(dent->name, dent->namelen); + if (calc_hash != hash) + xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); + +out_relse: + xfs_trans_brelse(ds->dargs.trans, bp); +out: + return error; +} + +/* + * Is this unused entry either in the bestfree or smaller than all of + * them? We've already checked that the bestfrees are sorted longest to + * shortest, and that there aren't any bogus entries. + */ +STATIC void +xfs_scrub_directory_check_free_entry( + struct xfs_scrub_context *sc, + xfs_dablk_t lblk, + struct xfs_dir2_data_free *bf, + struct xfs_dir2_data_unused *dup) +{ + struct xfs_dir2_data_free *dfp; + unsigned int dup_length; + + dup_length = be16_to_cpu(dup->length); + + /* Unused entry is shorter than any of the bestfrees */ + if (dup_length < be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length)) + return; + + for (dfp = &bf[XFS_DIR2_DATA_FD_COUNT - 1]; dfp >= bf; dfp--) + if (dup_length == be16_to_cpu(dfp->length)) + return; + + /* Unused entry should be in the bestfrees but wasn't found. */ + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); +} + +/* Check free space info in a directory data block. */ +STATIC int +xfs_scrub_directory_data_bestfree( + struct xfs_scrub_context *sc, + xfs_dablk_t lblk, + bool is_block) +{ + struct xfs_dir2_data_unused *dup; + struct xfs_dir2_data_free *dfp; + struct xfs_buf *bp; + struct xfs_dir2_data_free *bf; + struct xfs_mount *mp = sc->mp; + const struct xfs_dir_ops *d_ops; + char *ptr; + char *endptr; + u16 tag; + unsigned int nr_bestfrees = 0; + unsigned int nr_frees = 0; + unsigned int smallest_bestfree; + int newlen; + int offset; + int error; + + d_ops = sc->ip->d_ops; + + if (is_block) { + /* dir block format */ + if (lblk != XFS_B_TO_FSBT(mp, XFS_DIR2_DATA_OFFSET)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + error = xfs_dir3_block_read(sc->tp, sc->ip, &bp); + } else { + /* dir data format */ + error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, -1, &bp); + } + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) + goto out; + + /* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */ + + /* Do the bestfrees correspond to actual free space? */ + bf = d_ops->data_bestfree_p(bp->b_addr); + smallest_bestfree = UINT_MAX; + for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) { + offset = be16_to_cpu(dfp->offset); + if (offset == 0) + continue; + if (offset >= mp->m_dir_geo->blksize) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out_buf; + } + dup = (struct xfs_dir2_data_unused *)(bp->b_addr + offset); + tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)); + + /* bestfree doesn't match the entry it points at? */ + if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG) || + be16_to_cpu(dup->length) != be16_to_cpu(dfp->length) || + tag != ((char *)dup - (char *)bp->b_addr)) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out_buf; + } + + /* bestfree records should be ordered largest to smallest */ + if (smallest_bestfree < be16_to_cpu(dfp->length)) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out_buf; + } + + smallest_bestfree = be16_to_cpu(dfp->length); + nr_bestfrees++; + } + + /* Make sure the bestfrees are actually the best free spaces. */ + ptr = (char *)d_ops->data_entry_p(bp->b_addr); + if (is_block) { + struct xfs_dir2_block_tail *btp; + + btp = xfs_dir2_block_tail_p(mp->m_dir_geo, bp->b_addr); + endptr = (char *)xfs_dir2_block_leaf_p(btp); + } else + endptr = (char *)bp->b_addr + BBTOB(bp->b_length); + + /* Iterate the entries, stopping when we hit or go past the end. */ + while (ptr < endptr) { + dup = (struct xfs_dir2_data_unused *)ptr; + /* Skip real entries */ + if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG)) { + struct xfs_dir2_data_entry *dep; + + dep = (struct xfs_dir2_data_entry *)ptr; + newlen = d_ops->data_entsize(dep->namelen); + if (newlen <= 0) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, + lblk); + goto out_buf; + } + ptr += newlen; + continue; + } + + /* Spot check this free entry */ + tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)); + if (tag != ((char *)dup - (char *)bp->b_addr)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + + /* + * Either this entry is a bestfree or it's smaller than + * any of the bestfrees. + */ + xfs_scrub_directory_check_free_entry(sc, lblk, bf, dup); + + /* Move on. */ + newlen = be16_to_cpu(dup->length); + if (newlen <= 0) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out_buf; + } + ptr += newlen; + if (ptr <= endptr) + nr_frees++; + } + + /* We're required to fill all the space. */ + if (ptr != endptr) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + + /* Did we see at least as many free slots as there are bestfrees? */ + if (nr_frees < nr_bestfrees) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); +out_buf: + xfs_trans_brelse(sc->tp, bp); +out: + return error; +} + +/* + * Does the free space length in the free space index block ($len) match + * the longest length in the directory data block's bestfree array? + * Assume that we've already checked that the data block's bestfree + * array is in order. + */ +STATIC void +xfs_scrub_directory_check_freesp( + struct xfs_scrub_context *sc, + xfs_dablk_t lblk, + struct xfs_buf *dbp, + unsigned int len) +{ + struct xfs_dir2_data_free *dfp; + + dfp = sc->ip->d_ops->data_bestfree_p(dbp->b_addr); + + if (len != be16_to_cpu(dfp->length)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + + if (len > 0 && be16_to_cpu(dfp->offset) == 0) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); +} + +/* Check free space info in a directory leaf1 block. */ +STATIC int +xfs_scrub_directory_leaf1_bestfree( + struct xfs_scrub_context *sc, + struct xfs_da_args *args, + xfs_dablk_t lblk) +{ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir2_leaf_tail *ltp; + struct xfs_dir2_leaf *leaf; + struct xfs_buf *dbp; + struct xfs_buf *bp; + const struct xfs_dir_ops *d_ops = sc->ip->d_ops; + struct xfs_da_geometry *geo = sc->mp->m_dir_geo; + __be16 *bestp; + __u16 best; + __u32 hash; + __u32 lasthash = 0; + __u32 bestcount; + unsigned int stale = 0; + int i; + int error; + + /* Read the free space block. */ + error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, -1, &bp); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) + goto out; + + leaf = bp->b_addr; + d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = d_ops->leaf_ents_p(leaf); + ltp = xfs_dir2_leaf_tail_p(geo, leaf); + bestcount = be32_to_cpu(ltp->bestcount); + bestp = xfs_dir2_leaf_bests_p(ltp); + + if (xfs_sb_version_hascrc(&sc->mp->m_sb)) { + struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; + + if (hdr3->pad != cpu_to_be32(0)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + } + + /* + * There should be as many bestfree slots as there are dir data + * blocks that can fit under i_size. + */ + if (bestcount != xfs_dir2_byte_to_db(geo, sc->ip->i_d.di_size)) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out; + } + + /* Is the leaf count even remotely sane? */ + if (leafhdr.count > d_ops->leaf_max_ents(geo)) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out; + } + + /* Leaves and bests don't overlap in leaf format. */ + if ((char *)&ents[leafhdr.count] > (char *)bestp) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out; + } + + /* Check hash value order, count stale entries. */ + for (i = 0; i < leafhdr.count; i++) { + hash = be32_to_cpu(ents[i].hashval); + if (i > 0 && lasthash > hash) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + lasthash = hash; + if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + stale++; + } + if (leafhdr.stale != stale) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + + /* Check all the bestfree entries. */ + for (i = 0; i < bestcount; i++, bestp++) { + best = be16_to_cpu(*bestp); + if (best == NULLDATAOFF) + continue; + error = xfs_dir3_data_read(sc->tp, sc->ip, + i * args->geo->fsbcount, -1, &dbp); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, + &error)) + continue; + xfs_scrub_directory_check_freesp(sc, lblk, dbp, best); + xfs_trans_brelse(sc->tp, dbp); + } +out: + return error; +} + +/* Check free space info in a directory freespace block. */ +STATIC int +xfs_scrub_directory_free_bestfree( + struct xfs_scrub_context *sc, + struct xfs_da_args *args, + xfs_dablk_t lblk) +{ + struct xfs_dir3_icfree_hdr freehdr; + struct xfs_buf *dbp; + struct xfs_buf *bp; + __be16 *bestp; + __u16 best; + unsigned int stale = 0; + int i; + int error; + + /* Read the free space block */ + error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) + goto out; + + if (xfs_sb_version_hascrc(&sc->mp->m_sb)) { + struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; + + if (hdr3->pad != cpu_to_be32(0)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + } + + /* Check all the entries. */ + sc->ip->d_ops->free_hdr_from_disk(&freehdr, bp->b_addr); + bestp = sc->ip->d_ops->free_bests_p(bp->b_addr); + for (i = 0; i < freehdr.nvalid; i++, bestp++) { + best = be16_to_cpu(*bestp); + if (best == NULLDATAOFF) { + stale++; + continue; + } + error = xfs_dir3_data_read(sc->tp, sc->ip, + (freehdr.firstdb + i) * args->geo->fsbcount, + -1, &dbp); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, + &error)) + continue; + xfs_scrub_directory_check_freesp(sc, lblk, dbp, best); + xfs_trans_brelse(sc->tp, dbp); + } + + if (freehdr.nused + stale != freehdr.nvalid) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); +out: + return error; +} + +/* Check free space information in directories. */ +STATIC int +xfs_scrub_directory_blocks( + struct xfs_scrub_context *sc) +{ + struct xfs_bmbt_irec got; + struct xfs_da_args args; + struct xfs_ifork *ifp; + struct xfs_mount *mp = sc->mp; + xfs_fileoff_t leaf_lblk; + xfs_fileoff_t free_lblk; + xfs_fileoff_t lblk; + struct xfs_iext_cursor icur; + xfs_dablk_t dabno; + bool found; + int is_block = 0; + int error; + + /* Ignore local format directories. */ + if (sc->ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && + sc->ip->i_d.di_format != XFS_DINODE_FMT_BTREE) + return 0; + + ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK); + lblk = XFS_B_TO_FSB(mp, XFS_DIR2_DATA_OFFSET); + leaf_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_LEAF_OFFSET); + free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET); + + /* Is this a block dir? */ + args.dp = sc->ip; + args.geo = mp->m_dir_geo; + args.trans = sc->tp; + error = xfs_dir2_isblock(&args, &is_block); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) + goto out; + + /* Iterate all the data extents in the directory... */ + found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got); + while (found) { + /* Block directories only have a single block at offset 0. */ + if (is_block && + (got.br_startoff > 0 || + got.br_blockcount != args.geo->fsbcount)) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, + got.br_startoff); + break; + } + + /* No more data blocks... */ + if (got.br_startoff >= leaf_lblk) + break; + + /* + * Check each data block's bestfree data. + * + * Iterate all the fsbcount-aligned block offsets in + * this directory. The directory block reading code is + * smart enough to do its own bmap lookups to handle + * discontiguous directory blocks. When we're done + * with the extent record, re-query the bmap at the + * next fsbcount-aligned offset to avoid redundant + * block checks. + */ + for (lblk = roundup((xfs_dablk_t)got.br_startoff, + args.geo->fsbcount); + lblk < got.br_startoff + got.br_blockcount; + lblk += args.geo->fsbcount) { + error = xfs_scrub_directory_data_bestfree(sc, lblk, + is_block); + if (error) + goto out; + } + dabno = got.br_startoff + got.br_blockcount; + lblk = roundup(dabno, args.geo->fsbcount); + found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got); + } + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* Look for a leaf1 block, which has free info. */ + if (xfs_iext_lookup_extent(sc->ip, ifp, leaf_lblk, &icur, &got) && + got.br_startoff == leaf_lblk && + got.br_blockcount == args.geo->fsbcount && + !xfs_iext_next_extent(ifp, &icur, &got)) { + if (is_block) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out; + } + error = xfs_scrub_directory_leaf1_bestfree(sc, &args, + leaf_lblk); + if (error) + goto out; + } + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* Scan for free blocks */ + lblk = free_lblk; + found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got); + while (found) { + /* + * Dirs can't have blocks mapped above 2^32. + * Single-block dirs shouldn't even be here. + */ + lblk = got.br_startoff; + if (lblk & ~0xFFFFFFFFULL) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out; + } + if (is_block) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out; + } + + /* + * Check each dir free block's bestfree data. + * + * Iterate all the fsbcount-aligned block offsets in + * this directory. The directory block reading code is + * smart enough to do its own bmap lookups to handle + * discontiguous directory blocks. When we're done + * with the extent record, re-query the bmap at the + * next fsbcount-aligned offset to avoid redundant + * block checks. + */ + for (lblk = roundup((xfs_dablk_t)got.br_startoff, + args.geo->fsbcount); + lblk < got.br_startoff + got.br_blockcount; + lblk += args.geo->fsbcount) { + error = xfs_scrub_directory_free_bestfree(sc, &args, + lblk); + if (error) + goto out; + } + dabno = got.br_startoff + got.br_blockcount; + lblk = roundup(dabno, args.geo->fsbcount); + found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got); + } +out: + return error; +} + +/* Scrub a whole directory. */ +int +xfs_scrub_directory( + struct xfs_scrub_context *sc) +{ + struct xfs_scrub_dir_ctx sdc = { + .dir_iter.actor = xfs_scrub_dir_actor, + .dir_iter.pos = 0, + .sc = sc, + }; + size_t bufsize; + loff_t oldpos; + int error = 0; + + if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) + return -ENOENT; + + /* Plausible size? */ + if (sc->ip->i_d.di_size < xfs_dir2_sf_hdr_size(0)) { + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL); + goto out; + } + + /* Check directory tree structure */ + error = xfs_scrub_da_btree(sc, XFS_DATA_FORK, xfs_scrub_dir_rec, NULL); + if (error) + return error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return error; + + /* Check the freespace. */ + error = xfs_scrub_directory_blocks(sc); + if (error) + return error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return error; + + /* + * Check that every dirent we see can also be looked up by hash. + * Userspace usually asks for a 32k buffer, so we will too. + */ + bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, + sc->ip->i_d.di_size); + + /* + * Look up every name in this directory by hash. + * + * Use the xfs_readdir function to call xfs_scrub_dir_actor on + * every directory entry in this directory. In _actor, we check + * the name, inode number, and ftype (if applicable) of the + * entry. xfs_readdir uses the VFS filldir functions to provide + * iteration context. + * + * The VFS grabs a read or write lock via i_rwsem before it reads + * or writes to a directory. If we've gotten this far we've + * already obtained IOLOCK_EXCL, which (since 4.10) is the same as + * getting a write lock on i_rwsem. Therefore, it is safe for us + * to drop the ILOCK here in order to reuse the _readdir and + * _dir_lookup routines, which do their own ILOCK locking. + */ + oldpos = 0; + sc->ilock_flags &= ~XFS_ILOCK_EXCL; + xfs_iunlock(sc->ip, XFS_ILOCK_EXCL); + while (true) { + error = xfs_readdir(sc->tp, sc->ip, &sdc.dir_iter, bufsize); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, + &error)) + goto out; + if (oldpos == sdc.dir_iter.pos) + break; + oldpos = sdc.dir_iter.pos; + } + +out: + return error; +} diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c new file mode 100644 index 000000000000..496d6f2fbb9e --- /dev/null +++ b/fs/xfs/scrub/ialloc.c @@ -0,0 +1,337 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_log.h" +#include "xfs_trans_priv.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" + +/* + * Set us up to scrub inode btrees. + * If we detect a discrepancy between the inobt and the inode, + * try again after forcing logged inode cores out to disk. + */ +int +xfs_scrub_setup_ag_iallocbt( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + return xfs_scrub_setup_ag_btree(sc, ip, sc->try_harder); +} + +/* Inode btree scrubber. */ + +/* Is this chunk worth checking? */ +STATIC bool +xfs_scrub_iallocbt_chunk( + struct xfs_scrub_btree *bs, + struct xfs_inobt_rec_incore *irec, + xfs_agino_t agino, + xfs_extlen_t len) +{ + struct xfs_mount *mp = bs->cur->bc_mp; + xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agblock_t bno; + + bno = XFS_AGINO_TO_AGBNO(mp, agino); + if (bno + len <= bno || + !xfs_verify_agbno(mp, agno, bno) || + !xfs_verify_agbno(mp, agno, bno + len - 1)) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + return true; +} + +/* Count the number of free inodes. */ +static unsigned int +xfs_scrub_iallocbt_freecount( + xfs_inofree_t freemask) +{ + BUILD_BUG_ON(sizeof(freemask) != sizeof(__u64)); + return hweight64(freemask); +} + +/* Check a particular inode with ir_free. */ +STATIC int +xfs_scrub_iallocbt_check_cluster_freemask( + struct xfs_scrub_btree *bs, + xfs_ino_t fsino, + xfs_agino_t chunkino, + xfs_agino_t clusterino, + struct xfs_inobt_rec_incore *irec, + struct xfs_buf *bp) +{ + struct xfs_dinode *dip; + struct xfs_mount *mp = bs->cur->bc_mp; + bool inode_is_free = false; + bool freemask_ok; + bool inuse; + int error = 0; + + if (xfs_scrub_should_terminate(bs->sc, &error)) + return error; + + dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize); + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || + (dip->di_version >= 3 && + be64_to_cpu(dip->di_ino) != fsino + clusterino)) { + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + goto out; + } + + if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino)) + inode_is_free = true; + error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, + fsino + clusterino, &inuse); + if (error == -ENODATA) { + /* Not cached, just read the disk buffer */ + freemask_ok = inode_is_free ^ !!(dip->di_mode); + if (!bs->sc->try_harder && !freemask_ok) + return -EDEADLOCK; + } else if (error < 0) { + /* + * Inode is only half assembled, or there was an IO error, + * or the verifier failed, so don't bother trying to check. + * The inode scrubber can deal with this. + */ + goto out; + } else { + /* Inode is all there. */ + freemask_ok = inode_is_free ^ inuse; + } + if (!freemask_ok) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); +out: + return 0; +} + +/* Make sure the free mask is consistent with what the inodes think. */ +STATIC int +xfs_scrub_iallocbt_check_freemask( + struct xfs_scrub_btree *bs, + struct xfs_inobt_rec_incore *irec) +{ + struct xfs_owner_info oinfo; + struct xfs_imap imap; + struct xfs_mount *mp = bs->cur->bc_mp; + struct xfs_dinode *dip; + struct xfs_buf *bp; + xfs_ino_t fsino; + xfs_agino_t nr_inodes; + xfs_agino_t agino; + xfs_agino_t chunkino; + xfs_agino_t clusterino; + xfs_agblock_t agbno; + int blks_per_cluster; + uint16_t holemask; + uint16_t ir_holemask; + int error = 0; + + /* Make sure the freemask matches the inode records. */ + blks_per_cluster = xfs_icluster_size_fsb(mp); + nr_inodes = XFS_OFFBNO_TO_AGINO(mp, blks_per_cluster, 0); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); + + for (agino = irec->ir_startino; + agino < irec->ir_startino + XFS_INODES_PER_CHUNK; + agino += blks_per_cluster * mp->m_sb.sb_inopblock) { + fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); + chunkino = agino - irec->ir_startino; + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + + /* Compute the holemask mask for this cluster. */ + for (clusterino = 0, holemask = 0; clusterino < nr_inodes; + clusterino += XFS_INODES_PER_HOLEMASK_BIT) + holemask |= XFS_INOBT_MASK((chunkino + clusterino) / + XFS_INODES_PER_HOLEMASK_BIT); + + /* The whole cluster must be a hole or not a hole. */ + ir_holemask = (irec->ir_holemask & holemask); + if (ir_holemask != holemask && ir_holemask != 0) { + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + continue; + } + + /* If any part of this is a hole, skip it. */ + if (ir_holemask) + continue; + + /* Grab the inode cluster buffer. */ + imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno, + agbno); + imap.im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); + imap.im_boffset = 0; + + error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, + &dip, &bp, 0, 0); + if (!xfs_scrub_btree_process_error(bs->sc, bs->cur, 0, &error)) + continue; + + /* Which inodes are free? */ + for (clusterino = 0; clusterino < nr_inodes; clusterino++) { + error = xfs_scrub_iallocbt_check_cluster_freemask(bs, + fsino, chunkino, clusterino, irec, bp); + if (error) { + xfs_trans_brelse(bs->cur->bc_tp, bp); + return error; + } + } + + xfs_trans_brelse(bs->cur->bc_tp, bp); + } + + return error; +} + +/* Scrub an inobt/finobt record. */ +STATIC int +xfs_scrub_iallocbt_rec( + struct xfs_scrub_btree *bs, + union xfs_btree_rec *rec) +{ + struct xfs_mount *mp = bs->cur->bc_mp; + struct xfs_inobt_rec_incore irec; + uint64_t holes; + xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agino_t agino; + xfs_agblock_t agbno; + xfs_extlen_t len; + int holecount; + int i; + int error = 0; + unsigned int real_freecount; + uint16_t holemask; + + xfs_inobt_btrec_to_irec(mp, rec, &irec); + + if (irec.ir_count > XFS_INODES_PER_CHUNK || + irec.ir_freecount > XFS_INODES_PER_CHUNK) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + real_freecount = irec.ir_freecount + + (XFS_INODES_PER_CHUNK - irec.ir_count); + if (real_freecount != xfs_scrub_iallocbt_freecount(irec.ir_free)) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + agino = irec.ir_startino; + /* Record has to be properly aligned within the AG. */ + if (!xfs_verify_agino(mp, agno, agino) || + !xfs_verify_agino(mp, agno, agino + XFS_INODES_PER_CHUNK - 1)) { + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + goto out; + } + + /* Make sure this record is aligned to cluster and inoalignmnt size. */ + agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino); + if ((agbno & (xfs_ialloc_cluster_alignment(mp) - 1)) || + (agbno & (xfs_icluster_size_fsb(mp) - 1))) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + /* Handle non-sparse inodes */ + if (!xfs_inobt_issparse(irec.ir_holemask)) { + len = XFS_B_TO_FSB(mp, + XFS_INODES_PER_CHUNK * mp->m_sb.sb_inodesize); + if (irec.ir_count != XFS_INODES_PER_CHUNK) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len)) + goto out; + goto check_freemask; + } + + /* Check each chunk of a sparse inode cluster. */ + holemask = irec.ir_holemask; + holecount = 0; + len = XFS_B_TO_FSB(mp, + XFS_INODES_PER_HOLEMASK_BIT * mp->m_sb.sb_inodesize); + holes = ~xfs_inobt_irec_to_allocmask(&irec); + if ((holes & irec.ir_free) != holes || + irec.ir_freecount > irec.ir_count) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; i++) { + if (holemask & 1) + holecount += XFS_INODES_PER_HOLEMASK_BIT; + else if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len)) + break; + holemask >>= 1; + agino += XFS_INODES_PER_HOLEMASK_BIT; + } + + if (holecount > XFS_INODES_PER_CHUNK || + holecount + irec.ir_count != XFS_INODES_PER_CHUNK) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + +check_freemask: + error = xfs_scrub_iallocbt_check_freemask(bs, &irec); + if (error) + goto out; + +out: + return error; +} + +/* Scrub the inode btrees for some AG. */ +STATIC int +xfs_scrub_iallocbt( + struct xfs_scrub_context *sc, + xfs_btnum_t which) +{ + struct xfs_btree_cur *cur; + struct xfs_owner_info oinfo; + + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); + cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur; + return xfs_scrub_btree(sc, cur, xfs_scrub_iallocbt_rec, &oinfo, NULL); +} + +int +xfs_scrub_inobt( + struct xfs_scrub_context *sc) +{ + return xfs_scrub_iallocbt(sc, XFS_BTNUM_INO); +} + +int +xfs_scrub_finobt( + struct xfs_scrub_context *sc) +{ + return xfs_scrub_iallocbt(sc, XFS_BTNUM_FINO); +} diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c new file mode 100644 index 000000000000..637b7a892313 --- /dev/null +++ b/fs/xfs/scrub/inode.c @@ -0,0 +1,611 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_inode_buf.h" +#include "xfs_inode_fork.h" +#include "xfs_ialloc.h" +#include "xfs_da_format.h" +#include "xfs_reflink.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* + * Grab total control of the inode metadata. It doesn't matter here if + * the file data is still changing; exclusive access to the metadata is + * the goal. + */ +int +xfs_scrub_setup_inode( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = sc->mp; + int error; + + /* + * Try to get the inode. If the verifiers fail, we try again + * in raw mode. + */ + error = xfs_scrub_get_inode(sc, ip); + switch (error) { + case 0: + break; + case -EFSCORRUPTED: + case -EFSBADCRC: + return 0; + default: + return error; + } + + /* Got the inode, lock it and we're ready to go. */ + sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + xfs_ilock(sc->ip, sc->ilock_flags); + error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); + if (error) + goto out; + sc->ilock_flags |= XFS_ILOCK_EXCL; + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + +out: + /* scrub teardown will unlock and release the inode for us */ + return error; +} + +/* Inode core */ + +/* + * Validate di_extsize hint. + * + * The rules are documented at xfs_ioctl_setattr_check_extsize(). + * These functions must be kept in sync with each other. + */ +STATIC void +xfs_scrub_inode_extsize( + struct xfs_scrub_context *sc, + struct xfs_buf *bp, + struct xfs_dinode *dip, + xfs_ino_t ino, + uint16_t mode, + uint16_t flags) +{ + struct xfs_mount *mp = sc->mp; + bool rt_flag; + bool hint_flag; + bool inherit_flag; + uint32_t extsize; + uint32_t extsize_bytes; + uint32_t blocksize_bytes; + + rt_flag = (flags & XFS_DIFLAG_REALTIME); + hint_flag = (flags & XFS_DIFLAG_EXTSIZE); + inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT); + extsize = be32_to_cpu(dip->di_extsize); + extsize_bytes = XFS_FSB_TO_B(sc->mp, extsize); + + if (rt_flag) + blocksize_bytes = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; + else + blocksize_bytes = mp->m_sb.sb_blocksize; + + if ((hint_flag || inherit_flag) && !(S_ISDIR(mode) || S_ISREG(mode))) + goto bad; + + if (hint_flag && !S_ISREG(mode)) + goto bad; + + if (inherit_flag && !S_ISDIR(mode)) + goto bad; + + if ((hint_flag || inherit_flag) && extsize == 0) + goto bad; + + if (!(hint_flag || inherit_flag) && extsize != 0) + goto bad; + + if (extsize_bytes % blocksize_bytes) + goto bad; + + if (extsize > MAXEXTLEN) + goto bad; + + if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2) + goto bad; + + return; +bad: + xfs_scrub_ino_set_corrupt(sc, ino, bp); +} + +/* + * Validate di_cowextsize hint. + * + * The rules are documented at xfs_ioctl_setattr_check_cowextsize(). + * These functions must be kept in sync with each other. + */ +STATIC void +xfs_scrub_inode_cowextsize( + struct xfs_scrub_context *sc, + struct xfs_buf *bp, + struct xfs_dinode *dip, + xfs_ino_t ino, + uint16_t mode, + uint16_t flags, + uint64_t flags2) +{ + struct xfs_mount *mp = sc->mp; + bool rt_flag; + bool hint_flag; + uint32_t extsize; + uint32_t extsize_bytes; + + rt_flag = (flags & XFS_DIFLAG_REALTIME); + hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); + extsize = be32_to_cpu(dip->di_cowextsize); + extsize_bytes = XFS_FSB_TO_B(sc->mp, extsize); + + if (hint_flag && !xfs_sb_version_hasreflink(&mp->m_sb)) + goto bad; + + if (hint_flag && !(S_ISDIR(mode) || S_ISREG(mode))) + goto bad; + + if (hint_flag && extsize == 0) + goto bad; + + if (!hint_flag && extsize != 0) + goto bad; + + if (hint_flag && rt_flag) + goto bad; + + if (extsize_bytes % mp->m_sb.sb_blocksize) + goto bad; + + if (extsize > MAXEXTLEN) + goto bad; + + if (extsize > mp->m_sb.sb_agblocks / 2) + goto bad; + + return; +bad: + xfs_scrub_ino_set_corrupt(sc, ino, bp); +} + +/* Make sure the di_flags make sense for the inode. */ +STATIC void +xfs_scrub_inode_flags( + struct xfs_scrub_context *sc, + struct xfs_buf *bp, + struct xfs_dinode *dip, + xfs_ino_t ino, + uint16_t mode, + uint16_t flags) +{ + struct xfs_mount *mp = sc->mp; + + if (flags & ~XFS_DIFLAG_ANY) + goto bad; + + /* rt flags require rt device */ + if ((flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT)) && + !mp->m_rtdev_targp) + goto bad; + + /* new rt bitmap flag only valid for rbmino */ + if ((flags & XFS_DIFLAG_NEWRTBM) && ino != mp->m_sb.sb_rbmino) + goto bad; + + /* directory-only flags */ + if ((flags & (XFS_DIFLAG_RTINHERIT | + XFS_DIFLAG_EXTSZINHERIT | + XFS_DIFLAG_PROJINHERIT | + XFS_DIFLAG_NOSYMLINKS)) && + !S_ISDIR(mode)) + goto bad; + + /* file-only flags */ + if ((flags & (XFS_DIFLAG_REALTIME | FS_XFLAG_EXTSIZE)) && + !S_ISREG(mode)) + goto bad; + + /* filestreams and rt make no sense */ + if ((flags & XFS_DIFLAG_FILESTREAM) && (flags & XFS_DIFLAG_REALTIME)) + goto bad; + + return; +bad: + xfs_scrub_ino_set_corrupt(sc, ino, bp); +} + +/* Make sure the di_flags2 make sense for the inode. */ +STATIC void +xfs_scrub_inode_flags2( + struct xfs_scrub_context *sc, + struct xfs_buf *bp, + struct xfs_dinode *dip, + xfs_ino_t ino, + uint16_t mode, + uint16_t flags, + uint64_t flags2) +{ + struct xfs_mount *mp = sc->mp; + + if (flags2 & ~XFS_DIFLAG2_ANY) + goto bad; + + /* reflink flag requires reflink feature */ + if ((flags2 & XFS_DIFLAG2_REFLINK) && + !xfs_sb_version_hasreflink(&mp->m_sb)) + goto bad; + + /* cowextsize flag is checked w.r.t. mode separately */ + + /* file/dir-only flags */ + if ((flags2 & XFS_DIFLAG2_DAX) && !(S_ISREG(mode) || S_ISDIR(mode))) + goto bad; + + /* file-only flags */ + if ((flags2 & XFS_DIFLAG2_REFLINK) && !S_ISREG(mode)) + goto bad; + + /* realtime and reflink make no sense, currently */ + if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK)) + goto bad; + + /* dax and reflink make no sense, currently */ + if ((flags2 & XFS_DIFLAG2_DAX) && (flags2 & XFS_DIFLAG2_REFLINK)) + goto bad; + + return; +bad: + xfs_scrub_ino_set_corrupt(sc, ino, bp); +} + +/* Scrub all the ondisk inode fields. */ +STATIC void +xfs_scrub_dinode( + struct xfs_scrub_context *sc, + struct xfs_buf *bp, + struct xfs_dinode *dip, + xfs_ino_t ino) +{ + struct xfs_mount *mp = sc->mp; + size_t fork_recs; + unsigned long long isize; + uint64_t flags2; + uint32_t nextents; + uint16_t flags; + uint16_t mode; + + flags = be16_to_cpu(dip->di_flags); + if (dip->di_version >= 3) + flags2 = be64_to_cpu(dip->di_flags2); + else + flags2 = 0; + + /* di_mode */ + mode = be16_to_cpu(dip->di_mode); + if (mode & ~(S_IALLUGO | S_IFMT)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + /* v1/v2 fields */ + switch (dip->di_version) { + case 1: + /* + * We autoconvert v1 inodes into v2 inodes on writeout, + * so just mark this inode for preening. + */ + xfs_scrub_ino_set_preen(sc, ino, bp); + break; + case 2: + case 3: + if (dip->di_onlink != 0) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + if (dip->di_mode == 0 && sc->ip) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + if (dip->di_projid_hi != 0 && + !xfs_sb_version_hasprojid32bit(&mp->m_sb)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + default: + xfs_scrub_ino_set_corrupt(sc, ino, bp); + return; + } + + /* + * di_uid/di_gid -- -1 isn't invalid, but there's no way that + * userspace could have created that. + */ + if (dip->di_uid == cpu_to_be32(-1U) || + dip->di_gid == cpu_to_be32(-1U)) + xfs_scrub_ino_set_warning(sc, ino, bp); + + /* di_format */ + switch (dip->di_format) { + case XFS_DINODE_FMT_DEV: + if (!S_ISCHR(mode) && !S_ISBLK(mode) && + !S_ISFIFO(mode) && !S_ISSOCK(mode)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_LOCAL: + if (!S_ISDIR(mode) && !S_ISLNK(mode)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_EXTENTS: + if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_BTREE: + if (!S_ISREG(mode) && !S_ISDIR(mode)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_UUID: + default: + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + } + + /* + * di_size. xfs_dinode_verify checks for things that screw up + * the VFS such as the upper bit being set and zero-length + * symlinks/directories, but we can do more here. + */ + isize = be64_to_cpu(dip->di_size); + if (isize & (1ULL << 63)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + /* Devices, fifos, and sockets must have zero size */ + if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + /* Directories can't be larger than the data section size (32G) */ + if (S_ISDIR(mode) && (isize == 0 || isize >= XFS_DIR2_SPACE_SIZE)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + /* Symlinks can't be larger than SYMLINK_MAXLEN */ + if (S_ISLNK(mode) && (isize == 0 || isize >= XFS_SYMLINK_MAXLEN)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + /* + * Warn if the running kernel can't handle the kinds of offsets + * needed to deal with the file size. In other words, if the + * pagecache can't cache all the blocks in this file due to + * overly large offsets, flag the inode for admin review. + */ + if (isize >= mp->m_super->s_maxbytes) + xfs_scrub_ino_set_warning(sc, ino, bp); + + /* di_nblocks */ + if (flags2 & XFS_DIFLAG2_REFLINK) { + ; /* nblocks can exceed dblocks */ + } else if (flags & XFS_DIFLAG_REALTIME) { + /* + * nblocks is the sum of data extents (in the rtdev), + * attr extents (in the datadev), and both forks' bmbt + * blocks (in the datadev). This clumsy check is the + * best we can do without cross-referencing with the + * inode forks. + */ + if (be64_to_cpu(dip->di_nblocks) >= + mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + } else { + if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + } + + xfs_scrub_inode_flags(sc, bp, dip, ino, mode, flags); + + xfs_scrub_inode_extsize(sc, bp, dip, ino, mode, flags); + + /* di_nextents */ + nextents = be32_to_cpu(dip->di_nextents); + fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); + switch (dip->di_format) { + case XFS_DINODE_FMT_EXTENTS: + if (nextents > fork_recs) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_BTREE: + if (nextents <= fork_recs) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + default: + if (nextents != 0) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + } + + /* di_forkoff */ + if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + if (dip->di_anextents != 0 && dip->di_forkoff == 0) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + /* di_aformat */ + if (dip->di_aformat != XFS_DINODE_FMT_LOCAL && + dip->di_aformat != XFS_DINODE_FMT_EXTENTS && + dip->di_aformat != XFS_DINODE_FMT_BTREE) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + /* di_anextents */ + nextents = be16_to_cpu(dip->di_anextents); + fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); + switch (dip->di_aformat) { + case XFS_DINODE_FMT_EXTENTS: + if (nextents > fork_recs) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_BTREE: + if (nextents <= fork_recs) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + default: + if (nextents != 0) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + } + + if (dip->di_version >= 3) { + xfs_scrub_inode_flags2(sc, bp, dip, ino, mode, flags, flags2); + xfs_scrub_inode_cowextsize(sc, bp, dip, ino, mode, flags, + flags2); + } +} + +/* Map and read a raw inode. */ +STATIC int +xfs_scrub_inode_map_raw( + struct xfs_scrub_context *sc, + xfs_ino_t ino, + struct xfs_buf **bpp, + struct xfs_dinode **dipp) +{ + struct xfs_imap imap; + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp = NULL; + struct xfs_dinode *dip; + int error; + + error = xfs_imap(mp, sc->tp, ino, &imap, XFS_IGET_UNTRUSTED); + if (error == -EINVAL) { + /* + * Inode could have gotten deleted out from under us; + * just forget about it. + */ + error = -ENOENT; + goto out; + } + if (!xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino), + XFS_INO_TO_AGBNO(mp, ino), &error)) + goto out; + + error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, + imap.im_blkno, imap.im_len, XBF_UNMAPPED, &bp, + NULL); + if (!xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino), + XFS_INO_TO_AGBNO(mp, ino), &error)) + goto out; + + /* + * Is this really an inode? We disabled verifiers in the above + * xfs_trans_read_buf call because the inode buffer verifier + * fails on /any/ inode record in the inode cluster with a bad + * magic or version number, not just the one that we're + * checking. Therefore, grab the buffer unconditionally, attach + * the inode verifiers by hand, and run the inode verifier only + * on the one inode we want. + */ + bp->b_ops = &xfs_inode_buf_ops; + dip = xfs_buf_offset(bp, imap.im_boffset); + if (!xfs_dinode_verify(mp, ino, dip) || + !xfs_dinode_good_version(mp, dip->di_version)) { + xfs_scrub_ino_set_corrupt(sc, ino, bp); + goto out_buf; + } + + /* ...and is it the one we asked for? */ + if (be32_to_cpu(dip->di_gen) != sc->sm->sm_gen) { + error = -ENOENT; + goto out_buf; + } + + *dipp = dip; + *bpp = bp; +out: + return error; +out_buf: + xfs_trans_brelse(sc->tp, bp); + return error; +} + +/* Scrub an inode. */ +int +xfs_scrub_inode( + struct xfs_scrub_context *sc) +{ + struct xfs_dinode di; + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp = NULL; + struct xfs_dinode *dip; + xfs_ino_t ino; + + bool has_shared; + int error = 0; + + /* Did we get the in-core inode, or are we doing this manually? */ + if (sc->ip) { + ino = sc->ip->i_ino; + xfs_inode_to_disk(sc->ip, &di, 0); + dip = &di; + } else { + /* Map & read inode. */ + ino = sc->sm->sm_ino; + error = xfs_scrub_inode_map_raw(sc, ino, &bp, &dip); + if (error || !bp) + goto out; + } + + xfs_scrub_dinode(sc, bp, dip, ino); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* Now let's do the things that require a live inode. */ + if (!sc->ip) + goto out; + + /* + * Does this inode have the reflink flag set but no shared extents? + * Set the preening flag if this is the case. + */ + if (xfs_is_reflink_inode(sc->ip)) { + error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip, + &has_shared); + if (!xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino), + XFS_INO_TO_AGBNO(mp, ino), &error)) + goto out; + if (!has_shared) + xfs_scrub_ino_set_preen(sc, ino, bp); + } + +out: + if (bp) + xfs_trans_brelse(sc->tp, bp); + return error; +} diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c new file mode 100644 index 000000000000..63a25334fc83 --- /dev/null +++ b/fs/xfs/scrub/parent.c @@ -0,0 +1,317 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_ialloc.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* Set us up to scrub parents. */ +int +xfs_scrub_setup_parent( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + return xfs_scrub_setup_inode_contents(sc, ip, 0); +} + +/* Parent pointers */ + +/* Look for an entry in a parent pointing to this inode. */ + +struct xfs_scrub_parent_ctx { + struct dir_context dc; + xfs_ino_t ino; + xfs_nlink_t nlink; +}; + +/* Look for a single entry in a directory pointing to an inode. */ +STATIC int +xfs_scrub_parent_actor( + struct dir_context *dc, + const char *name, + int namelen, + loff_t pos, + u64 ino, + unsigned type) +{ + struct xfs_scrub_parent_ctx *spc; + + spc = container_of(dc, struct xfs_scrub_parent_ctx, dc); + if (spc->ino == ino) + spc->nlink++; + return 0; +} + +/* Count the number of dentries in the parent dir that point to this inode. */ +STATIC int +xfs_scrub_parent_count_parent_dentries( + struct xfs_scrub_context *sc, + struct xfs_inode *parent, + xfs_nlink_t *nlink) +{ + struct xfs_scrub_parent_ctx spc = { + .dc.actor = xfs_scrub_parent_actor, + .dc.pos = 0, + .ino = sc->ip->i_ino, + .nlink = 0, + }; + size_t bufsize; + loff_t oldpos; + uint lock_mode; + int error = 0; + + /* + * If there are any blocks, read-ahead block 0 as we're almost + * certain to have the next operation be a read there. This is + * how we guarantee that the parent's extent map has been loaded, + * if there is one. + */ + lock_mode = xfs_ilock_data_map_shared(parent); + if (parent->i_d.di_nextents > 0) + error = xfs_dir3_data_readahead(parent, 0, -1); + xfs_iunlock(parent, lock_mode); + if (error) + return error; + + /* + * Iterate the parent dir to confirm that there is + * exactly one entry pointing back to the inode being + * scanned. + */ + bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, + parent->i_d.di_size); + oldpos = 0; + while (true) { + error = xfs_readdir(sc->tp, parent, &spc.dc, bufsize); + if (error) + goto out; + if (oldpos == spc.dc.pos) + break; + oldpos = spc.dc.pos; + } + *nlink = spc.nlink; +out: + return error; +} + +/* + * Given the inode number of the alleged parent of the inode being + * scrubbed, try to validate that the parent has exactly one directory + * entry pointing back to the inode being scrubbed. + */ +STATIC int +xfs_scrub_parent_validate( + struct xfs_scrub_context *sc, + xfs_ino_t dnum, + bool *try_again) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_inode *dp = NULL; + xfs_nlink_t expected_nlink; + xfs_nlink_t nlink; + int error = 0; + + *try_again = false; + + /* '..' must not point to ourselves. */ + if (sc->ip->i_ino == dnum) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out; + } + + /* + * If we're an unlinked directory, the parent /won't/ have a link + * to us. Otherwise, it should have one link. + */ + expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1; + + /* + * Grab this parent inode. We release the inode before we + * cancel the scrub transaction. Since we're don't know a + * priori that releasing the inode won't trigger eofblocks + * cleanup (which allocates what would be a nested transaction) + * if the parent pointer erroneously points to a file, we + * can't use DONTCACHE here because DONTCACHE inodes can trigger + * immediate inactive cleanup of the inode. + */ + error = xfs_iget(mp, sc->tp, dnum, 0, 0, &dp); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out; + if (dp == sc->ip) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out_rele; + } + + /* + * We prefer to keep the inode locked while we lock and search + * its alleged parent for a forward reference. If we can grab + * the iolock, validate the pointers and we're done. We must + * use nowait here to avoid an ABBA deadlock on the parent and + * the child inodes. + */ + if (xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED)) { + error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, + &error)) + goto out_unlock; + if (nlink != expected_nlink) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out_unlock; + } + + /* + * The game changes if we get here. We failed to lock the parent, + * so we're going to try to verify both pointers while only holding + * one lock so as to avoid deadlocking with something that's actually + * trying to traverse down the directory tree. + */ + xfs_iunlock(sc->ip, sc->ilock_flags); + sc->ilock_flags = 0; + xfs_ilock(dp, XFS_IOLOCK_SHARED); + + /* Go looking for our dentry. */ + error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out_unlock; + + /* Drop the parent lock, relock this inode. */ + xfs_iunlock(dp, XFS_IOLOCK_SHARED); + sc->ilock_flags = XFS_IOLOCK_EXCL; + xfs_ilock(sc->ip, sc->ilock_flags); + + /* + * If we're an unlinked directory, the parent /won't/ have a link + * to us. Otherwise, it should have one link. We have to re-set + * it here because we dropped the lock on sc->ip. + */ + expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1; + + /* Look up '..' to see if the inode changed. */ + error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out_rele; + + /* Drat, parent changed. Try again! */ + if (dnum != dp->i_ino) { + iput(VFS_I(dp)); + *try_again = true; + return 0; + } + iput(VFS_I(dp)); + + /* + * '..' didn't change, so check that there was only one entry + * for us in the parent. + */ + if (nlink != expected_nlink) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return error; + +out_unlock: + xfs_iunlock(dp, XFS_IOLOCK_SHARED); +out_rele: + iput(VFS_I(dp)); +out: + return error; +} + +/* Scrub a parent pointer. */ +int +xfs_scrub_parent( + struct xfs_scrub_context *sc) +{ + struct xfs_mount *mp = sc->mp; + xfs_ino_t dnum; + bool try_again; + int tries = 0; + int error = 0; + + /* + * If we're a directory, check that the '..' link points up to + * a directory that has one entry pointing to us. + */ + if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) + return -ENOENT; + + /* We're not a special inode, are we? */ + if (!xfs_verify_dir_ino(mp, sc->ip->i_ino)) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out; + } + + /* + * The VFS grabs a read or write lock via i_rwsem before it reads + * or writes to a directory. If we've gotten this far we've + * already obtained IOLOCK_EXCL, which (since 4.10) is the same as + * getting a write lock on i_rwsem. Therefore, it is safe for us + * to drop the ILOCK here in order to do directory lookups. + */ + sc->ilock_flags &= ~(XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL); + xfs_iunlock(sc->ip, XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL); + + /* Look up '..' */ + error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out; + if (!xfs_verify_dir_ino(mp, dnum)) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out; + } + + /* Is this the root dir? Then '..' must point to itself. */ + if (sc->ip == mp->m_rootip) { + if (sc->ip->i_ino != mp->m_sb.sb_rootino || + sc->ip->i_ino != dnum) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out; + } + + do { + error = xfs_scrub_parent_validate(sc, dnum, &try_again); + if (error) + goto out; + } while (try_again && ++tries < 20); + + /* + * We gave it our best shot but failed, so mark this scrub + * incomplete. Userspace can decide if it wants to try again. + */ + if (try_again && tries == 20) + xfs_scrub_set_incomplete(sc); +out: + return error; +} diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c new file mode 100644 index 000000000000..8e58ba842946 --- /dev/null +++ b/fs/xfs/scrub/quota.c @@ -0,0 +1,304 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_dquot.h" +#include "xfs_dquot_item.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* Convert a scrub type code to a DQ flag, or return 0 if error. */ +static inline uint +xfs_scrub_quota_to_dqtype( + struct xfs_scrub_context *sc) +{ + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_UQUOTA: + return XFS_DQ_USER; + case XFS_SCRUB_TYPE_GQUOTA: + return XFS_DQ_GROUP; + case XFS_SCRUB_TYPE_PQUOTA: + return XFS_DQ_PROJ; + default: + return 0; + } +} + +/* Set us up to scrub a quota. */ +int +xfs_scrub_setup_quota( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + uint dqtype; + + /* + * If userspace gave us an AG number or inode data, they don't + * know what they're doing. Get out. + */ + if (sc->sm->sm_agno || sc->sm->sm_ino || sc->sm->sm_gen) + return -EINVAL; + + dqtype = xfs_scrub_quota_to_dqtype(sc); + if (dqtype == 0) + return -EINVAL; + if (!xfs_this_quota_on(sc->mp, dqtype)) + return -ENOENT; + return 0; +} + +/* Quotas. */ + +/* Scrub the fields in an individual quota item. */ +STATIC void +xfs_scrub_quota_item( + struct xfs_scrub_context *sc, + uint dqtype, + struct xfs_dquot *dq, + xfs_dqid_t id) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_disk_dquot *d = &dq->q_core; + struct xfs_quotainfo *qi = mp->m_quotainfo; + xfs_fileoff_t offset; + unsigned long long bsoft; + unsigned long long isoft; + unsigned long long rsoft; + unsigned long long bhard; + unsigned long long ihard; + unsigned long long rhard; + unsigned long long bcount; + unsigned long long icount; + unsigned long long rcount; + xfs_ino_t fs_icount; + + offset = id * qi->qi_dqperchunk; + + /* + * We fed $id and DQNEXT into the xfs_qm_dqget call, which means + * that the actual dquot we got must either have the same id or + * the next higher id. + */ + if (id > be32_to_cpu(d->d_id)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + /* Did we get the dquot type we wanted? */ + if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + if (d->d_pad0 != cpu_to_be32(0) || d->d_pad != cpu_to_be16(0)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + /* Check the limits. */ + bhard = be64_to_cpu(d->d_blk_hardlimit); + ihard = be64_to_cpu(d->d_ino_hardlimit); + rhard = be64_to_cpu(d->d_rtb_hardlimit); + + bsoft = be64_to_cpu(d->d_blk_softlimit); + isoft = be64_to_cpu(d->d_ino_softlimit); + rsoft = be64_to_cpu(d->d_rtb_softlimit); + + /* + * Warn if the hard limits are larger than the fs. + * Administrators can do this, though in production this seems + * suspect, which is why we flag it for review. + * + * Complain about corruption if the soft limit is greater than + * the hard limit. + */ + if (bhard > mp->m_sb.sb_dblocks) + xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (bsoft > bhard) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + if (ihard > mp->m_maxicount) + xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (isoft > ihard) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + if (rhard > mp->m_sb.sb_rblocks) + xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (rsoft > rhard) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + /* Check the resource counts. */ + bcount = be64_to_cpu(d->d_bcount); + icount = be64_to_cpu(d->d_icount); + rcount = be64_to_cpu(d->d_rtbcount); + fs_icount = percpu_counter_sum(&mp->m_icount); + + /* + * Check that usage doesn't exceed physical limits. However, on + * a reflink filesystem we're allowed to exceed physical space + * if there are no quota limits. + */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (mp->m_sb.sb_dblocks < bcount) + xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, + offset); + } else { + if (mp->m_sb.sb_dblocks < bcount) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, + offset); + } + if (icount > fs_icount || rcount > mp->m_sb.sb_rblocks) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + /* + * We can violate the hard limits if the admin suddenly sets a + * lower limit than the actual usage. However, we flag it for + * admin review. + */ + if (id != 0 && bhard != 0 && bcount > bhard) + xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (id != 0 && ihard != 0 && icount > ihard) + xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset); + if (id != 0 && rhard != 0 && rcount > rhard) + xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset); +} + +/* Scrub all of a quota type's items. */ +int +xfs_scrub_quota( + struct xfs_scrub_context *sc) +{ + struct xfs_bmbt_irec irec = { 0 }; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip; + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct xfs_dquot *dq; + xfs_fileoff_t max_dqid_off; + xfs_fileoff_t off = 0; + xfs_dqid_t id = 0; + uint dqtype; + int nimaps; + int error; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return -ENOENT; + + mutex_lock(&qi->qi_quotaofflock); + dqtype = xfs_scrub_quota_to_dqtype(sc); + if (!xfs_this_quota_on(sc->mp, dqtype)) { + error = -ENOENT; + goto out_unlock_quota; + } + + /* Attach to the quota inode and set sc->ip so that reporting works. */ + ip = xfs_quota_inode(sc->mp, dqtype); + sc->ip = ip; + + /* Look for problem extents. */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) { + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL); + goto out_unlock_inode; + } + max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk; + while (1) { + if (xfs_scrub_should_terminate(sc, &error)) + break; + + off = irec.br_startoff + irec.br_blockcount; + nimaps = 1; + error = xfs_bmapi_read(ip, off, -1, &irec, &nimaps, + XFS_BMAPI_ENTIRE); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, off, + &error)) + goto out_unlock_inode; + if (!nimaps) + break; + if (irec.br_startblock == HOLESTARTBLOCK) + continue; + + /* Check the extent record doesn't point to crap. */ + if (irec.br_startblock + irec.br_blockcount <= + irec.br_startblock) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, + irec.br_startoff); + if (!xfs_verify_fsbno(mp, irec.br_startblock) || + !xfs_verify_fsbno(mp, irec.br_startblock + + irec.br_blockcount - 1)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, + irec.br_startoff); + + /* + * Unwritten extents or blocks mapped above the highest + * quota id shouldn't happen. + */ + if (isnullstartblock(irec.br_startblock) || + irec.br_startoff > max_dqid_off || + irec.br_startoff + irec.br_blockcount > max_dqid_off + 1) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* Check all the quota items. */ + while (id < ((xfs_dqid_t)-1ULL)) { + if (xfs_scrub_should_terminate(sc, &error)) + break; + + error = xfs_qm_dqget(mp, NULL, id, dqtype, XFS_QMOPT_DQNEXT, + &dq); + if (error == -ENOENT) + break; + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, + id * qi->qi_dqperchunk, &error)) + break; + + xfs_scrub_quota_item(sc, dqtype, dq, id); + + id = be32_to_cpu(dq->q_core.d_id) + 1; + xfs_qm_dqput(dq); + if (!id) + break; + } + +out: + /* We set sc->ip earlier, so make sure we clear it now. */ + sc->ip = NULL; +out_unlock_quota: + mutex_unlock(&qi->qi_quotaofflock); + return error; + +out_unlock_inode: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto out; +} diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c new file mode 100644 index 000000000000..2f88a8d44bd0 --- /dev/null +++ b/fs/xfs/scrub/refcount.c @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" + +/* + * Set us up to scrub reference count btrees. + */ +int +xfs_scrub_setup_ag_refcountbt( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + return xfs_scrub_setup_ag_btree(sc, ip, false); +} + +/* Reference count btree scrubber. */ + +/* Scrub a refcountbt record. */ +STATIC int +xfs_scrub_refcountbt_rec( + struct xfs_scrub_btree *bs, + union xfs_btree_rec *rec) +{ + struct xfs_mount *mp = bs->cur->bc_mp; + xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agblock_t bno; + xfs_extlen_t len; + xfs_nlink_t refcount; + bool has_cowflag; + int error = 0; + + bno = be32_to_cpu(rec->refc.rc_startblock); + len = be32_to_cpu(rec->refc.rc_blockcount); + refcount = be32_to_cpu(rec->refc.rc_refcount); + + /* Only CoW records can have refcount == 1. */ + has_cowflag = (bno & XFS_REFC_COW_START); + if ((refcount == 1 && !has_cowflag) || (refcount != 1 && has_cowflag)) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + /* Check the extent. */ + bno &= ~XFS_REFC_COW_START; + if (bno + len <= bno || + !xfs_verify_agbno(mp, agno, bno) || + !xfs_verify_agbno(mp, agno, bno + len - 1)) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (refcount == 0) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + return error; +} + +/* Scrub the refcount btree for some AG. */ +int +xfs_scrub_refcountbt( + struct xfs_scrub_context *sc) +{ + struct xfs_owner_info oinfo; + + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC); + return xfs_scrub_btree(sc, sc->sa.refc_cur, xfs_scrub_refcountbt_rec, + &oinfo, NULL); +} diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c new file mode 100644 index 000000000000..97846c424690 --- /dev/null +++ b/fs/xfs/scrub/rmap.c @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" + +/* + * Set us up to scrub reverse mapping btrees. + */ +int +xfs_scrub_setup_ag_rmapbt( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + return xfs_scrub_setup_ag_btree(sc, ip, false); +} + +/* Reverse-mapping scrubber. */ + +/* Scrub an rmapbt record. */ +STATIC int +xfs_scrub_rmapbt_rec( + struct xfs_scrub_btree *bs, + union xfs_btree_rec *rec) +{ + struct xfs_mount *mp = bs->cur->bc_mp; + struct xfs_rmap_irec irec; + xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + bool non_inode; + bool is_unwritten; + bool is_bmbt; + bool is_attr; + int error; + + error = xfs_rmap_btrec_to_irec(rec, &irec); + if (!xfs_scrub_btree_process_error(bs->sc, bs->cur, 0, &error)) + goto out; + + /* Check extent. */ + if (irec.rm_startblock + irec.rm_blockcount <= irec.rm_startblock) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (irec.rm_owner == XFS_RMAP_OWN_FS) { + /* + * xfs_verify_agbno returns false for static fs metadata. + * Since that only exists at the start of the AG, validate + * that by hand. + */ + if (irec.rm_startblock != 0 || + irec.rm_blockcount != XFS_AGFL_BLOCK(mp) + 1) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + } else { + /* + * Otherwise we must point somewhere past the static metadata + * but before the end of the FS. Run the regular check. + */ + if (!xfs_verify_agbno(mp, agno, irec.rm_startblock) || + !xfs_verify_agbno(mp, agno, irec.rm_startblock + + irec.rm_blockcount - 1)) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + } + + /* Check flags. */ + non_inode = XFS_RMAP_NON_INODE_OWNER(irec.rm_owner); + is_bmbt = irec.rm_flags & XFS_RMAP_BMBT_BLOCK; + is_attr = irec.rm_flags & XFS_RMAP_ATTR_FORK; + is_unwritten = irec.rm_flags & XFS_RMAP_UNWRITTEN; + + if (is_bmbt && irec.rm_offset != 0) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (non_inode && irec.rm_offset != 0) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (is_unwritten && (is_bmbt || non_inode || is_attr)) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (non_inode && (is_bmbt || is_unwritten || is_attr)) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (!non_inode) { + if (!xfs_verify_ino(mp, irec.rm_owner)) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + } else { + /* Non-inode owner within the magic values? */ + if (irec.rm_owner <= XFS_RMAP_OWN_MIN || + irec.rm_owner > XFS_RMAP_OWN_FS) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + } +out: + return error; +} + +/* Scrub the rmap btree for some AG. */ +int +xfs_scrub_rmapbt( + struct xfs_scrub_context *sc) +{ + struct xfs_owner_info oinfo; + + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); + return xfs_scrub_btree(sc, sc->sa.rmap_cur, xfs_scrub_rmapbt_rec, + &oinfo, NULL); +} diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c new file mode 100644 index 000000000000..c6fedb698008 --- /dev/null +++ b/fs/xfs/scrub/rtbitmap.c @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_inode.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* Set us up with the realtime metadata locked. */ +int +xfs_scrub_setup_rt( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = sc->mp; + int error = 0; + + /* + * If userspace gave us an AG number or inode data, they don't + * know what they're doing. Get out. + */ + if (sc->sm->sm_agno || sc->sm->sm_ino || sc->sm->sm_gen) + return -EINVAL; + + error = xfs_scrub_setup_fs(sc, ip); + if (error) + return error; + + sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP; + sc->ip = mp->m_rbmip; + xfs_ilock(sc->ip, sc->ilock_flags); + + return 0; +} + +/* Realtime bitmap. */ + +/* Scrub a free extent record from the realtime bitmap. */ +STATIC int +xfs_scrub_rtbitmap_rec( + struct xfs_trans *tp, + struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xfs_scrub_context *sc = priv; + + if (rec->ar_startblock + rec->ar_blockcount <= rec->ar_startblock || + !xfs_verify_rtbno(sc->mp, rec->ar_startblock) || + !xfs_verify_rtbno(sc->mp, rec->ar_startblock + + rec->ar_blockcount - 1)) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; +} + +/* Scrub the realtime bitmap. */ +int +xfs_scrub_rtbitmap( + struct xfs_scrub_context *sc) +{ + int error; + + error = xfs_rtalloc_query_all(sc->tp, xfs_scrub_rtbitmap_rec, sc); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out; + +out: + return error; +} + +/* Scrub the realtime summary. */ +int +xfs_scrub_rtsummary( + struct xfs_scrub_context *sc) +{ + /* XXX: implement this some day */ + return -ENOENT; +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c new file mode 100644 index 000000000000..9c42c4efd01e --- /dev/null +++ b/fs/xfs/scrub/scrub.c @@ -0,0 +1,392 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_itable.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/scrub.h" +#include "scrub/btree.h" + +/* + * Online Scrub and Repair + * + * Traditionally, XFS (the kernel driver) did not know how to check or + * repair on-disk data structures. That task was left to the xfs_check + * and xfs_repair tools, both of which require taking the filesystem + * offline for a thorough but time consuming examination. Online + * scrub & repair, on the other hand, enables us to check the metadata + * for obvious errors while carefully stepping around the filesystem's + * ongoing operations, locking rules, etc. + * + * Given that most XFS metadata consist of records stored in a btree, + * most of the checking functions iterate the btree blocks themselves + * looking for irregularities. When a record block is encountered, each + * record can be checked for obviously bad values. Record values can + * also be cross-referenced against other btrees to look for potential + * misunderstandings between pieces of metadata. + * + * It is expected that the checkers responsible for per-AG metadata + * structures will lock the AG headers (AGI, AGF, AGFL), iterate the + * metadata structure, and perform any relevant cross-referencing before + * unlocking the AG and returning the results to userspace. These + * scrubbers must not keep an AG locked for too long to avoid tying up + * the block and inode allocators. + * + * Block maps and b-trees rooted in an inode present a special challenge + * because they can involve extents from any AG. The general scrubber + * structure of lock -> check -> xref -> unlock still holds, but AG + * locking order rules /must/ be obeyed to avoid deadlocks. The + * ordering rule, of course, is that we must lock in increasing AG + * order. Helper functions are provided to track which AG headers we've + * already locked. If we detect an imminent locking order violation, we + * can signal a potential deadlock, in which case the scrubber can jump + * out to the top level, lock all the AGs in order, and retry the scrub. + * + * For file data (directories, extended attributes, symlinks) scrub, we + * can simply lock the inode and walk the data. For btree data + * (directories and attributes) we follow the same btree-scrubbing + * strategy outlined previously to check the records. + * + * We use a bit of trickery with transactions to avoid buffer deadlocks + * if there is a cycle in the metadata. The basic problem is that + * travelling down a btree involves locking the current buffer at each + * tree level. If a pointer should somehow point back to a buffer that + * we've already examined, we will deadlock due to the second buffer + * locking attempt. Note however that grabbing a buffer in transaction + * context links the locked buffer to the transaction. If we try to + * re-grab the buffer in the context of the same transaction, we avoid + * the second lock attempt and continue. Between the verifier and the + * scrubber, something will notice that something is amiss and report + * the corruption. Therefore, each scrubber will allocate an empty + * transaction, attach buffers to it, and cancel the transaction at the + * end of the scrub run. Cancelling a non-dirty transaction simply + * unlocks the buffers. + * + * There are four pieces of data that scrub can communicate to + * userspace. The first is the error code (errno), which can be used to + * communicate operational errors in performing the scrub. There are + * also three flags that can be set in the scrub context. If the data + * structure itself is corrupt, the CORRUPT flag will be set. If + * the metadata is correct but otherwise suboptimal, the PREEN flag + * will be set. + */ + +/* + * Scrub probe -- userspace uses this to probe if we're willing to scrub + * or repair a given mountpoint. This will be used by xfs_scrub to + * probe the kernel's abilities to scrub (and repair) the metadata. We + * do this by validating the ioctl inputs from userspace, preparing the + * filesystem for a scrub (or a repair) operation, and immediately + * returning to userspace. Userspace can use the returned errno and + * structure state to decide (in broad terms) if scrub/repair are + * supported by the running kernel. + */ +static int +xfs_scrub_probe( + struct xfs_scrub_context *sc) +{ + int error = 0; + + if (sc->sm->sm_ino || sc->sm->sm_agno) + return -EINVAL; + if (xfs_scrub_should_terminate(sc, &error)) + return error; + + return 0; +} + +/* Scrub setup and teardown */ + +/* Free all the resources and finish the transactions. */ +STATIC int +xfs_scrub_teardown( + struct xfs_scrub_context *sc, + struct xfs_inode *ip_in, + int error) +{ + xfs_scrub_ag_free(sc, &sc->sa); + if (sc->tp) { + xfs_trans_cancel(sc->tp); + sc->tp = NULL; + } + if (sc->ip) { + xfs_iunlock(sc->ip, sc->ilock_flags); + if (sc->ip != ip_in && + !xfs_internal_inum(sc->mp, sc->ip->i_ino)) + iput(VFS_I(sc->ip)); + sc->ip = NULL; + } + if (sc->buf) { + kmem_free(sc->buf); + sc->buf = NULL; + } + return error; +} + +/* Scrubbing dispatch. */ + +static const struct xfs_scrub_meta_ops meta_scrub_ops[] = { + { /* ioctl presence test */ + .setup = xfs_scrub_setup_fs, + .scrub = xfs_scrub_probe, + }, + { /* superblock */ + .setup = xfs_scrub_setup_ag_header, + .scrub = xfs_scrub_superblock, + }, + { /* agf */ + .setup = xfs_scrub_setup_ag_header, + .scrub = xfs_scrub_agf, + }, + { /* agfl */ + .setup = xfs_scrub_setup_ag_header, + .scrub = xfs_scrub_agfl, + }, + { /* agi */ + .setup = xfs_scrub_setup_ag_header, + .scrub = xfs_scrub_agi, + }, + { /* bnobt */ + .setup = xfs_scrub_setup_ag_allocbt, + .scrub = xfs_scrub_bnobt, + }, + { /* cntbt */ + .setup = xfs_scrub_setup_ag_allocbt, + .scrub = xfs_scrub_cntbt, + }, + { /* inobt */ + .setup = xfs_scrub_setup_ag_iallocbt, + .scrub = xfs_scrub_inobt, + }, + { /* finobt */ + .setup = xfs_scrub_setup_ag_iallocbt, + .scrub = xfs_scrub_finobt, + .has = xfs_sb_version_hasfinobt, + }, + { /* rmapbt */ + .setup = xfs_scrub_setup_ag_rmapbt, + .scrub = xfs_scrub_rmapbt, + .has = xfs_sb_version_hasrmapbt, + }, + { /* refcountbt */ + .setup = xfs_scrub_setup_ag_refcountbt, + .scrub = xfs_scrub_refcountbt, + .has = xfs_sb_version_hasreflink, + }, + { /* inode record */ + .setup = xfs_scrub_setup_inode, + .scrub = xfs_scrub_inode, + }, + { /* inode data fork */ + .setup = xfs_scrub_setup_inode_bmap, + .scrub = xfs_scrub_bmap_data, + }, + { /* inode attr fork */ + .setup = xfs_scrub_setup_inode_bmap, + .scrub = xfs_scrub_bmap_attr, + }, + { /* inode CoW fork */ + .setup = xfs_scrub_setup_inode_bmap, + .scrub = xfs_scrub_bmap_cow, + }, + { /* directory */ + .setup = xfs_scrub_setup_directory, + .scrub = xfs_scrub_directory, + }, + { /* extended attributes */ + .setup = xfs_scrub_setup_xattr, + .scrub = xfs_scrub_xattr, + }, + { /* symbolic link */ + .setup = xfs_scrub_setup_symlink, + .scrub = xfs_scrub_symlink, + }, + { /* parent pointers */ + .setup = xfs_scrub_setup_parent, + .scrub = xfs_scrub_parent, + }, + { /* realtime bitmap */ + .setup = xfs_scrub_setup_rt, + .scrub = xfs_scrub_rtbitmap, + .has = xfs_sb_version_hasrealtime, + }, + { /* realtime summary */ + .setup = xfs_scrub_setup_rt, + .scrub = xfs_scrub_rtsummary, + .has = xfs_sb_version_hasrealtime, + }, + { /* user quota */ + .setup = xfs_scrub_setup_quota, + .scrub = xfs_scrub_quota, + }, + { /* group quota */ + .setup = xfs_scrub_setup_quota, + .scrub = xfs_scrub_quota, + }, + { /* project quota */ + .setup = xfs_scrub_setup_quota, + .scrub = xfs_scrub_quota, + }, +}; + +/* This isn't a stable feature, warn once per day. */ +static inline void +xfs_scrub_experimental_warning( + struct xfs_mount *mp) +{ + static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT( + "xfs_scrub_warning", 86400 * HZ, 1); + ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE); + + if (__ratelimit(&scrub_warning)) + xfs_alert(mp, +"EXPERIMENTAL online scrub feature in use. Use at your own risk!"); +} + +/* Dispatch metadata scrubbing. */ +int +xfs_scrub_metadata( + struct xfs_inode *ip, + struct xfs_scrub_metadata *sm) +{ + struct xfs_scrub_context sc; + struct xfs_mount *mp = ip->i_mount; + const struct xfs_scrub_meta_ops *ops; + bool try_harder = false; + int error = 0; + + trace_xfs_scrub_start(ip, sm, error); + + /* Forbidden if we are shut down or mounted norecovery. */ + error = -ESHUTDOWN; + if (XFS_FORCED_SHUTDOWN(mp)) + goto out; + error = -ENOTRECOVERABLE; + if (mp->m_flags & XFS_MOUNT_NORECOVERY) + goto out; + + /* Check our inputs. */ + error = -EINVAL; + sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN) + goto out; + if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved))) + goto out; + + /* Do we know about this type of metadata? */ + error = -ENOENT; + if (sm->sm_type >= XFS_SCRUB_TYPE_NR) + goto out; + ops = &meta_scrub_ops[sm->sm_type]; + if (ops->scrub == NULL) + goto out; + + /* + * We won't scrub any filesystem that doesn't have the ability + * to record unwritten extents. The option was made default in + * 2003, removed from mkfs in 2007, and cannot be disabled in + * v5, so if we find a filesystem without this flag it's either + * really old or totally unsupported. Avoid it either way. + * We also don't support v1-v3 filesystems, which aren't + * mountable. + */ + error = -EOPNOTSUPP; + if (!xfs_sb_version_hasextflgbit(&mp->m_sb)) + goto out; + + /* Does this fs even support this type of metadata? */ + error = -ENOENT; + if (ops->has && !ops->has(&mp->m_sb)) + goto out; + + /* We don't know how to repair anything yet. */ + error = -EOPNOTSUPP; + if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + goto out; + + xfs_scrub_experimental_warning(mp); + +retry_op: + /* Set up for the operation. */ + memset(&sc, 0, sizeof(sc)); + sc.mp = ip->i_mount; + sc.sm = sm; + sc.ops = ops; + sc.try_harder = try_harder; + sc.sa.agno = NULLAGNUMBER; + error = sc.ops->setup(&sc, ip); + if (error) + goto out_teardown; + + /* Scrub for errors. */ + error = sc.ops->scrub(&sc); + if (!try_harder && error == -EDEADLOCK) { + /* + * Scrubbers return -EDEADLOCK to mean 'try harder'. + * Tear down everything we hold, then set up again with + * preparation for worst-case scenarios. + */ + error = xfs_scrub_teardown(&sc, ip, 0); + if (error) + goto out; + try_harder = true; + goto retry_op; + } else if (error) + goto out_teardown; + + if (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT)) + xfs_alert_ratelimited(mp, "Corruption detected during scrub."); + +out_teardown: + error = xfs_scrub_teardown(&sc, ip, error); +out: + trace_xfs_scrub_done(ip, sm, error); + if (error == -EFSCORRUPTED || error == -EFSBADCRC) { + sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + error = 0; + } + return error; +} diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h new file mode 100644 index 000000000000..e9ec041cf713 --- /dev/null +++ b/fs/xfs/scrub/scrub.h @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_SCRUB_SCRUB_H__ +#define __XFS_SCRUB_SCRUB_H__ + +struct xfs_scrub_context; + +struct xfs_scrub_meta_ops { + /* Acquire whatever resources are needed for the operation. */ + int (*setup)(struct xfs_scrub_context *, + struct xfs_inode *); + + /* Examine metadata for errors. */ + int (*scrub)(struct xfs_scrub_context *); + + /* Decide if we even have this piece of metadata. */ + bool (*has)(struct xfs_sb *); +}; + +/* Buffer pointers and btree cursors for an entire AG. */ +struct xfs_scrub_ag { + xfs_agnumber_t agno; + + /* AG btree roots */ + struct xfs_buf *agf_bp; + struct xfs_buf *agfl_bp; + struct xfs_buf *agi_bp; + + /* AG btrees */ + struct xfs_btree_cur *bno_cur; + struct xfs_btree_cur *cnt_cur; + struct xfs_btree_cur *ino_cur; + struct xfs_btree_cur *fino_cur; + struct xfs_btree_cur *rmap_cur; + struct xfs_btree_cur *refc_cur; +}; + +struct xfs_scrub_context { + /* General scrub state. */ + struct xfs_mount *mp; + struct xfs_scrub_metadata *sm; + const struct xfs_scrub_meta_ops *ops; + struct xfs_trans *tp; + struct xfs_inode *ip; + void *buf; + uint ilock_flags; + bool try_harder; + + /* State tracking for single-AG operations. */ + struct xfs_scrub_ag sa; +}; + +/* Metadata scrubbers */ +int xfs_scrub_tester(struct xfs_scrub_context *sc); +int xfs_scrub_superblock(struct xfs_scrub_context *sc); +int xfs_scrub_agf(struct xfs_scrub_context *sc); +int xfs_scrub_agfl(struct xfs_scrub_context *sc); +int xfs_scrub_agi(struct xfs_scrub_context *sc); +int xfs_scrub_bnobt(struct xfs_scrub_context *sc); +int xfs_scrub_cntbt(struct xfs_scrub_context *sc); +int xfs_scrub_inobt(struct xfs_scrub_context *sc); +int xfs_scrub_finobt(struct xfs_scrub_context *sc); +int xfs_scrub_rmapbt(struct xfs_scrub_context *sc); +int xfs_scrub_refcountbt(struct xfs_scrub_context *sc); +int xfs_scrub_inode(struct xfs_scrub_context *sc); +int xfs_scrub_bmap_data(struct xfs_scrub_context *sc); +int xfs_scrub_bmap_attr(struct xfs_scrub_context *sc); +int xfs_scrub_bmap_cow(struct xfs_scrub_context *sc); +int xfs_scrub_directory(struct xfs_scrub_context *sc); +int xfs_scrub_xattr(struct xfs_scrub_context *sc); +int xfs_scrub_symlink(struct xfs_scrub_context *sc); +int xfs_scrub_parent(struct xfs_scrub_context *sc); +#ifdef CONFIG_XFS_RT +int xfs_scrub_rtbitmap(struct xfs_scrub_context *sc); +int xfs_scrub_rtsummary(struct xfs_scrub_context *sc); +#else +static inline int +xfs_scrub_rtbitmap(struct xfs_scrub_context *sc) +{ + return -ENOENT; +} +static inline int +xfs_scrub_rtsummary(struct xfs_scrub_context *sc) +{ + return -ENOENT; +} +#endif +#ifdef CONFIG_XFS_QUOTA +int xfs_scrub_quota(struct xfs_scrub_context *sc); +#else +static inline int +xfs_scrub_quota(struct xfs_scrub_context *sc) +{ + return -ENOENT; +} +#endif + +#endif /* __XFS_SCRUB_SCRUB_H__ */ diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c new file mode 100644 index 000000000000..3aa3d60f7c16 --- /dev/null +++ b/fs/xfs/scrub/symlink.c @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_symlink.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* Set us up to scrub a symbolic link. */ +int +xfs_scrub_setup_symlink( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + /* Allocate the buffer without the inode lock held. */ + sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, KM_SLEEP); + if (!sc->buf) + return -ENOMEM; + + return xfs_scrub_setup_inode_contents(sc, ip, 0); +} + +/* Symbolic links. */ + +int +xfs_scrub_symlink( + struct xfs_scrub_context *sc) +{ + struct xfs_inode *ip = sc->ip; + struct xfs_ifork *ifp; + loff_t len; + int error = 0; + + if (!S_ISLNK(VFS_I(ip)->i_mode)) + return -ENOENT; + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + len = ip->i_d.di_size; + + /* Plausible size? */ + if (len > XFS_SYMLINK_MAXLEN || len <= 0) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out; + } + + /* Inline symlink? */ + if (ifp->if_flags & XFS_IFINLINE) { + if (len > XFS_IFORK_DSIZE(ip) || + len > strnlen(ifp->if_u1.if_data, XFS_IFORK_DSIZE(ip))) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out; + } + + /* Remote symlink; must read the contents. */ + error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out; + if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len) + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); +out: + return error; +} diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c new file mode 100644 index 000000000000..472080e75788 --- /dev/null +++ b/fs/xfs/scrub/trace.c @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_da_format.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_trans.h" +#include "xfs_bit.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" + +/* Figure out which block the btree cursor was pointing to. */ +static inline xfs_fsblock_t +xfs_scrub_btree_cur_fsbno( + struct xfs_btree_cur *cur, + int level) +{ + if (level < cur->bc_nlevels && cur->bc_bufs[level]) + return XFS_DADDR_TO_FSB(cur->bc_mp, cur->bc_bufs[level]->b_bn); + else if (level == cur->bc_nlevels - 1 && + cur->bc_flags & XFS_BTREE_LONG_PTRS) + return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_private.b.ip->i_ino); + else if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS)) + return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, 0); + return NULLFSBLOCK; +} + +/* + * We include this last to have the helpers above available for the trace + * event implementations. + */ +#define CREATE_TRACE_POINTS +#include "scrub/trace.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h new file mode 100644 index 000000000000..c4ebfb5c1ee8 --- /dev/null +++ b/fs/xfs/scrub/trace.h @@ -0,0 +1,499 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM xfs_scrub + +#if !defined(_TRACE_XFS_SCRUB_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_XFS_SCRUB_TRACE_H + +#include <linux/tracepoint.h> +#include "xfs_bit.h" + +DECLARE_EVENT_CLASS(xfs_scrub_class, + TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, + int error), + TP_ARGS(ip, sm, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, type) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, inum) + __field(unsigned int, gen) + __field(unsigned int, flags) + __field(int, error) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->type = sm->sm_type; + __entry->agno = sm->sm_agno; + __entry->inum = sm->sm_ino; + __entry->gen = sm->sm_gen; + __entry->flags = sm->sm_flags; + __entry->error = error; + ), + TP_printk("dev %d:%d ino %llu type %u agno %u inum %llu gen %u flags 0x%x error %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->type, + __entry->agno, + __entry->inum, + __entry->gen, + __entry->flags, + __entry->error) +) +#define DEFINE_SCRUB_EVENT(name) \ +DEFINE_EVENT(xfs_scrub_class, name, \ + TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, \ + int error), \ + TP_ARGS(ip, sm, error)) + +DEFINE_SCRUB_EVENT(xfs_scrub_start); +DEFINE_SCRUB_EVENT(xfs_scrub_done); +DEFINE_SCRUB_EVENT(xfs_scrub_deadlock_retry); + +TRACE_EVENT(xfs_scrub_op_error, + TP_PROTO(struct xfs_scrub_context *sc, xfs_agnumber_t agno, + xfs_agblock_t bno, int error, void *ret_ip), + TP_ARGS(sc, agno, bno, error, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(int, error) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->agno = agno; + __entry->bno = bno; + __entry->error = error; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d type %u agno %u agbno %u error %d ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->agno, + __entry->bno, + __entry->error, + __entry->ret_ip) +); + +TRACE_EVENT(xfs_scrub_file_op_error, + TP_PROTO(struct xfs_scrub_context *sc, int whichfork, + xfs_fileoff_t offset, int error, void *ret_ip), + TP_ARGS(sc, whichfork, offset, error, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(unsigned int, type) + __field(xfs_fileoff_t, offset) + __field(int, error) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->ip->i_mount->m_super->s_dev; + __entry->ino = sc->ip->i_ino; + __entry->whichfork = whichfork; + __entry->type = sc->sm->sm_type; + __entry->offset = offset; + __entry->error = error; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d ino %llu fork %d type %u offset %llu error %d ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->whichfork, + __entry->type, + __entry->offset, + __entry->error, + __entry->ret_ip) +); + +DECLARE_EVENT_CLASS(xfs_scrub_block_error_class, + TP_PROTO(struct xfs_scrub_context *sc, xfs_daddr_t daddr, void *ret_ip), + TP_ARGS(sc, daddr, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(void *, ret_ip) + ), + TP_fast_assign( + xfs_fsblock_t fsbno; + xfs_agnumber_t agno; + xfs_agblock_t bno; + + fsbno = XFS_DADDR_TO_FSB(sc->mp, daddr); + agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); + bno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); + + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->agno = agno; + __entry->bno = bno; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d type %u agno %u agbno %u ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->agno, + __entry->bno, + __entry->ret_ip) +) + +#define DEFINE_SCRUB_BLOCK_ERROR_EVENT(name) \ +DEFINE_EVENT(xfs_scrub_block_error_class, name, \ + TP_PROTO(struct xfs_scrub_context *sc, xfs_daddr_t daddr, \ + void *ret_ip), \ + TP_ARGS(sc, daddr, ret_ip)) + +DEFINE_SCRUB_BLOCK_ERROR_EVENT(xfs_scrub_block_error); +DEFINE_SCRUB_BLOCK_ERROR_EVENT(xfs_scrub_block_preen); + +DECLARE_EVENT_CLASS(xfs_scrub_ino_error_class, + TP_PROTO(struct xfs_scrub_context *sc, xfs_ino_t ino, xfs_daddr_t daddr, + void *ret_ip), + TP_ARGS(sc, ino, daddr, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, type) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(void *, ret_ip) + ), + TP_fast_assign( + xfs_fsblock_t fsbno; + xfs_agnumber_t agno; + xfs_agblock_t bno; + + if (daddr) { + fsbno = XFS_DADDR_TO_FSB(sc->mp, daddr); + agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); + bno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); + } else { + agno = XFS_INO_TO_AGNO(sc->mp, ino); + bno = XFS_AGINO_TO_AGBNO(sc->mp, + XFS_INO_TO_AGINO(sc->mp, ino)); + } + + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = ino; + __entry->type = sc->sm->sm_type; + __entry->agno = agno; + __entry->bno = bno; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d ino %llu type %u agno %u agbno %u ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->type, + __entry->agno, + __entry->bno, + __entry->ret_ip) +) + +#define DEFINE_SCRUB_INO_ERROR_EVENT(name) \ +DEFINE_EVENT(xfs_scrub_ino_error_class, name, \ + TP_PROTO(struct xfs_scrub_context *sc, xfs_ino_t ino, \ + xfs_daddr_t daddr, void *ret_ip), \ + TP_ARGS(sc, ino, daddr, ret_ip)) + +DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_error); +DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_preen); +DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_warning); + +DECLARE_EVENT_CLASS(xfs_scrub_fblock_error_class, + TP_PROTO(struct xfs_scrub_context *sc, int whichfork, + xfs_fileoff_t offset, void *ret_ip), + TP_ARGS(sc, whichfork, offset, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(unsigned int, type) + __field(xfs_fileoff_t, offset) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->ip->i_mount->m_super->s_dev; + __entry->ino = sc->ip->i_ino; + __entry->whichfork = whichfork; + __entry->type = sc->sm->sm_type; + __entry->offset = offset; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d ino %llu fork %d type %u offset %llu ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->whichfork, + __entry->type, + __entry->offset, + __entry->ret_ip) +); + +#define DEFINE_SCRUB_FBLOCK_ERROR_EVENT(name) \ +DEFINE_EVENT(xfs_scrub_fblock_error_class, name, \ + TP_PROTO(struct xfs_scrub_context *sc, int whichfork, \ + xfs_fileoff_t offset, void *ret_ip), \ + TP_ARGS(sc, whichfork, offset, ret_ip)) + +DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xfs_scrub_fblock_error); +DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xfs_scrub_fblock_warning); + +TRACE_EVENT(xfs_scrub_incomplete, + TP_PROTO(struct xfs_scrub_context *sc, void *ret_ip), + TP_ARGS(sc, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d type %u ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->ret_ip) +); + +TRACE_EVENT(xfs_scrub_btree_op_error, + TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, + int level, int error, void *ret_ip), + TP_ARGS(sc, cur, level, error, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(xfs_btnum_t, btnum) + __field(int, level) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(int, ptr); + __field(int, error) + __field(void *, ret_ip) + ), + TP_fast_assign( + xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level); + + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); + __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + __entry->ptr = cur->bc_ptrs[level]; + __entry->error = error; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->btnum, + __entry->level, + __entry->ptr, + __entry->agno, + __entry->bno, + __entry->error, + __entry->ret_ip) +); + +TRACE_EVENT(xfs_scrub_ifork_btree_op_error, + TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, + int level, int error, void *ret_ip), + TP_ARGS(sc, cur, level, error, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(unsigned int, type) + __field(xfs_btnum_t, btnum) + __field(int, level) + __field(int, ptr) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(int, error) + __field(void *, ret_ip) + ), + TP_fast_assign( + xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level); + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->ip->i_ino; + __entry->whichfork = cur->bc_private.b.whichfork; + __entry->type = sc->sm->sm_type; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->ptr = cur->bc_ptrs[level]; + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); + __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + __entry->error = error; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d ino %llu fork %d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->whichfork, + __entry->type, + __entry->btnum, + __entry->level, + __entry->ptr, + __entry->agno, + __entry->bno, + __entry->error, + __entry->ret_ip) +); + +TRACE_EVENT(xfs_scrub_btree_error, + TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, + int level, void *ret_ip), + TP_ARGS(sc, cur, level, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(xfs_btnum_t, btnum) + __field(int, level) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(int, ptr); + __field(void *, ret_ip) + ), + TP_fast_assign( + xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level); + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); + __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + __entry->ptr = cur->bc_ptrs[level]; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->btnum, + __entry->level, + __entry->ptr, + __entry->agno, + __entry->bno, + __entry->ret_ip) +); + +TRACE_EVENT(xfs_scrub_ifork_btree_error, + TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, + int level, void *ret_ip), + TP_ARGS(sc, cur, level, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(unsigned int, type) + __field(xfs_btnum_t, btnum) + __field(int, level) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(int, ptr); + __field(void *, ret_ip) + ), + TP_fast_assign( + xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level); + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->ip->i_ino; + __entry->whichfork = cur->bc_private.b.whichfork; + __entry->type = sc->sm->sm_type; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); + __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + __entry->ptr = cur->bc_ptrs[level]; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d ino %llu fork %d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->whichfork, + __entry->type, + __entry->btnum, + __entry->level, + __entry->ptr, + __entry->agno, + __entry->bno, + __entry->ret_ip) +); + +DECLARE_EVENT_CLASS(xfs_scrub_sbtree_class, + TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, + int level), + TP_ARGS(sc, cur, level), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, type) + __field(xfs_btnum_t, btnum) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bno) + __field(int, level) + __field(int, nlevels) + __field(int, ptr) + ), + TP_fast_assign( + xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level); + + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->btnum = cur->bc_btnum; + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); + __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + __entry->level = level; + __entry->nlevels = cur->bc_nlevels; + __entry->ptr = cur->bc_ptrs[level]; + ), + TP_printk("dev %d:%d type %u btnum %d agno %u agbno %u level %d nlevels %d ptr %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->btnum, + __entry->agno, + __entry->bno, + __entry->level, + __entry->nlevels, + __entry->ptr) +) +#define DEFINE_SCRUB_SBTREE_EVENT(name) \ +DEFINE_EVENT(xfs_scrub_sbtree_class, name, \ + TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, \ + int level), \ + TP_ARGS(sc, cur, level)) + +DEFINE_SCRUB_SBTREE_EVENT(xfs_scrub_btree_rec); +DEFINE_SCRUB_SBTREE_EVENT(xfs_scrub_btree_key); + +#endif /* _TRACE_XFS_SCRUB_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE scrub/trace +#include <trace/define_trace.h> diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h new file mode 100644 index 000000000000..e00e0eadac6a --- /dev/null +++ b/fs/xfs/scrub/xfs_scrub.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_SCRUB_H__ +#define __XFS_SCRUB_H__ + +#ifndef CONFIG_XFS_ONLINE_SCRUB +# define xfs_scrub_metadata(ip, sm) (-ENOTTY) +#else +int xfs_scrub_metadata(struct xfs_inode *ip, struct xfs_scrub_metadata *sm); +#endif /* CONFIG_XFS_ONLINE_SCRUB */ + +#endif /* __XFS_SCRUB_H__ */ diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index 80cd0fd86783..5ff7f228d616 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -19,7 +19,6 @@ #define __XFS_H__ #ifdef CONFIG_XFS_DEBUG -#define STATIC #define DEBUG 1 #define XFS_BUF_LOCK_TRACKING 1 #endif diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 7034e17535de..3354140de07e 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -247,6 +247,8 @@ xfs_set_mode(struct inode *inode, umode_t mode) int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { + umode_t mode; + bool set_mode = false; int error = 0; if (!acl) @@ -257,16 +259,24 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) return error; if (type == ACL_TYPE_ACCESS) { - umode_t mode; - error = posix_acl_update_mode(inode, &mode, &acl); if (error) return error; - error = xfs_set_mode(inode, mode); - if (error) - return error; + set_mode = true; } set_acl: - return __xfs_set_acl(inode, acl, type); + error = __xfs_set_acl(inode, acl, type); + if (error) + return error; + + /* + * We set the mode after successfully updating the ACL xattr because the + * xattr update can fail at ENOSPC and we don't want to change the mode + * if the ACL update hasn't been applied. + */ + if (set_mode) + error = xfs_set_mode(inode, mode); + + return error; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f18e5932aec4..a3eeaba156c5 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -446,6 +446,19 @@ xfs_imap_valid( { offset >>= inode->i_blkbits; + /* + * We have to make sure the cached mapping is within EOF to protect + * against eofblocks trimming on file release leaving us with a stale + * mapping. Otherwise, a page for a subsequent file extending buffered + * write could get picked up by this writeback cycle and written to the + * wrong blocks. + * + * Note that what we really want here is a generic mapping invalidation + * mechanism to protect us from arbitrary extent modifying contexts, not + * just eofblocks. + */ + xfs_trim_extent_eof(imap, XFS_I(inode)); + return offset >= imap->br_startoff && offset < imap->br_startoff + imap->br_blockcount; } @@ -735,6 +748,14 @@ xfs_vm_invalidatepage( { trace_xfs_invalidatepage(page->mapping->host, page, offset, length); + + /* + * If we are invalidating the entire page, clear the dirty state from it + * so that we can check for attempts to release dirty cached pages in + * xfs_vm_releasepage(). + */ + if (offset == 0 && length >= PAGE_SIZE) + cancel_dirty_page(page); block_invalidatepage(page, offset, length); } @@ -1190,25 +1211,27 @@ xfs_vm_releasepage( * mm accommodates an old ext3 case where clean pages might not have had * the dirty bit cleared. Thus, it can send actual dirty pages to * ->releasepage() via shrink_active_list(). Conversely, - * block_invalidatepage() can send pages that are still marked dirty - * but otherwise have invalidated buffers. + * block_invalidatepage() can send pages that are still marked dirty but + * otherwise have invalidated buffers. * * We want to release the latter to avoid unnecessary buildup of the - * LRU, skip the former and warn if we've left any lingering - * delalloc/unwritten buffers on clean pages. Skip pages with delalloc - * or unwritten buffers and warn if the page is not dirty. Otherwise - * try to release the buffers. + * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages + * that are entirely invalidated and need to be released. Hence the + * only time we should get dirty pages here is through + * shrink_active_list() and so we can simply skip those now. + * + * warn if we've left any lingering delalloc/unwritten buffers on clean + * or invalidated pages we are about to release. */ + if (PageDirty(page)) + return 0; + xfs_count_page_state(page, &delalloc, &unwritten); - if (delalloc) { - WARN_ON_ONCE(!PageDirty(page)); + if (WARN_ON_ONCE(delalloc)) return 0; - } - if (unwritten) { - WARN_ON_ONCE(!PageDirty(page)); + if (WARN_ON_ONCE(unwritten)) return 0; - } return try_to_free_buffers(page); } diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index 5d5a5e277f35..d07bf27451c9 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -48,6 +48,8 @@ struct xfs_attr_list_context; #define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ #define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ +#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */ + #define XFS_ATTR_FLAGS \ { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ { ATTR_ROOT, "ROOT" }, \ @@ -56,7 +58,8 @@ struct xfs_attr_list_context; { ATTR_CREATE, "CREATE" }, \ { ATTR_REPLACE, "REPLACE" }, \ { ATTR_KERNOTIME, "KERNOTIME" }, \ - { ATTR_KERNOVAL, "KERNOVAL" } + { ATTR_KERNOVAL, "KERNOVAL" }, \ + { ATTR_INCOMPLETE, "INCOMPLETE" } /* * The maximum size (into the kernel or returned from the kernel) of an diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index ebd66b19fbfc..52818ea2eb50 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -251,47 +251,44 @@ xfs_attr3_node_inactive( * traversal of the tree so we may deal with many blocks * before we come back to this one. */ - error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp, - XFS_ATTR_FORK); + error = xfs_da3_node_read(*trans, dp, child_fsb, -1, &child_bp, + XFS_ATTR_FORK); if (error) return error; - if (child_bp) { - /* save for re-read later */ - child_blkno = XFS_BUF_ADDR(child_bp); - /* - * Invalidate the subtree, however we have to. - */ - info = child_bp->b_addr; - switch (info->magic) { - case cpu_to_be16(XFS_DA_NODE_MAGIC): - case cpu_to_be16(XFS_DA3_NODE_MAGIC): - error = xfs_attr3_node_inactive(trans, dp, - child_bp, level + 1); - break; - case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): - case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): - error = xfs_attr3_leaf_inactive(trans, dp, - child_bp); - break; - default: - error = -EIO; - xfs_trans_brelse(*trans, child_bp); - break; - } - if (error) - return error; + /* save for re-read later */ + child_blkno = XFS_BUF_ADDR(child_bp); - /* - * Remove the subsidiary block from the cache - * and from the log. - */ - error = xfs_da_get_buf(*trans, dp, 0, child_blkno, - &child_bp, XFS_ATTR_FORK); - if (error) - return error; - xfs_trans_binval(*trans, child_bp); + /* + * Invalidate the subtree, however we have to. + */ + info = child_bp->b_addr; + switch (info->magic) { + case cpu_to_be16(XFS_DA_NODE_MAGIC): + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + error = xfs_attr3_node_inactive(trans, dp, child_bp, + level + 1); + break; + case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + error = xfs_attr3_leaf_inactive(trans, dp, child_bp); + break; + default: + error = -EIO; + xfs_trans_brelse(*trans, child_bp); + break; } + if (error) + return error; + + /* + * Remove the subsidiary block from the cache and from the log. + */ + error = xfs_da_get_buf(*trans, dp, 0, child_blkno, &child_bp, + XFS_ATTR_FORK); + if (error) + return error; + xfs_trans_binval(*trans, child_bp); /* * If we're not done, re-read the parent to get the next @@ -302,6 +299,8 @@ xfs_attr3_node_inactive( &bp, XFS_ATTR_FORK); if (error) return error; + node = bp->b_addr; + btree = dp->d_ops->node_tree_p(node); child_fsb = be32_to_cpu(btree[i + 1].before); xfs_trans_brelse(*trans, bp); } diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 7740c8a5e736..3e59a348ea71 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -204,19 +204,103 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) return 0; } +/* + * We didn't find the block & hash mentioned in the cursor state, so + * walk down the attr btree looking for the hash. + */ STATIC int -xfs_attr_node_list(xfs_attr_list_context_t *context) +xfs_attr_node_list_lookup( + struct xfs_attr_list_context *context, + struct attrlist_cursor_kern *cursor, + struct xfs_buf **pbp) { - attrlist_cursor_kern_t *cursor; - xfs_attr_leafblock_t *leaf; - xfs_da_intnode_t *node; - struct xfs_attr3_icleaf_hdr leafhdr; - struct xfs_da3_icnode_hdr nodehdr; - struct xfs_da_node_entry *btree; - int error, i; - struct xfs_buf *bp; - struct xfs_inode *dp = context->dp; - struct xfs_mount *mp = dp->i_mount; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_inode *dp = context->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_trans *tp = context->tp; + struct xfs_buf *bp; + int i; + int error = 0; + unsigned int expected_level = 0; + uint16_t magic; + + ASSERT(*pbp == NULL); + cursor->blkno = 0; + for (;;) { + error = xfs_da3_node_read(tp, dp, cursor->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return error; + node = bp->b_addr; + magic = be16_to_cpu(node->hdr.info.magic); + if (magic == XFS_ATTR_LEAF_MAGIC || + magic == XFS_ATTR3_LEAF_MAGIC) + break; + if (magic != XFS_DA_NODE_MAGIC && + magic != XFS_DA3_NODE_MAGIC) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + node); + goto out_corruptbuf; + } + + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + + /* Tree taller than we can handle; bail out! */ + if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) + goto out_corruptbuf; + + /* Check the level from the root node. */ + if (cursor->blkno == 0) + expected_level = nodehdr.level - 1; + else if (expected_level != nodehdr.level) + goto out_corruptbuf; + else + expected_level--; + + btree = dp->d_ops->node_tree_p(node); + for (i = 0; i < nodehdr.count; btree++, i++) { + if (cursor->hashval <= be32_to_cpu(btree->hashval)) { + cursor->blkno = be32_to_cpu(btree->before); + trace_xfs_attr_list_node_descend(context, + btree); + break; + } + } + xfs_trans_brelse(tp, bp); + + if (i == nodehdr.count) + return 0; + + /* We can't point back to the root. */ + if (cursor->blkno == 0) + return -EFSCORRUPTED; + } + + if (expected_level != 0) + goto out_corruptbuf; + + *pbp = bp; + return 0; + +out_corruptbuf: + xfs_trans_brelse(tp, bp); + return -EFSCORRUPTED; +} + +STATIC int +xfs_attr_node_list( + struct xfs_attr_list_context *context) +{ + struct xfs_attr3_icleaf_hdr leafhdr; + struct attrlist_cursor_kern *cursor; + struct xfs_attr_leafblock *leaf; + struct xfs_da_intnode *node; + struct xfs_buf *bp; + struct xfs_inode *dp = context->dp; + struct xfs_mount *mp = dp->i_mount; + int error; trace_xfs_attr_node_list(context); @@ -277,47 +361,9 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) * Note that start of node block is same as start of leaf block. */ if (bp == NULL) { - cursor->blkno = 0; - for (;;) { - uint16_t magic; - - error = xfs_da3_node_read(context->tp, dp, - cursor->blkno, -1, &bp, - XFS_ATTR_FORK); - if (error) - return error; - node = bp->b_addr; - magic = be16_to_cpu(node->hdr.info.magic); - if (magic == XFS_ATTR_LEAF_MAGIC || - magic == XFS_ATTR3_LEAF_MAGIC) - break; - if (magic != XFS_DA_NODE_MAGIC && - magic != XFS_DA3_NODE_MAGIC) { - XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount, - node); - xfs_trans_brelse(context->tp, bp); - return -EFSCORRUPTED; - } - - dp->d_ops->node_hdr_from_disk(&nodehdr, node); - btree = dp->d_ops->node_tree_p(node); - for (i = 0; i < nodehdr.count; btree++, i++) { - if (cursor->hashval - <= be32_to_cpu(btree->hashval)) { - cursor->blkno = be32_to_cpu(btree->before); - trace_xfs_attr_list_node_descend(context, - btree); - break; - } - } - if (i == nodehdr.count) { - xfs_trans_brelse(context->tp, bp); - return 0; - } - xfs_trans_brelse(context->tp, bp); - } + error = xfs_attr_node_list_lookup(context, cursor, &bp); + if (error || !bp) + return error; } ASSERT(bp != NULL); @@ -407,7 +453,8 @@ xfs_attr3_leaf_list_int( cursor->offset = 0; } - if (entry->flags & XFS_ATTR_INCOMPLETE) + if ((entry->flags & XFS_ATTR_INCOMPLETE) && + !(context->flags & ATTR_INCOMPLETE)) continue; /* skip incomplete entries */ if (entry->flags & XFS_ATTR_LOCAL) { @@ -499,8 +546,8 @@ xfs_attr_list_int( #define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ (((struct attrlist_ent *) 0)->a_name - (char *) 0) #define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ - ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \ - & ~(sizeof(u_int32_t)-1)) + ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(uint32_t)-1) \ + & ~(sizeof(uint32_t)-1)) /* * Format an attribute and copy it out to the user's buffer. @@ -583,6 +630,10 @@ xfs_attr_list( (cursor->hashval || cursor->blkno || cursor->offset)) return -EINVAL; + /* Only internal consumers can retrieve incomplete attrs. */ + if (flags & ATTR_INCOMPLETE) + return -EINVAL; + /* * Check for a properly aligned buffer. */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index e9db7fc95b70..6d37ab43195f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -84,6 +84,7 @@ xfs_zero_extent( GFP_NOFS, 0); } +#ifdef CONFIG_XFS_RT int xfs_bmap_rtalloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ @@ -190,6 +191,7 @@ xfs_bmap_rtalloc( } return 0; } +#endif /* CONFIG_XFS_RT */ /* * Check if the endoff is outside the last extent. If so the caller will grow @@ -227,15 +229,17 @@ xfs_bmap_count_leaves( struct xfs_ifork *ifp, xfs_filblks_t *count) { + struct xfs_iext_cursor icur; struct xfs_bmbt_irec got; - xfs_extnum_t numrecs = 0, i = 0; + xfs_extnum_t numrecs = 0; - while (xfs_iext_get_extent(ifp, i++, &got)) { + for_each_xfs_iext(ifp, &icur, &got) { if (!isnullstartblock(got.br_startblock)) { *count += got.br_blockcount; numrecs++; } } + return numrecs; } @@ -403,125 +407,103 @@ xfs_bmap_count_blocks( return 0; } -/* - * returns 1 for success, 0 if we failed to map the extent. - */ -STATIC int -xfs_getbmapx_fix_eof_hole( - xfs_inode_t *ip, /* xfs incore inode pointer */ - int whichfork, - struct getbmapx *out, /* output structure */ - int prealloced, /* this is a file with - * preallocated data space */ - int64_t end, /* last block requested */ - xfs_fsblock_t startblock, - bool moretocome) +static int +xfs_getbmap_report_one( + struct xfs_inode *ip, + struct getbmapx *bmv, + struct kgetbmap *out, + int64_t bmv_end, + struct xfs_bmbt_irec *got) { - int64_t fixlen; - xfs_mount_t *mp; /* file system mount point */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last extent pointer */ - xfs_fileoff_t fileblock; - - if (startblock == HOLESTARTBLOCK) { - mp = ip->i_mount; - out->bmv_block = -1; - fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip))); - fixlen -= out->bmv_offset; - if (prealloced && out->bmv_offset + out->bmv_length == end) { - /* Came to hole at EOF. Trim it. */ - if (fixlen <= 0) - return 0; - out->bmv_length = fixlen; - } + struct kgetbmap *p = out + bmv->bmv_entries; + bool shared = false, trimmed = false; + int error; + + error = xfs_reflink_trim_around_shared(ip, got, &shared, &trimmed); + if (error) + return error; + + if (isnullstartblock(got->br_startblock) || + got->br_startblock == DELAYSTARTBLOCK) { + /* + * Delalloc extents that start beyond EOF can occur due to + * speculative EOF allocation when the delalloc extent is larger + * than the largest freespace extent at conversion time. These + * extents cannot be converted by data writeback, so can exist + * here even if we are not supposed to be finding delalloc + * extents. + */ + if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip))) + ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0); + + p->bmv_oflags |= BMV_OF_DELALLOC; + p->bmv_block = -2; } else { - if (startblock == DELAYSTARTBLOCK) - out->bmv_block = -2; - else - out->bmv_block = xfs_fsb_to_db(ip, startblock); - fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!moretocome && - xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && - (lastx == xfs_iext_count(ifp) - 1)) - out->bmv_oflags |= BMV_OF_LAST; + p->bmv_block = xfs_fsb_to_db(ip, got->br_startblock); } - return 1; + if (got->br_state == XFS_EXT_UNWRITTEN && + (bmv->bmv_iflags & BMV_IF_PREALLOC)) + p->bmv_oflags |= BMV_OF_PREALLOC; + + if (shared) + p->bmv_oflags |= BMV_OF_SHARED; + + p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, got->br_startoff); + p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, got->br_blockcount); + + bmv->bmv_offset = p->bmv_offset + p->bmv_length; + bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset); + bmv->bmv_entries++; + return 0; } -/* Adjust the reported bmap around shared/unshared extent transitions. */ -STATIC int -xfs_getbmap_adjust_shared( - struct xfs_inode *ip, - int whichfork, - struct xfs_bmbt_irec *map, - struct getbmapx *out, - struct xfs_bmbt_irec *next_map) +static void +xfs_getbmap_report_hole( + struct xfs_inode *ip, + struct getbmapx *bmv, + struct kgetbmap *out, + int64_t bmv_end, + xfs_fileoff_t bno, + xfs_fileoff_t end) { - struct xfs_mount *mp = ip->i_mount; - xfs_agnumber_t agno; - xfs_agblock_t agbno; - xfs_agblock_t ebno; - xfs_extlen_t elen; - xfs_extlen_t nlen; - int error; + struct kgetbmap *p = out + bmv->bmv_entries; - next_map->br_startblock = NULLFSBLOCK; - next_map->br_startoff = NULLFILEOFF; - next_map->br_blockcount = 0; + if (bmv->bmv_iflags & BMV_IF_NO_HOLES) + return; - /* Only written data blocks can be shared. */ - if (!xfs_is_reflink_inode(ip) || - whichfork != XFS_DATA_FORK || - !xfs_bmap_is_real_extent(map)) - return 0; + p->bmv_block = -1; + p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, bno); + p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, end - bno); - agno = XFS_FSB_TO_AGNO(mp, map->br_startblock); - agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock); - error = xfs_reflink_find_shared(mp, NULL, agno, agbno, - map->br_blockcount, &ebno, &elen, true); - if (error) - return error; + bmv->bmv_offset = p->bmv_offset + p->bmv_length; + bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset); + bmv->bmv_entries++; +} - if (ebno == NULLAGBLOCK) { - /* No shared blocks at all. */ - return 0; - } else if (agbno == ebno) { - /* - * Shared extent at (agbno, elen). Shrink the reported - * extent length and prepare to move the start of map[i] - * to agbno+elen, with the aim of (re)formatting the new - * map[i] the next time through the inner loop. - */ - out->bmv_length = XFS_FSB_TO_BB(mp, elen); - out->bmv_oflags |= BMV_OF_SHARED; - if (elen != map->br_blockcount) { - *next_map = *map; - next_map->br_startblock += elen; - next_map->br_startoff += elen; - next_map->br_blockcount -= elen; - } - map->br_blockcount -= elen; - } else { - /* - * There's an unshared extent (agbno, ebno - agbno) - * followed by shared extent at (ebno, elen). Shrink - * the reported extent length to cover only the unshared - * extent and prepare to move up the start of map[i] to - * ebno, with the aim of (re)formatting the new map[i] - * the next time through the inner loop. - */ - *next_map = *map; - nlen = ebno - agbno; - out->bmv_length = XFS_FSB_TO_BB(mp, nlen); - next_map->br_startblock += nlen; - next_map->br_startoff += nlen; - next_map->br_blockcount -= nlen; - map->br_blockcount -= nlen; - } +static inline bool +xfs_getbmap_full( + struct getbmapx *bmv) +{ + return bmv->bmv_length == 0 || bmv->bmv_entries >= bmv->bmv_count - 1; +} - return 0; +static bool +xfs_getbmap_next_rec( + struct xfs_bmbt_irec *rec, + xfs_fileoff_t total_end) +{ + xfs_fileoff_t end = rec->br_startoff + rec->br_blockcount; + + if (end == total_end) + return false; + + rec->br_startoff += rec->br_blockcount; + if (!isnullstartblock(rec->br_startblock) && + rec->br_startblock != DELAYSTARTBLOCK) + rec->br_startblock += rec->br_blockcount; + rec->br_blockcount = total_end - end; + return true; } /* @@ -533,33 +515,22 @@ xfs_getbmap_adjust_shared( */ int /* error code */ xfs_getbmap( - xfs_inode_t *ip, + struct xfs_inode *ip, struct getbmapx *bmv, /* user bmap structure */ - xfs_bmap_format_t formatter, /* format to user */ - void *arg) /* formatter arg */ + struct kgetbmap *out) { - int64_t bmvend; /* last block requested */ - int error = 0; /* return value */ - int64_t fixlen; /* length for -1 case */ - int i; /* extent number */ - int lock; /* lock state */ - xfs_bmbt_irec_t *map; /* buffer for user's data */ - xfs_mount_t *mp; /* file system mount point */ - int nex; /* # of user extents can do */ - int subnex; /* # of bmapi's can do */ - int nmap; /* number of map entries */ - struct getbmapx *out; /* output structure */ - int whichfork; /* data or attr fork */ - int prealloced; /* this is a file with - * preallocated data space */ - int iflags; /* interface flags */ - int bmapi_flags; /* flags for xfs_bmapi */ - int cur_ext = 0; - struct xfs_bmbt_irec inject_map; - - mp = ip->i_mount; - iflags = bmv->bmv_iflags; - + struct xfs_mount *mp = ip->i_mount; + int iflags = bmv->bmv_iflags; + int whichfork, lock, error = 0; + int64_t bmv_end, max_len; + xfs_fileoff_t bno, first_bno; + struct xfs_ifork *ifp; + struct xfs_bmbt_irec got, rec; + xfs_filblks_t len; + struct xfs_iext_cursor icur; + + if (bmv->bmv_iflags & ~BMV_IF_VALID) + return -EINVAL; #ifndef DEBUG /* Only allow CoW fork queries if we're debugging. */ if (iflags & BMV_IF_COWFORK) @@ -568,89 +539,42 @@ xfs_getbmap( if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK)) return -EINVAL; + if (bmv->bmv_length < -1) + return -EINVAL; + bmv->bmv_entries = 0; + if (bmv->bmv_length == 0) + return 0; + if (iflags & BMV_IF_ATTRFORK) whichfork = XFS_ATTR_FORK; else if (iflags & BMV_IF_COWFORK) whichfork = XFS_COW_FORK; else whichfork = XFS_DATA_FORK; + ifp = XFS_IFORK_PTR(ip, whichfork); + xfs_ilock(ip, XFS_IOLOCK_SHARED); switch (whichfork) { case XFS_ATTR_FORK: - if (XFS_IFORK_Q(ip)) { - if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && - ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) - return -EINVAL; - } else if (unlikely( - ip->i_d.di_aformat != 0 && - ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) { - XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW, - ip->i_mount); - return -EFSCORRUPTED; - } + if (!XFS_IFORK_Q(ip)) + goto out_unlock_iolock; - prealloced = 0; - fixlen = 1LL << 32; + max_len = 1LL << 32; + lock = xfs_ilock_attr_map_shared(ip); break; case XFS_COW_FORK: - if (ip->i_cformat != XFS_DINODE_FMT_EXTENTS) - return -EINVAL; + /* No CoW fork? Just return */ + if (!ifp) + goto out_unlock_iolock; - if (xfs_get_cowextsz_hint(ip)) { - prealloced = 1; - fixlen = mp->m_super->s_maxbytes; - } else { - prealloced = 0; - fixlen = XFS_ISIZE(ip); - } - break; - default: - /* Local format data forks report no extents. */ - if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { - bmv->bmv_entries = 0; - return 0; - } - if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE) - return -EINVAL; + if (xfs_get_cowextsz_hint(ip)) + max_len = mp->m_super->s_maxbytes; + else + max_len = XFS_ISIZE(ip); - if (xfs_get_extsz_hint(ip) || - ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ - prealloced = 1; - fixlen = mp->m_super->s_maxbytes; - } else { - prealloced = 0; - fixlen = XFS_ISIZE(ip); - } + lock = XFS_ILOCK_SHARED; + xfs_ilock(ip, lock); break; - } - - if (bmv->bmv_length == -1) { - fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen)); - bmv->bmv_length = - max_t(int64_t, fixlen - bmv->bmv_offset, 0); - } else if (bmv->bmv_length == 0) { - bmv->bmv_entries = 0; - return 0; - } else if (bmv->bmv_length < 0) { - return -EINVAL; - } - - nex = bmv->bmv_count - 1; - if (nex <= 0) - return -EINVAL; - bmvend = bmv->bmv_offset + bmv->bmv_length; - - - if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) - return -ENOMEM; - out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0); - if (!out) - return -ENOMEM; - - xfs_ilock(ip, XFS_IOLOCK_SHARED); - switch (whichfork) { case XFS_DATA_FORK: if (!(iflags & BMV_IF_DELALLOC) && (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) { @@ -668,154 +592,105 @@ xfs_getbmap( */ } + if (xfs_get_extsz_hint(ip) || + (ip->i_d.di_flags & + (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))) + max_len = mp->m_super->s_maxbytes; + else + max_len = XFS_ISIZE(ip); + lock = xfs_ilock_data_map_shared(ip); break; - case XFS_COW_FORK: - lock = XFS_ILOCK_SHARED; - xfs_ilock(ip, lock); - break; - case XFS_ATTR_FORK: - lock = xfs_ilock_attr_map_shared(ip); - break; } - /* - * Don't let nex be bigger than the number of extents - * we can have assuming alternating holes and real extents. - */ - if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) - nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; - - bmapi_flags = xfs_bmapi_aflag(whichfork); - if (!(iflags & BMV_IF_PREALLOC)) - bmapi_flags |= XFS_BMAPI_IGSTATE; - - /* - * Allocate enough space to handle "subnex" maps at a time. - */ - error = -ENOMEM; - subnex = 16; - map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS); - if (!map) + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + break; + case XFS_DINODE_FMT_LOCAL: + /* Local format inode forks report no extents. */ goto out_unlock_ilock; + default: + error = -EINVAL; + goto out_unlock_ilock; + } - bmv->bmv_entries = 0; - - if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 && - (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) { - error = 0; - goto out_free_map; + if (bmv->bmv_length == -1) { + max_len = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, max_len)); + bmv->bmv_length = max(0LL, max_len - bmv->bmv_offset); } - do { - nmap = (nex> subnex) ? subnex : nex; - error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), - XFS_BB_TO_FSB(mp, bmv->bmv_length), - map, &nmap, bmapi_flags); - if (error) - goto out_free_map; - ASSERT(nmap <= subnex); - - for (i = 0; i < nmap && bmv->bmv_length && - cur_ext < bmv->bmv_count - 1; i++) { - out[cur_ext].bmv_oflags = 0; - if (map[i].br_state == XFS_EXT_UNWRITTEN) - out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC; - else if (map[i].br_startblock == DELAYSTARTBLOCK) - out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC; - out[cur_ext].bmv_offset = - XFS_FSB_TO_BB(mp, map[i].br_startoff); - out[cur_ext].bmv_length = - XFS_FSB_TO_BB(mp, map[i].br_blockcount); - out[cur_ext].bmv_unused1 = 0; - out[cur_ext].bmv_unused2 = 0; + bmv_end = bmv->bmv_offset + bmv->bmv_length; - /* - * delayed allocation extents that start beyond EOF can - * occur due to speculative EOF allocation when the - * delalloc extent is larger than the largest freespace - * extent at conversion time. These extents cannot be - * converted by data writeback, so can exist here even - * if we are not supposed to be finding delalloc - * extents. - */ - if (map[i].br_startblock == DELAYSTARTBLOCK && - map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) - ASSERT((iflags & BMV_IF_DELALLOC) != 0); - - if (map[i].br_startblock == HOLESTARTBLOCK && - whichfork == XFS_ATTR_FORK) { - /* came to the end of attribute fork */ - out[cur_ext].bmv_oflags |= BMV_OF_LAST; - goto out_free_map; - } + first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset); + len = XFS_BB_TO_FSB(mp, bmv->bmv_length); - /* Is this a shared block? */ - error = xfs_getbmap_adjust_shared(ip, whichfork, - &map[i], &out[cur_ext], &inject_map); - if (error) - goto out_free_map; + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, whichfork); + if (error) + goto out_unlock_ilock; + } - if (!xfs_getbmapx_fix_eof_hole(ip, whichfork, - &out[cur_ext], prealloced, bmvend, - map[i].br_startblock, - inject_map.br_startblock != NULLFSBLOCK)) - goto out_free_map; + if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) { + /* + * Report a whole-file hole if the delalloc flag is set to + * stay compatible with the old implementation. + */ + if (iflags & BMV_IF_DELALLOC) + xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno, + XFS_B_TO_FSB(mp, XFS_ISIZE(ip))); + goto out_unlock_ilock; + } - bmv->bmv_offset = - out[cur_ext].bmv_offset + - out[cur_ext].bmv_length; - bmv->bmv_length = - max_t(int64_t, 0, bmvend - bmv->bmv_offset); + while (!xfs_getbmap_full(bmv)) { + xfs_trim_extent(&got, first_bno, len); - /* - * In case we don't want to return the hole, - * don't increase cur_ext so that we can reuse - * it in the next loop. - */ - if ((iflags & BMV_IF_NO_HOLES) && - map[i].br_startblock == HOLESTARTBLOCK) { - memset(&out[cur_ext], 0, sizeof(out[cur_ext])); - continue; - } + /* + * Report an entry for a hole if this extent doesn't directly + * follow the previous one. + */ + if (got.br_startoff > bno) { + xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno, + got.br_startoff); + if (xfs_getbmap_full(bmv)) + break; + } - /* - * In order to report shared extents accurately, - * we report each distinct shared/unshared part - * of a single bmbt record using multiple bmap - * extents. To make that happen, we iterate the - * same map array item multiple times, each - * time trimming out the subextent that we just - * reported. - * - * Because of this, we must check the out array - * index (cur_ext) directly against bmv_count-1 - * to avoid overflows. - */ - if (inject_map.br_startblock != NULLFSBLOCK) { - map[i] = inject_map; - i--; + /* + * In order to report shared extents accurately, we report each + * distinct shared / unshared part of a single bmbt record with + * an individual getbmapx record. + */ + bno = got.br_startoff + got.br_blockcount; + rec = got; + do { + error = xfs_getbmap_report_one(ip, bmv, out, bmv_end, + &rec); + if (error || xfs_getbmap_full(bmv)) + goto out_unlock_ilock; + } while (xfs_getbmap_next_rec(&rec, bno)); + + if (!xfs_iext_next_extent(ifp, &icur, &got)) { + xfs_fileoff_t end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); + + out[bmv->bmv_entries - 1].bmv_oflags |= BMV_OF_LAST; + + if (whichfork != XFS_ATTR_FORK && bno < end && + !xfs_getbmap_full(bmv)) { + xfs_getbmap_report_hole(ip, bmv, out, bmv_end, + bno, end); } - bmv->bmv_entries++; - cur_ext++; + break; } - } while (nmap && bmv->bmv_length && cur_ext < bmv->bmv_count - 1); - out_free_map: - kmem_free(map); - out_unlock_ilock: - xfs_iunlock(ip, lock); - out_unlock_iolock: - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - - for (i = 0; i < cur_ext; i++) { - /* format results & advance arg */ - error = formatter(&arg, &out[i]); - if (error) + if (bno >= first_bno + len) break; } - kmem_free(out); +out_unlock_ilock: + xfs_iunlock(ip, lock); +out_unlock_iolock: + xfs_iunlock(ip, XFS_IOLOCK_SHARED); return error; } @@ -1387,53 +1262,12 @@ out: } -/* - * @next_fsb will keep track of the extent currently undergoing shift. - * @stop_fsb will keep track of the extent at which we have to stop. - * If we are shifting left, we will start with block (offset + len) and - * shift each extent till last extent. - * If we are shifting right, we will start with last extent inside file space - * and continue until we reach the block corresponding to offset. - */ static int -xfs_shift_file_space( - struct xfs_inode *ip, - xfs_off_t offset, - xfs_off_t len, - enum shift_direction direction) +xfs_prepare_shift( + struct xfs_inode *ip, + loff_t offset) { - int done = 0; - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; int error; - struct xfs_defer_ops dfops; - xfs_fsblock_t first_block; - xfs_fileoff_t stop_fsb; - xfs_fileoff_t next_fsb; - xfs_fileoff_t shift_fsb; - uint resblks; - - ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); - - if (direction == SHIFT_LEFT) { - /* - * Reserve blocks to cover potential extent merges after left - * shift operations. - */ - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - next_fsb = XFS_B_TO_FSB(mp, offset + len); - stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); - } else { - /* - * If right shift, delegate the work of initialization of - * next_fsb to xfs_bmap_shift_extent as it has ilock held. - */ - resblks = 0; - next_fsb = NULLFSBLOCK; - stop_fsb = XFS_B_TO_FSB(mp, offset); - } - - shift_fsb = XFS_B_TO_FSB(mp, len); /* * Trim eofblocks to avoid shifting uninitialized post-eof preallocation @@ -1449,8 +1283,7 @@ xfs_shift_file_space( * Writeback and invalidate cache for the remainder of the file as we're * about to shift down every extent from offset to EOF. */ - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - offset, -1); + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, offset, -1); if (error) return error; error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, @@ -1470,16 +1303,50 @@ xfs_shift_file_space( return error; } - /* - * The extent shifting code works on extent granularity. So, if - * stop_fsb is not the starting block of extent, we need to split - * the extent at stop_fsb. - */ - if (direction == SHIFT_RIGHT) { - error = xfs_bmap_split_extent(ip, stop_fsb); - if (error) - return error; - } + return 0; +} + +/* + * xfs_collapse_file_space() + * This routine frees disk space and shift extent for the given file. + * The first thing we do is to free data blocks in the specified range + * by calling xfs_free_file_space(). It would also sync dirty data + * and invalidate page cache over the region on which collapse range + * is working. And Shift extent records to the left to cover a hole. + * RETURNS: + * 0 on success + * errno on error + * + */ +int +xfs_collapse_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + struct xfs_defer_ops dfops; + xfs_fsblock_t first_block; + xfs_fileoff_t stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); + xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len); + xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len); + uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + bool done = false; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); + + trace_xfs_collapse_file_space(ip); + + error = xfs_free_file_space(ip, offset, len); + if (error) + return error; + + error = xfs_prepare_shift(ip, offset); + if (error) + return error; while (!error && !done) { error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, @@ -1493,25 +1360,17 @@ xfs_shift_file_space( XFS_QMOPT_RES_REGBLKS); if (error) goto out_trans_cancel; - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_defer_init(&dfops, &first_block); - - /* - * We are using the write transaction in which max 2 bmbt - * updates are allowed - */ - error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb, - &done, stop_fsb, &first_block, &dfops, - direction, XFS_BMAP_MAX_SHIFT_EXTENTS); + error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb, + &done, stop_fsb, &first_block, &dfops); if (error) goto out_bmap_cancel; error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp); } @@ -1525,36 +1384,6 @@ out_trans_cancel: } /* - * xfs_collapse_file_space() - * This routine frees disk space and shift extent for the given file. - * The first thing we do is to free data blocks in the specified range - * by calling xfs_free_file_space(). It would also sync dirty data - * and invalidate page cache over the region on which collapse range - * is working. And Shift extent records to the left to cover a hole. - * RETURNS: - * 0 on success - * errno on error - * - */ -int -xfs_collapse_file_space( - struct xfs_inode *ip, - xfs_off_t offset, - xfs_off_t len) -{ - int error; - - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - trace_xfs_collapse_file_space(ip); - - error = xfs_free_file_space(ip, offset, len); - if (error) - return error; - - return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT); -} - -/* * xfs_insert_file_space() * This routine create hole space by shifting extents for the given file. * The first thing we do is to sync dirty data and invalidate page cache @@ -1572,10 +1401,60 @@ xfs_insert_file_space( loff_t offset, loff_t len) { + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + struct xfs_defer_ops dfops; + xfs_fsblock_t first_block; + xfs_fileoff_t stop_fsb = XFS_B_TO_FSB(mp, offset); + xfs_fileoff_t next_fsb = NULLFSBLOCK; + xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len); + bool done = false; + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); + trace_xfs_insert_file_space(ip); - return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT); + error = xfs_prepare_shift(ip, offset); + if (error) + return error; + + /* + * The extent shifting code works on extent granularity. So, if stop_fsb + * is not the starting block of extent, we need to split the extent at + * stop_fsb. + */ + error = xfs_bmap_split_extent(ip, stop_fsb); + if (error) + return error; + + while (!error && !done) { + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, + &tp); + if (error) + break; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_defer_init(&dfops, &first_block); + error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb, + &done, stop_fsb, &first_block, &dfops); + if (error) + goto out_bmap_cancel; + + error = xfs_defer_finish(&tp, &dfops); + if (error) + goto out_bmap_cancel; + error = xfs_trans_commit(tp); + } + + return error; + +out_bmap_cancel: + xfs_defer_cancel(&dfops); + xfs_trans_cancel(tp); + return error; } /* @@ -1830,7 +1709,6 @@ xfs_swap_extent_forks( xfs_filblks_t aforkblks = 0; xfs_filblks_t taforkblks = 0; xfs_extnum_t junk; - xfs_extnum_t nextents; uint64_t tmp; int error; @@ -1905,13 +1783,6 @@ xfs_swap_extent_forks( switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - /* - * If the extents fit in the inode, fix the pointer. Otherwise - * it's already NULL or pointing to the extent. - */ - nextents = xfs_iext_count(&ip->i_df); - if (nextents <= XFS_INLINE_EXTS) - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; (*src_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: @@ -1923,13 +1794,6 @@ xfs_swap_extent_forks( switch (tip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - /* - * If the extents fit in the inode, fix the pointer. Otherwise - * it's already NULL or pointing to the extent. - */ - nextents = xfs_iext_count(&tip->i_df); - if (nextents <= XFS_INLINE_EXTS) - tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext; (*target_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 0eaa81dc49be..4d4ae48bd4f6 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -28,16 +28,33 @@ struct xfs_mount; struct xfs_trans; struct xfs_bmalloca; +#ifdef CONFIG_XFS_RT int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); +#else /* !CONFIG_XFS_RT */ +/* + * Attempts to allocate RT extents when RT is disable indicates corruption and + * should trigger a shutdown. + */ +static inline int +xfs_bmap_rtalloc(struct xfs_bmalloca *ap) +{ + return -EFSCORRUPTED; +} +#endif /* CONFIG_XFS_RT */ + int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, int whichfork, int *eof); int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, xfs_fileoff_t start_fsb, xfs_fileoff_t length); -/* bmap to userspace formatter - copy to user & advance pointer */ -typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *); +struct kgetbmap { + __s64 bmv_offset; /* file offset of segment in blocks */ + __s64 bmv_block; /* starting block (64-bit daddr_t) */ + __s64 bmv_length; /* length of segment, blocks */ + __s32 bmv_oflags; /* output flags */ +}; int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, - xfs_bmap_format_t formatter, void *arg); + struct kgetbmap *out); /* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */ int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 2f97c12ca75e..4db6e8d780f6 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -42,6 +42,8 @@ #include "xfs_mount.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_errortag.h" +#include "xfs_error.h" static kmem_zone_t *xfs_buf_zone; @@ -2129,3 +2131,17 @@ xfs_buf_terminate(void) { kmem_zone_destroy(xfs_buf_zone); } + +void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) +{ + /* + * Set the lru reference count to 0 based on the error injection tag. + * This allows userspace to disrupt buffer caching for debug/testing + * purposes. + */ + if (XFS_TEST_ERROR(false, bp->b_target->bt_mount, + XFS_ERRTAG_BUF_LRU_REF)) + lru_ref = 0; + + atomic_set(&bp->b_lru_ref, lru_ref); +} diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index bf71507ddb16..f873bb786824 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -352,10 +352,7 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn) #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno)) -static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) -{ - atomic_set(&bp->b_lru_ref, lru_ref); -} +void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref); static inline int xfs_buf_ispinned(struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index ba2638d37031..0c58918bc0ad 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -41,7 +41,7 @@ static unsigned char xfs_dir3_filetype_table[] = { DT_FIFO, DT_SOCK, DT_LNK, DT_WHT, }; -static unsigned char +unsigned char xfs_dir3_get_dtype( struct xfs_mount *mp, uint8_t filetype) @@ -266,7 +266,7 @@ xfs_dir2_leaf_readbuf( xfs_dablk_t next_ra; xfs_dablk_t map_off; xfs_dablk_t last_da; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; int ra_want; int error = 0; @@ -283,7 +283,7 @@ xfs_dir2_leaf_readbuf( */ last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET); map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *cur_off)); - if (!xfs_iext_lookup_extent(dp, ifp, map_off, &idx, &map)) + if (!xfs_iext_lookup_extent(dp, ifp, map_off, &icur, &map)) goto out; if (map.br_startoff >= last_da) goto out; @@ -311,7 +311,7 @@ xfs_dir2_leaf_readbuf( if (next_ra >= last_da) goto out_no_ra; if (map.br_blockcount < geo->fsbcount && - !xfs_iext_get_extent(ifp, ++idx, &map)) + !xfs_iext_next_extent(ifp, &icur, &map)) goto out_no_ra; if (map.br_startoff >= last_da) goto out_no_ra; @@ -334,7 +334,7 @@ xfs_dir2_leaf_readbuf( ra_want -= geo->fsbcount; next_ra += geo->fsbcount; } - if (!xfs_iext_get_extent(ifp, ++idx, &map)) { + if (!xfs_iext_next_extent(ifp, &icur, &map)) { *ra_blk = last_da; break; } diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h index 0f070f9e44e1..de92d9cc958f 100644 --- a/fs/xfs/xfs_discard.h +++ b/fs/xfs/xfs_discard.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef XFS_DISCARD_H #define XFS_DISCARD_H 1 diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index cd82429d8df7..d57c2db64e59 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -53,13 +53,6 @@ * otherwise by the lowest id first, see xfs_dqlock2. */ -#ifdef DEBUG -xfs_buftarg_t *xfs_dqerror_target; -int xfs_do_dqerror; -int xfs_dqreq_num; -int xfs_dqerror_mod = 33; -#endif - struct kmem_zone *xfs_qm_dqtrxzone; static struct kmem_zone *xfs_qm_dqzone; @@ -703,7 +696,7 @@ xfs_dq_get_next_id( xfs_dqid_t next_id = *id + 1; /* simple advance */ uint lock_flags; struct xfs_bmbt_irec got; - xfs_extnum_t idx; + struct xfs_iext_cursor cur; xfs_fsblock_t start; int error = 0; @@ -727,7 +720,7 @@ xfs_dq_get_next_id( return error; } - if (xfs_iext_lookup_extent(quotip, "ip->i_df, start, &idx, &got)) { + if (xfs_iext_lookup_extent(quotip, "ip->i_df, start, &cur, &got)) { /* contiguous chunk, bump startoff for the id calculation */ if (got.br_startoff < start) got.br_startoff = start; @@ -770,15 +763,6 @@ xfs_qm_dqget( return -ESRCH; } -#ifdef DEBUG - if (xfs_do_dqerror) { - if ((xfs_dqerror_target == mp->m_ddev_targp) && - (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { - xfs_debug(mp, "Returning error in dqget"); - return -EIO; - } - } - ASSERT(type == XFS_DQ_USER || type == XFS_DQ_PROJ || type == XFS_DQ_GROUP); @@ -786,7 +770,6 @@ xfs_qm_dqget( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(xfs_inode_dquot(ip, type) == NULL); } -#endif restart: mutex_lock(&qi->qi_tree_lock); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index eaf86f55b7f2..4c9f35d983b2 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -21,6 +21,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_sysfs.h" @@ -58,6 +59,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_DROP_WRITES, XFS_RANDOM_LOG_BAD_CRC, XFS_RANDOM_LOG_ITEM_PIN, + XFS_RANDOM_BUF_LRU_REF, }; struct xfs_errortag_attr { @@ -163,6 +165,7 @@ XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); +XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -196,10 +199,11 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(drop_writes), XFS_ERRORTAG_ATTR_LIST(log_bad_crc), XFS_ERRORTAG_ATTR_LIST(log_item_pin), + XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), NULL, }; -struct kobj_type xfs_errortag_ktype = { +static struct kobj_type xfs_errortag_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_errortag_sysfs_ops, .default_attrs = xfs_errortag_attrs, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 7c4bef3bddb7..ea816c1bf8db 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -63,87 +63,6 @@ extern void xfs_verifier_error(struct xfs_buf *bp); } \ } -/* - * error injection tags - the labels can be anything you want - * but each tag should have its own unique number - */ - -#define XFS_ERRTAG_NOERROR 0 -#define XFS_ERRTAG_IFLUSH_1 1 -#define XFS_ERRTAG_IFLUSH_2 2 -#define XFS_ERRTAG_IFLUSH_3 3 -#define XFS_ERRTAG_IFLUSH_4 4 -#define XFS_ERRTAG_IFLUSH_5 5 -#define XFS_ERRTAG_IFLUSH_6 6 -#define XFS_ERRTAG_DA_READ_BUF 7 -#define XFS_ERRTAG_BTREE_CHECK_LBLOCK 8 -#define XFS_ERRTAG_BTREE_CHECK_SBLOCK 9 -#define XFS_ERRTAG_ALLOC_READ_AGF 10 -#define XFS_ERRTAG_IALLOC_READ_AGI 11 -#define XFS_ERRTAG_ITOBP_INOTOBP 12 -#define XFS_ERRTAG_IUNLINK 13 -#define XFS_ERRTAG_IUNLINK_REMOVE 14 -#define XFS_ERRTAG_DIR_INO_VALIDATE 15 -#define XFS_ERRTAG_BULKSTAT_READ_CHUNK 16 -#define XFS_ERRTAG_IODONE_IOERR 17 -#define XFS_ERRTAG_STRATREAD_IOERR 18 -#define XFS_ERRTAG_STRATCMPL_IOERR 19 -#define XFS_ERRTAG_DIOWRITE_IOERR 20 -#define XFS_ERRTAG_BMAPIFORMAT 21 -#define XFS_ERRTAG_FREE_EXTENT 22 -#define XFS_ERRTAG_RMAP_FINISH_ONE 23 -#define XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE 24 -#define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25 -#define XFS_ERRTAG_BMAP_FINISH_ONE 26 -#define XFS_ERRTAG_AG_RESV_CRITICAL 27 -/* - * DEBUG mode instrumentation to test and/or trigger delayed allocation - * block killing in the event of failed writes. When enabled, all - * buffered writes are silenty dropped and handled as if they failed. - * All delalloc blocks in the range of the write (including pre-existing - * delalloc blocks!) are tossed as part of the write failure error - * handling sequence. - */ -#define XFS_ERRTAG_DROP_WRITES 28 -#define XFS_ERRTAG_LOG_BAD_CRC 29 -#define XFS_ERRTAG_LOG_ITEM_PIN 30 -#define XFS_ERRTAG_MAX 31 - -/* - * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. - */ -#define XFS_RANDOM_DEFAULT 100 -#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT -#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4) -#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT -#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT -#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT -#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT -#define XFS_RANDOM_FREE_EXTENT 1 -#define XFS_RANDOM_RMAP_FINISH_ONE 1 -#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1 -#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 -#define XFS_RANDOM_BMAP_FINISH_ONE 1 -#define XFS_RANDOM_AG_RESV_CRITICAL 4 -#define XFS_RANDOM_DROP_WRITES 1 -#define XFS_RANDOM_LOG_BAD_CRC 1 -#define XFS_RANDOM_LOG_ITEM_PIN 1 - #ifdef DEBUG extern int xfs_errortag_init(struct xfs_mount *mp); extern void xfs_errortag_del(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 309e26c9dddb..18146873a8b3 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -237,11 +237,13 @@ xfs_file_dax_read( if (!count) return 0; /* skip atime */ - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) return -EAGAIN; + } else { xfs_ilock(ip, XFS_IOLOCK_SHARED); } + ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -259,9 +261,10 @@ xfs_file_buffered_aio_read( trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) return -EAGAIN; + } else { xfs_ilock(ip, XFS_IOLOCK_SHARED); } ret = generic_file_read_iter(iocb, to); @@ -552,9 +555,10 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - if (!xfs_ilock_nowait(ip, iolock)) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!xfs_ilock_nowait(ip, iolock)) return -EAGAIN; + } else { xfs_ilock(ip, iolock); } @@ -606,9 +610,10 @@ xfs_file_dax_write( size_t count; loff_t pos; - if (!xfs_ilock_nowait(ip, iolock)) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!xfs_ilock_nowait(ip, iolock)) return -EAGAIN; + } else { xfs_ilock(ip, iolock); } @@ -764,7 +769,7 @@ xfs_file_fallocate( enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL; loff_t new_size = 0; - bool do_file_insert = 0; + bool do_file_insert = false; if (!S_ISREG(inode->i_mode)) return -EINVAL; @@ -825,7 +830,7 @@ xfs_file_fallocate( error = -EINVAL; goto out_unlock; } - do_file_insert = 1; + do_file_insert = true; } else { flags |= XFS_PREALLOC_SET; @@ -979,7 +984,7 @@ xfs_file_readdir( * point we can change the ->readdir prototype to include the * buffer size. For now we use the current glibc buffer size. */ - bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); + bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_d.di_size); return xfs_readdir(NULL, ip, ctx, bufsize); } diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 814ed729881d..43cfc07996a4 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -367,29 +367,6 @@ xfs_getfsmap_datadev_helper( return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr); } -/* Transform a rtbitmap "record" into a fsmap */ -STATIC int -xfs_getfsmap_rtdev_rtbitmap_helper( - struct xfs_trans *tp, - struct xfs_rtalloc_rec *rec, - void *priv) -{ - struct xfs_mount *mp = tp->t_mountp; - struct xfs_getfsmap_info *info = priv; - struct xfs_rmap_irec irec; - xfs_daddr_t rec_daddr; - - rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock); - - irec.rm_startblock = rec->ar_startblock; - irec.rm_blockcount = rec->ar_blockcount; - irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ - irec.rm_offset = 0; - irec.rm_flags = 0; - - return xfs_getfsmap_helper(tp, info, &irec, rec_daddr); -} - /* Transform a bnobt irec into a fsmap */ STATIC int xfs_getfsmap_datadev_bnobt_helper( @@ -475,6 +452,30 @@ xfs_getfsmap_logdev( return xfs_getfsmap_helper(tp, info, &rmap, 0); } +#ifdef CONFIG_XFS_RT +/* Transform a rtbitmap "record" into a fsmap */ +STATIC int +xfs_getfsmap_rtdev_rtbitmap_helper( + struct xfs_trans *tp, + struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_getfsmap_info *info = priv; + struct xfs_rmap_irec irec; + xfs_daddr_t rec_daddr; + + rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock); + + irec.rm_startblock = rec->ar_startblock; + irec.rm_blockcount = rec->ar_blockcount; + irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ + irec.rm_offset = 0; + irec.rm_flags = 0; + + return xfs_getfsmap_helper(tp, info, &irec, rec_daddr); +} + /* Execute a getfsmap query against the realtime device. */ STATIC int __xfs_getfsmap_rtdev( @@ -561,6 +562,7 @@ xfs_getfsmap_rtdev_rtbitmap( return __xfs_getfsmap_rtdev(tp, keys, xfs_getfsmap_rtdev_rtbitmap_query, info); } +#endif /* CONFIG_XFS_RT */ /* Execute a getfsmap query against the regular data device. */ STATIC int @@ -795,7 +797,15 @@ xfs_getfsmap_check_keys( return false; } +/* + * There are only two devices if we didn't configure RT devices at build time. + */ +#ifdef CONFIG_XFS_RT #define XFS_GETFSMAP_DEVS 3 +#else +#define XFS_GETFSMAP_DEVS 2 +#endif /* CONFIG_XFS_RT */ + /* * Get filesystem's extents as described in head, and format for * output. Calls formatter to fill the user's buffer until all @@ -853,10 +863,12 @@ xfs_getfsmap( handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev); handlers[1].fn = xfs_getfsmap_logdev; } +#ifdef CONFIG_XFS_RT if (mp->m_rtdev_targp) { handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev); handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap; } +#endif /* CONFIG_XFS_RT */ xfs_sort(handlers, XFS_GETFSMAP_DEVS, sizeof(struct xfs_getfsmap_dev), xfs_getfsmap_dev_compare); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 34227115a5d6..43005fbe8b1e 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -610,7 +610,7 @@ again: } else { rcu_read_unlock(); if (flags & XFS_IGET_INCORE) { - error = -ENOENT; + error = -ENODATA; goto out_error_or_again; } XFS_STATS_INC(mp, xs_ig_missed); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4ec5b7f45401..d8226f7a5dde 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -39,6 +39,7 @@ #include "xfs_ialloc.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_filestream.h" @@ -384,14 +385,6 @@ xfs_isilocked( } #endif -#ifdef DEBUG -int xfs_locked_n; -int xfs_small_retries; -int xfs_middle_retries; -int xfs_lots_retries; -int xfs_lock_delays; -#endif - /* * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined @@ -544,24 +537,11 @@ again: if ((attempts % 5) == 0) { delay(1); /* Don't just spin the CPU */ -#ifdef DEBUG - xfs_lock_delays++; -#endif } i = 0; try_lock = 0; goto again; } - -#ifdef DEBUG - if (attempts) { - if (attempts < 5) xfs_small_retries++; - else if (attempts < 100) xfs_middle_retries++; - else xfs_lots_retries++; - } else { - xfs_locked_n++; - } -#endif } /* @@ -767,7 +747,7 @@ xfs_ialloc( xfs_inode_t *pip, umode_t mode, xfs_nlink_t nlink, - xfs_dev_t rdev, + dev_t rdev, prid_t prid, int okalloc, xfs_buf_t **ialloc_context, @@ -819,6 +799,7 @@ xfs_ialloc( set_nlink(inode, nlink); ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); + inode->i_rdev = rdev; xfs_set_projid(ip, prid); if (pip && XFS_INHERIT_GID(pip)) { @@ -867,7 +848,6 @@ xfs_ialloc( case S_IFBLK: case S_IFSOCK: ip->i_d.di_format = XFS_DINODE_FMT_DEV; - ip->i_df.if_u2.if_rdev = rdev; ip->i_df.if_flags = 0; flags |= XFS_ILOG_DEV; break; @@ -933,7 +913,7 @@ xfs_ialloc( ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; ip->i_df.if_flags = XFS_IFEXTENTS; ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0; - ip->i_df.if_u1.if_extents = NULL; + ip->i_df.if_u1.if_root = NULL; break; default: ASSERT(0); @@ -975,7 +955,7 @@ xfs_dir_ialloc( the inode. */ umode_t mode, xfs_nlink_t nlink, - xfs_dev_t rdev, + dev_t rdev, prid_t prid, /* project id */ int okalloc, /* ok to allocate new space */ xfs_inode_t **ipp, /* pointer to inode; it will be @@ -1147,7 +1127,7 @@ xfs_create( xfs_inode_t *dp, struct xfs_name *name, umode_t mode, - xfs_dev_t rdev, + dev_t rdev, xfs_inode_t **ipp) { int is_dir = S_ISDIR(mode); @@ -1183,7 +1163,6 @@ xfs_create( return error; if (is_dir) { - rdev = 0; resblks = XFS_MKDIR_SPACE_RES(mp, name->len); tres = &M_RES(mp)->tr_mkdir; } else { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 0ee453de239a..cc13c3763721 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -391,7 +391,7 @@ void xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(struct xfs_inode *dp, struct xfs_name *name, - umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); + umode_t mode, dev_t rdev, struct xfs_inode **ipp); int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, umode_t mode, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, @@ -428,7 +428,7 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, - xfs_nlink_t, xfs_dev_t, prid_t, int, + xfs_nlink_t, dev_t, prid_t, int, struct xfs_inode **, int *); /* from xfs_file.c */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index a705f34b58fa..6ee5c3bf19ad 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -72,7 +72,6 @@ xfs_inode_item_data_fork_size( break; case XFS_DINODE_FMT_DEV: - case XFS_DINODE_FMT_UUID: break; default: ASSERT(0); @@ -156,15 +155,13 @@ xfs_inode_item_format_data_fork( switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: iip->ili_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | - XFS_ILOG_DEV | XFS_ILOG_UUID); + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEV); if ((iip->ili_fields & XFS_ILOG_DEXT) && ip->i_d.di_nextents > 0 && ip->i_df.if_bytes > 0) { struct xfs_bmbt_rec *p; - ASSERT(ip->i_df.if_u1.if_extents != NULL); ASSERT(xfs_iext_count(&ip->i_df) > 0); p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT); @@ -181,8 +178,7 @@ xfs_inode_item_format_data_fork( break; case XFS_DINODE_FMT_BTREE: iip->ili_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | - XFS_ILOG_DEV | XFS_ILOG_UUID); + ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV); if ((iip->ili_fields & XFS_ILOG_DBROOT) && ip->i_df.if_broot_bytes > 0) { @@ -200,8 +196,7 @@ xfs_inode_item_format_data_fork( break; case XFS_DINODE_FMT_LOCAL: iip->ili_fields &= - ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | - XFS_ILOG_DEV | XFS_ILOG_UUID); + ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV); if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { /* @@ -224,17 +219,9 @@ xfs_inode_item_format_data_fork( break; case XFS_DINODE_FMT_DEV: iip->ili_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | - XFS_ILOG_DEXT | XFS_ILOG_UUID); + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEXT); if (iip->ili_fields & XFS_ILOG_DEV) - ilf->ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev; - break; - case XFS_DINODE_FMT_UUID: - iip->ili_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | - XFS_ILOG_DEXT | XFS_ILOG_DEV); - if (iip->ili_fields & XFS_ILOG_UUID) - ilf->ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid; + ilf->ilf_u.ilfu_rdev = sysv_encode_dev(VFS_I(ip)->i_rdev); break; default: ASSERT(0); @@ -264,7 +251,6 @@ xfs_inode_item_format_attr_fork( ASSERT(xfs_iext_count(ip->i_afp) == ip->i_d.di_anextents); - ASSERT(ip->i_afp->if_u1.if_extents != NULL); p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT); data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK); @@ -364,6 +350,9 @@ xfs_inode_to_log_dinode( to->di_dmstate = from->di_dmstate; to->di_flags = from->di_flags; + /* log a dummy value to ensure log structure is fully initialised */ + to->di_next_unlinked = NULLAGINO; + if (from->di_version == 3) { to->di_changecount = inode->i_version; to->di_crtime.t_sec = from->di_crtime.t_sec; @@ -404,6 +393,11 @@ xfs_inode_item_format_core( * the second with the on-disk inode structure, and a possible third and/or * fourth with the inode data/extents/b-tree root and inode attributes * data/extents/b-tree root. + * + * Note: Always use the 64 bit inode log format structure so we don't + * leave an uninitialised hole in the format item on 64 bit systems. Log + * recovery on 32 bit systems handles this just fine, so there's no reason + * for not using an initialising the properly padded structure all the time. */ STATIC void xfs_inode_item_format( @@ -412,8 +406,8 @@ xfs_inode_item_format( { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - struct xfs_inode_log_format *ilf; struct xfs_log_iovec *vecp = NULL; + struct xfs_inode_log_format *ilf; ASSERT(ip->i_d.di_version > 1); @@ -425,7 +419,17 @@ xfs_inode_item_format( ilf->ilf_boffset = ip->i_imap.im_boffset; ilf->ilf_fields = XFS_ILOG_CORE; ilf->ilf_size = 2; /* format + core */ - xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format)); + + /* + * make sure we don't leak uninitialised data into the log in the case + * when we don't log every field in the inode. + */ + ilf->ilf_dsize = 0; + ilf->ilf_asize = 0; + ilf->ilf_pad = 0; + memset(&ilf->ilf_u, 0, sizeof(ilf->ilf_u)); + + xlog_finish_iovec(lv, vecp, sizeof(*ilf)); xfs_inode_item_format_core(ip, lv, &vecp); xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp); @@ -855,44 +859,28 @@ xfs_istale_done( } /* - * convert an xfs_inode_log_format struct from either 32 or 64 bit versions - * (which can have different field alignments) to the native version + * convert an xfs_inode_log_format struct from the old 32 bit version + * (which can have different field alignments) to the native 64 bit version */ int xfs_inode_item_format_convert( - xfs_log_iovec_t *buf, - xfs_inode_log_format_t *in_f) + struct xfs_log_iovec *buf, + struct xfs_inode_log_format *in_f) { - if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) { - xfs_inode_log_format_32_t *in_f32 = buf->i_addr; - - in_f->ilf_type = in_f32->ilf_type; - in_f->ilf_size = in_f32->ilf_size; - in_f->ilf_fields = in_f32->ilf_fields; - in_f->ilf_asize = in_f32->ilf_asize; - in_f->ilf_dsize = in_f32->ilf_dsize; - in_f->ilf_ino = in_f32->ilf_ino; - /* copy biggest field of ilf_u */ - uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f32->ilf_u.ilfu_uuid); - in_f->ilf_blkno = in_f32->ilf_blkno; - in_f->ilf_len = in_f32->ilf_len; - in_f->ilf_boffset = in_f32->ilf_boffset; - return 0; - } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){ - xfs_inode_log_format_64_t *in_f64 = buf->i_addr; - - in_f->ilf_type = in_f64->ilf_type; - in_f->ilf_size = in_f64->ilf_size; - in_f->ilf_fields = in_f64->ilf_fields; - in_f->ilf_asize = in_f64->ilf_asize; - in_f->ilf_dsize = in_f64->ilf_dsize; - in_f->ilf_ino = in_f64->ilf_ino; - /* copy biggest field of ilf_u */ - uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f64->ilf_u.ilfu_uuid); - in_f->ilf_blkno = in_f64->ilf_blkno; - in_f->ilf_len = in_f64->ilf_len; - in_f->ilf_boffset = in_f64->ilf_boffset; - return 0; - } - return -EFSCORRUPTED; + struct xfs_inode_log_format_32 *in_f32 = buf->i_addr; + + if (buf->i_len != sizeof(*in_f32)) + return -EFSCORRUPTED; + + in_f->ilf_type = in_f32->ilf_type; + in_f->ilf_size = in_f32->ilf_size; + in_f->ilf_fields = in_f32->ilf_fields; + in_f->ilf_asize = in_f32->ilf_asize; + in_f->ilf_dsize = in_f32->ilf_dsize; + in_f->ilf_ino = in_f32->ilf_ino; + memcpy(&in_f->ilf_u, &in_f32->ilf_u, sizeof(in_f->ilf_u)); + in_f->ilf_blkno = in_f32->ilf_blkno; + in_f->ilf_len = in_f32->ilf_len; + in_f->ilf_boffset = in_f32->ilf_boffset; + return 0; } diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 4c7722e325b3..b72373a33cd9 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -48,7 +48,7 @@ extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *); extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); extern void xfs_iflush_abort(struct xfs_inode *, bool); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, - xfs_inode_log_format_t *); + struct xfs_inode_log_format *); extern struct kmem_zone *xfs_ili_zone; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index aa75389be8cf..20dc65fef6a4 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -44,6 +44,7 @@ #include "xfs_btree.h" #include <linux/fsmap.h> #include "xfs_fsmap.h" +#include "scrub/xfs_scrub.h" #include <linux/capability.h> #include <linux/cred.h> @@ -310,8 +311,8 @@ xfs_readlink_by_handle( int xfs_set_dmattrs( xfs_inode_t *ip, - u_int evmask, - u_int16_t state) + uint evmask, + uint16_t state) { xfs_mount_t *mp = ip->i_mount; xfs_trans_t *tp; @@ -1201,6 +1202,8 @@ out_unlock: * 8. for non-realtime files, the extent size hint must be limited * to half the AG size to avoid alignment extending the extent beyond the * limits of the AG. + * + * Please keep this function in sync with xfs_scrub_inode_extsize. */ static int xfs_ioctl_setattr_check_extsize( @@ -1257,6 +1260,8 @@ xfs_ioctl_setattr_check_extsize( * 5. Extent size must be a multiple of the appropriate block size. * 6. The extent size hint must be limited to half the AG size to avoid * alignment extending the extent beyond the limits of the AG. + * + * Please keep this function in sync with xfs_scrub_inode_cowextsize. */ static int xfs_ioctl_setattr_check_cowextsize( @@ -1540,17 +1545,26 @@ out_drop_write: return error; } -STATIC int -xfs_getbmap_format(void **ap, struct getbmapx *bmv) +static bool +xfs_getbmap_format( + struct kgetbmap *p, + struct getbmapx __user *u, + size_t recsize) { - struct getbmap __user *base = (struct getbmap __user *)*ap; - - /* copy only getbmap portion (not getbmapx) */ - if (copy_to_user(base, bmv, sizeof(struct getbmap))) - return -EFAULT; - - *ap += sizeof(struct getbmap); - return 0; + if (put_user(p->bmv_offset, &u->bmv_offset) || + put_user(p->bmv_block, &u->bmv_block) || + put_user(p->bmv_length, &u->bmv_length) || + put_user(0, &u->bmv_count) || + put_user(0, &u->bmv_entries)) + return false; + if (recsize < sizeof(struct getbmapx)) + return true; + if (put_user(0, &u->bmv_iflags) || + put_user(p->bmv_oflags, &u->bmv_oflags) || + put_user(0, &u->bmv_unused1) || + put_user(0, &u->bmv_unused2)) + return false; + return true; } STATIC int @@ -1560,68 +1574,57 @@ xfs_ioc_getbmap( void __user *arg) { struct getbmapx bmx = { 0 }; - int error; - - /* struct getbmap is a strict subset of struct getbmapx. */ - if (copy_from_user(&bmx, arg, offsetof(struct getbmapx, bmv_iflags))) - return -EFAULT; + struct kgetbmap *buf; + size_t recsize; + int error, i; - if (bmx.bmv_count < 2) + switch (cmd) { + case XFS_IOC_GETBMAPA: + bmx.bmv_iflags = BMV_IF_ATTRFORK; + /*FALLTHRU*/ + case XFS_IOC_GETBMAP: + if (file->f_mode & FMODE_NOCMTIME) + bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; + /* struct getbmap is a strict subset of struct getbmapx. */ + recsize = sizeof(struct getbmap); + break; + case XFS_IOC_GETBMAPX: + recsize = sizeof(struct getbmapx); + break; + default: return -EINVAL; + } - bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); - if (file->f_mode & FMODE_NOCMTIME) - bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; - - error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format, - (__force struct getbmap *)arg+1); - if (error) - return error; - - /* copy back header - only size of getbmap */ - if (copy_to_user(arg, &bmx, sizeof(struct getbmap))) - return -EFAULT; - return 0; -} - -STATIC int -xfs_getbmapx_format(void **ap, struct getbmapx *bmv) -{ - struct getbmapx __user *base = (struct getbmapx __user *)*ap; - - if (copy_to_user(base, bmv, sizeof(struct getbmapx))) - return -EFAULT; - - *ap += sizeof(struct getbmapx); - return 0; -} - -STATIC int -xfs_ioc_getbmapx( - struct xfs_inode *ip, - void __user *arg) -{ - struct getbmapx bmx; - int error; - - if (copy_from_user(&bmx, arg, sizeof(bmx))) + if (copy_from_user(&bmx, arg, recsize)) return -EFAULT; if (bmx.bmv_count < 2) return -EINVAL; + if (bmx.bmv_count > ULONG_MAX / recsize) + return -ENOMEM; - if (bmx.bmv_iflags & (~BMV_IF_VALID)) - return -EINVAL; + buf = kmem_zalloc_large(bmx.bmv_count * sizeof(*buf), 0); + if (!buf) + return -ENOMEM; - error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format, - (__force struct getbmapx *)arg+1); + error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, buf); if (error) - return error; + goto out_free_buf; - /* copy back header */ - if (copy_to_user(arg, &bmx, sizeof(struct getbmapx))) - return -EFAULT; + error = -EFAULT; + if (copy_to_user(arg, &bmx, recsize)) + goto out_free_buf; + arg += recsize; + + for (i = 0; i < bmx.bmv_entries; i++) { + if (!xfs_getbmap_format(buf + i, arg, recsize)) + goto out_free_buf; + arg += recsize; + } + error = 0; +out_free_buf: + kmem_free(buf); return 0; } @@ -1703,6 +1706,30 @@ xfs_ioc_getfsmap( return 0; } +STATIC int +xfs_ioc_scrub_metadata( + struct xfs_inode *ip, + void __user *arg) +{ + struct xfs_scrub_metadata scrub; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&scrub, arg, sizeof(scrub))) + return -EFAULT; + + error = xfs_scrub_metadata(ip, &scrub); + if (error) + return error; + + if (copy_to_user(arg, &scrub, sizeof(scrub))) + return -EFAULT; + + return 0; +} + int xfs_ioc_swapext( xfs_swapext_t *sxp) @@ -1878,14 +1905,15 @@ xfs_file_ioctl( case XFS_IOC_GETBMAP: case XFS_IOC_GETBMAPA: - return xfs_ioc_getbmap(filp, cmd, arg); - case XFS_IOC_GETBMAPX: - return xfs_ioc_getbmapx(ip, arg); + return xfs_ioc_getbmap(filp, cmd, arg); case FS_IOC_GETFSMAP: return xfs_ioc_getfsmap(ip, arg); + case XFS_IOC_SCRUB_METADATA: + return xfs_ioc_scrub_metadata(ip, arg); + case XFS_IOC_FD_TO_HANDLE: case XFS_IOC_PATH_TO_HANDLE: case XFS_IOC_PATH_TO_FSHANDLE: { diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index e86c3ea137d2..8de879f0c7d5 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -86,7 +86,7 @@ xfs_file_compat_ioctl( extern int xfs_set_dmattrs( struct xfs_inode *ip, - u_int evmask, - u_int16_t state); + uint evmask, + uint16_t state); #endif diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index fa0bc4d46065..35c79e246fde 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -556,6 +556,7 @@ xfs_file_compat_ioctl( case XFS_IOC_ERROR_INJECTION: case XFS_IOC_ERROR_CLEARALL: case FS_IOC_GETFSMAP: + case XFS_IOC_SCRUB_METADATA: return xfs_file_ioctl(filp, cmd, p); #ifndef BROKEN_X86_ALIGNMENT /* These are handled fine if no alignment issues */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index f179bdf1644d..18077e2189a9 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -30,6 +30,7 @@ #include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trans.h" #include "xfs_trans_space.h" @@ -54,13 +55,13 @@ xfs_bmbt_to_iomap( struct xfs_mount *mp = ip->i_mount; if (imap->br_startblock == HOLESTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; + iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; } else if (imap->br_startblock == DELAYSTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; + iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_DELALLOC; } else { - iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock); + iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock)); if (imap->br_state == XFS_EXT_UNWRITTEN) iomap->type = IOMAP_UNWRITTEN; else @@ -389,7 +390,7 @@ xfs_iomap_prealloc_size( struct xfs_inode *ip, loff_t offset, loff_t count, - xfs_extnum_t idx) + struct xfs_iext_cursor *icur) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); @@ -414,7 +415,7 @@ xfs_iomap_prealloc_size( */ if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) || XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || - !xfs_iext_get_extent(ifp, idx - 1, &prev) || + !xfs_iext_peek_prev_extent(ifp, icur, &prev) || prev.br_startoff + prev.br_blockcount < offset_fsb) return mp->m_writeio_blocks; @@ -532,7 +533,7 @@ xfs_file_iomap_begin_delay( xfs_fileoff_t end_fsb; int error = 0, eof = 0; struct xfs_bmbt_irec got; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; xfs_fsblock_t prealloc_blocks = 0; ASSERT(!XFS_IS_REALTIME_INODE(ip)); @@ -557,7 +558,7 @@ xfs_file_iomap_begin_delay( goto out_unlock; } - eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got); + eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got); if (!eof && got.br_startoff <= offset_fsb) { if (xfs_is_reflink_inode(ip)) { bool shared; @@ -591,7 +592,8 @@ xfs_file_iomap_begin_delay( end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); if (eof) { - prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, idx); + prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, + &icur); if (prealloc_blocks) { xfs_extlen_t align; xfs_off_t end_offset; @@ -613,7 +615,8 @@ xfs_file_iomap_begin_delay( retry: error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, - end_fsb - offset_fsb, prealloc_blocks, &got, &idx, eof); + end_fsb - offset_fsb, prealloc_blocks, &got, &icur, + eof); switch (error) { case 0: break; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 17081c77ef86..56475fcd76f2 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -160,7 +160,6 @@ xfs_generic_create( if (S_ISCHR(mode) || S_ISBLK(mode)) { if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) return -EINVAL; - rdev = sysv_encode_dev(rdev); } else { rdev = 0; } @@ -535,8 +534,7 @@ xfs_vn_getattr( case S_IFBLK: case S_IFCHR: stat->blksize = BLKDEV_IOSIZE; - stat->rdev = MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, - sysv_minor(ip->i_df.if_u2.if_rdev)); + stat->rdev = inode->i_rdev; break; default: if (XFS_IS_REALTIME_INODE(ip)) { @@ -886,22 +884,6 @@ xfs_setattr_size( return error; /* - * We are going to log the inode size change in this transaction so - * any previous writes that are beyond the on disk EOF and the new - * EOF that have not been written out need to be written here. If we - * do not write the data out, we expose ourselves to the null files - * problem. Note that this includes any block zeroing we did above; - * otherwise those blocks may not be zeroed after a crash. - */ - if (did_zeroing || - (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) { - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - ip->i_d.di_size, newsize); - if (error) - return error; - } - - /* * We've already locked out new page faults, so now we can safely remove * pages from the page cache knowing they won't get refaulted until we * drop the XFS_MMAP_EXCL lock after the extent manipulations are @@ -917,9 +899,29 @@ xfs_setattr_size( * user visible changes). There's not much we can do about this, except * to hope that the caller sees ENOMEM and retries the truncate * operation. + * + * And we update in-core i_size and truncate page cache beyond newsize + * before writeback the [di_size, newsize] range, so we're guaranteed + * not to write stale data past the new EOF on truncate down. */ truncate_setsize(inode, newsize); + /* + * We are going to log the inode size change in this transaction so + * any previous writes that are beyond the on disk EOF and the new + * EOF that have not been written out need to be written here. If we + * do not write the data out, we expose ourselves to the null files + * problem. Note that this includes any block zeroing we did above; + * otherwise those blocks may not be zeroed after a crash. + */ + if (did_zeroing || + (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) { + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ip->i_d.di_size, newsize - 1); + if (error) + return error; + } + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) return error; @@ -1231,18 +1233,6 @@ xfs_setup_inode( inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid); inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid); - switch (inode->i_mode & S_IFMT) { - case S_IFBLK: - case S_IFCHR: - inode->i_rdev = - MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, - sysv_minor(ip->i_df.if_u2.if_rdev)); - break; - default: - inode->i_rdev = 0; - break; - } - i_size_write(inode, ip->i_d.di_size); xfs_diflags_to_iflags(inode, ip); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index c393a2f6d8c3..d58310514423 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -31,16 +31,6 @@ #include "xfs_trace.h" #include "xfs_icache.h" -int -xfs_internal_inum( - xfs_mount_t *mp, - xfs_ino_t ino) -{ - return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || - (xfs_sb_version_hasquota(&mp->m_sb) && - xfs_is_quota_inode(&mp->m_sb, ino))); -} - /* * Return stat information for one inode. * Return 0 if ok, else errno. @@ -119,12 +109,11 @@ xfs_bulkstat_one_int( switch (dic->di_format) { case XFS_DINODE_FMT_DEV: - buf->bs_rdev = ip->i_df.if_u2.if_rdev; + buf->bs_rdev = sysv_encode_dev(inode->i_rdev); buf->bs_blksize = BLKDEV_IOSIZE; buf->bs_blocks = 0; break; case XFS_DINODE_FMT_LOCAL: - case XFS_DINODE_FMT_UUID: buf->bs_rdev = 0; buf->bs_blksize = mp->m_sb.sb_blocksize; buf->bs_blocks = 0; diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 17e86e0541af..6ea8b3912fa4 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -96,6 +96,4 @@ xfs_inumbers( void __user *buffer, /* buffer with inode info */ inumbers_fmt_pf formatter); -int xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); - #endif /* __XFS_ITABLE_H__ */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index dcd1292664b3..6282bfc1afa9 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -142,6 +142,13 @@ typedef __u32 xfs_nlink_t; #define SYNCHRONIZE() barrier() #define __return_address __builtin_return_address(0) +/* + * Return the address of a label. Use barrier() so that the optimizer + * won't reorder code to refactor the error jumpouts into a single + * return, which throws off the reported address. + */ +#define __this_address ({ __label__ __here; __here: barrier(); &&__here; }) + #define XFS_PROJID_DEFAULT 0 #define MIN(a,b) (min(a,b)) @@ -243,10 +250,6 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) #define ASSERT(expr) \ (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) -#ifndef STATIC -# define STATIC noinline -#endif - #else /* !DEBUG */ #ifdef XFS_WARN @@ -254,21 +257,15 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) #define ASSERT(expr) \ (likely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__)) -#ifndef STATIC -# define STATIC static noinline -#endif - #else /* !DEBUG && !XFS_WARN */ #define ASSERT(expr) ((void)0) -#ifndef STATIC -# define STATIC static noinline -#endif - #endif /* XFS_WARN */ #endif /* DEBUG */ +#define STATIC static noinline + #ifdef CONFIG_XFS_RT /* diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c5107c7bc4bf..38d4227895ae 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -22,6 +22,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" @@ -608,6 +609,7 @@ xfs_log_mount( xfs_daddr_t blk_offset, int num_bblks) { + bool fatal = xfs_sb_version_hascrc(&mp->m_sb); int error = 0; int min_logfsbs; @@ -659,9 +661,20 @@ xfs_log_mount( XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), XFS_MAX_LOG_BYTES); error = -EINVAL; + } else if (mp->m_sb.sb_logsunit > 1 && + mp->m_sb.sb_logsunit % mp->m_sb.sb_blocksize) { + xfs_warn(mp, + "log stripe unit %u bytes must be a multiple of block size", + mp->m_sb.sb_logsunit); + error = -EINVAL; + fatal = true; } if (error) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + /* + * Log check errors are always fatal on v5; or whenever bad + * metadata leads to a crash. + */ + if (fatal) { xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!"); ASSERT(0); goto out_free_log; @@ -744,6 +757,7 @@ xfs_log_mount_finish( { int error = 0; bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY); + bool recovered = mp->m_log->l_flags & XLOG_RECOVERY_NEEDED; if (mp->m_flags & XFS_MOUNT_NORECOVERY) { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); @@ -780,6 +794,21 @@ xfs_log_mount_finish( mp->m_super->s_flags &= ~MS_ACTIVE; evict_inodes(mp->m_super); + /* + * Drain the buffer LRU after log recovery. This is required for v4 + * filesystems to avoid leaving around buffers with NULL verifier ops, + * but we do it unconditionally to make sure we're always in a clean + * cache state after mount. + * + * Don't push in the error case because the AIL may have pending intents + * that aren't removed until recovery is cancelled. + */ + if (!error && recovered) { + xfs_log_force(mp, XFS_LOG_SYNC); + xfs_ail_push_all_sync(mp->m_ail); + } + xfs_wait_buftarg(mp->m_ddev_targp); + if (readonly) mp->m_flags |= XFS_MOUNT_RDONLY; @@ -2515,7 +2544,7 @@ next_lv: if (lv) vecp = lv->lv_iovecp; } - if (record_cnt == 0 && ordered == false) { + if (record_cnt == 0 && !ordered) { if (!lv) return 0; break; @@ -3734,7 +3763,7 @@ xlog_ticket_alloc( * one of the iclogs. This uses backup pointers stored in a different * part of the log in case we trash the log structure. */ -void +STATIC void xlog_verify_dest_ptr( struct xlog *log, void *ptr) diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 51bf7b827387..129975970d99 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -592,9 +592,9 @@ xlog_valid_lsn( * a transiently forward state. Instead, we can see the LSN in a * transiently behind state if we happen to race with a cycle wrap. */ - cur_cycle = ACCESS_ONCE(log->l_curr_cycle); + cur_cycle = READ_ONCE(log->l_curr_cycle); smp_rmb(); - cur_block = ACCESS_ONCE(log->l_curr_block); + cur_block = READ_ONCE(log->l_curr_block); if ((CYCLE_LSN(lsn) > cur_cycle) || (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) { diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index ee34899396b2..87b1c331f9eb 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -85,17 +85,21 @@ struct xfs_buf_cancel { */ /* - * Verify the given count of basic blocks is valid number of blocks - * to specify for an operation involving the given XFS log buffer. - * Returns nonzero if the count is valid, 0 otherwise. + * Verify the log-relative block number and length in basic blocks are valid for + * an operation involving the given XFS log buffer. Returns true if the fields + * are valid, false otherwise. */ - -static inline int -xlog_buf_bbcount_valid( +static inline bool +xlog_verify_bp( struct xlog *log, + xfs_daddr_t blk_no, int bbcount) { - return bbcount > 0 && bbcount <= log->l_logBBsize; + if (blk_no < 0 || blk_no >= log->l_logBBsize) + return false; + if (bbcount <= 0 || (blk_no + bbcount) > log->l_logBBsize) + return false; + return true; } /* @@ -110,7 +114,11 @@ xlog_get_bp( { struct xfs_buf *bp; - if (!xlog_buf_bbcount_valid(log, nbblks)) { + /* + * Pass log block 0 since we don't have an addr yet, buffer will be + * verified on read. + */ + if (!xlog_verify_bp(log, 0, nbblks)) { xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", nbblks); XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); @@ -180,9 +188,10 @@ xlog_bread_noalign( { int error; - if (!xlog_buf_bbcount_valid(log, nbblks)) { - xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", - nbblks); + if (!xlog_verify_bp(log, blk_no, nbblks)) { + xfs_warn(log->l_mp, + "Invalid log block/length (0x%llx, 0x%x) for buffer", + blk_no, nbblks); XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return -EFSCORRUPTED; } @@ -265,9 +274,10 @@ xlog_bwrite( { int error; - if (!xlog_buf_bbcount_valid(log, nbblks)) { - xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", - nbblks); + if (!xlog_verify_bp(log, blk_no, nbblks)) { + xfs_warn(log->l_mp, + "Invalid log block/length (0x%llx, 0x%x) for buffer", + blk_no, nbblks); XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return -EFSCORRUPTED; } @@ -753,7 +763,7 @@ xlog_find_head( * in the in-core log. The following number can be made tighter if * we actually look at the block size of the filesystem. */ - num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); + num_scan_bblks = min_t(int, log_bbnum, XLOG_TOTAL_REC_SHIFT(log)); if (head_blk >= num_scan_bblks) { /* * We are guaranteed that the entire check can be performed @@ -2975,7 +2985,7 @@ xlog_recover_inode_pass2( struct xlog_recover_item *item, xfs_lsn_t current_lsn) { - xfs_inode_log_format_t *in_f; + struct xfs_inode_log_format *in_f; xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; xfs_dinode_t *dip; @@ -2989,10 +2999,10 @@ xlog_recover_inode_pass2( uint isize; int need_free = 0; - if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { + if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { in_f = item->ri_buf[0].i_addr; } else { - in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP); + in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), KM_SLEEP); need_free = 1; error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); if (error) @@ -3163,16 +3173,8 @@ xlog_recover_inode_pass2( } fields = in_f->ilf_fields; - switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { - case XFS_ILOG_DEV: + if (fields & XFS_ILOG_DEV) xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); - break; - case XFS_ILOG_UUID: - memcpy(XFS_DFORK_DPTR(dip), - &in_f->ilf_u.ilfu_uuid, - sizeof(uuid_t)); - break; - } if (in_f->ilf_size == 2) goto out_owner_change; @@ -4297,7 +4299,7 @@ xlog_recover_add_to_trans( char *dp, int len) { - xfs_inode_log_format_t *in_f; /* any will do */ + struct xfs_inode_log_format *in_f; /* any will do */ xlog_recover_item_t *item; char *ptr; @@ -4331,7 +4333,7 @@ xlog_recover_add_to_trans( ptr = kmem_alloc(len, KM_SLEEP); memcpy(ptr, dp, len); - in_f = (xfs_inode_log_format_t *)ptr; + in_f = (struct xfs_inode_log_format *)ptr; /* take the tail entry */ item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); @@ -5823,7 +5825,7 @@ xlog_recover_cancel( * Read all of the agf and agi counters and check that they * are consistent with the superblock counters. */ -void +STATIC void xlog_recover_check_summary( struct xlog *log) { diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 85401155750e..34447dca97d1 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __XFS_MESSAGE_H #define __XFS_MESSAGE_H 1 diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index ea7d4b4e50d0..c879b517cc94 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -704,7 +704,7 @@ xfs_mountfs( xfs_set_maxicount(mp); /* enable fail_at_unmount as default */ - mp->m_fail_unmount = 1; + mp->m_fail_unmount = true; error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname); if (error) @@ -1022,10 +1022,21 @@ xfs_mountfs( xfs_rtunmount_inodes(mp); out_rele_rip: IRELE(rip); - cancel_delayed_work_sync(&mp->m_reclaim_work); - xfs_reclaim_inodes(mp, SYNC_WAIT); /* Clean out dquots that might be in memory after quotacheck. */ xfs_qm_unmount(mp); + /* + * Cancel all delayed reclaim work and reclaim the inodes directly. + * We have to do this /after/ rtunmount and qm_unmount because those + * two will have scheduled delayed reclaim for the rt/quota inodes. + * + * This is slightly different from the unmountfs call sequence + * because we could be tearing down a partially set up mount. In + * particular, if log_mount_finish fails we bail out without calling + * qm_unmount_quotas and therefore rely on qm_unmount to release the + * quota inodes. + */ + cancel_delayed_work_sync(&mp->m_reclaim_work); + xfs_reclaim_inodes(mp, SYNC_WAIT); out_log_dealloc: mp->m_flags |= XFS_MOUNT_UNMOUNTING; xfs_log_mount_cancel(mp); diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 0c381d71b242..0492436a053f 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -134,7 +134,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28); XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52); - XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_64, 56); + XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); } diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 4246876df7b7..aa6c5c193f45 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014 Christoph Hellwig. */ diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h index b587cb99b2b7..bf45951e28fe 100644 --- a/fs/xfs/xfs_pnfs.h +++ b/fs/xfs/xfs_pnfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _XFS_PNFS_H #define _XFS_PNFS_H 1 diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 37e603bf1591..cc041a29eb70 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -273,7 +273,7 @@ xfs_reflink_reserve_cow( struct xfs_bmbt_irec got; int error = 0; bool eof = false, trimmed; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; /* * Search the COW fork extent list first. This serves two purposes: @@ -284,7 +284,7 @@ xfs_reflink_reserve_cow( * tree. */ - if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &idx, &got)) + if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got)) eof = true; if (!eof && got.br_startoff <= imap->br_startoff) { trace_xfs_reflink_cow_found(ip, imap); @@ -312,7 +312,7 @@ xfs_reflink_reserve_cow( return error; error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, - imap->br_blockcount, 0, &got, &idx, eof); + imap->br_blockcount, 0, &got, &icur, eof); if (error == -ENOSPC || error == -EDQUOT) trace_xfs_reflink_cow_enospc(ip, imap); if (error) @@ -353,29 +353,22 @@ xfs_reflink_convert_cow( xfs_off_t offset, xfs_off_t count) { - struct xfs_bmbt_irec got; - struct xfs_defer_ops dfops; struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); - xfs_extnum_t idx; - bool found; - int error = 0; + xfs_filblks_t count_fsb = end_fsb - offset_fsb; + struct xfs_bmbt_irec imap; + struct xfs_defer_ops dfops; + xfs_fsblock_t first_block = NULLFSBLOCK; + int nimaps = 1, error = 0; - xfs_ilock(ip, XFS_ILOCK_EXCL); + ASSERT(count != 0); - /* Convert all the extents to real from unwritten. */ - for (found = xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got); - found && got.br_startoff < end_fsb; - found = xfs_iext_get_extent(ifp, ++idx, &got)) { - error = xfs_reflink_convert_cow_extent(ip, &got, offset_fsb, - end_fsb - offset_fsb, &dfops); - if (error) - break; - } - - /* Finish up. */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb, + XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT | + XFS_BMAPI_CONVERT_ONLY, &first_block, 0, &imap, &nimaps, + &dfops); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -399,7 +392,7 @@ xfs_reflink_allocate_cow( bool trimmed; xfs_filblks_t resaligned; xfs_extlen_t resblks = 0; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; retry: ASSERT(xfs_is_reflink_inode(ip)); @@ -409,7 +402,7 @@ retry: * Even if the extent is not shared we might have a preallocation for * it in the COW fork. If so use it. */ - if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &idx, &got) && + if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) && got.br_startoff <= offset_fsb) { *shared = true; @@ -496,13 +489,13 @@ xfs_reflink_find_cow_mapping( struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); xfs_fileoff_t offset_fsb; struct xfs_bmbt_irec got; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); ASSERT(xfs_is_reflink_inode(ip)); offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); - if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got)) return false; if (got.br_startoff > offset_fsb) return false; @@ -524,18 +517,18 @@ xfs_reflink_trim_irec_to_next_cow( { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec got; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; if (!xfs_is_reflink_inode(ip)) return; /* Find the extent in the CoW fork. */ - if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got)) return; /* This is the extent before; try sliding up one. */ if (got.br_startoff < offset_fsb) { - if (!xfs_iext_get_extent(ifp, idx + 1, &got)) + if (!xfs_iext_next_extent(ifp, &icur, &got)) return; } @@ -562,24 +555,32 @@ xfs_reflink_cancel_cow_blocks( { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec got, del; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; xfs_fsblock_t firstfsb; struct xfs_defer_ops dfops; int error = 0; if (!xfs_is_reflink_inode(ip)) return 0; - if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) + if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) return 0; - while (got.br_startoff < end_fsb) { + /* Walk backwards until we're out of the I/O range... */ + while (got.br_startoff + got.br_blockcount > offset_fsb) { del = got; xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); + + /* Extent delete may have bumped ext forward */ + if (!del.br_blockcount) { + xfs_iext_prev(ifp, &icur); + goto next_extent; + } + trace_xfs_reflink_cancel_cow(ip, &del); if (isnullstartblock(del.br_startblock)) { error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, - &idx, &got, &del); + &icur, &got, &del); if (error) break; } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { @@ -610,10 +611,10 @@ xfs_reflink_cancel_cow_blocks( } /* Remove the mapping from the CoW fork. */ - xfs_bmap_del_extent_cow(ip, &idx, &got, &del); + xfs_bmap_del_extent_cow(ip, &icur, &got, &del); } - - if (!xfs_iext_get_extent(ifp, ++idx, &got)) +next_extent: + if (!xfs_iext_get_extent(ifp, &icur, &got)) break; } @@ -698,7 +699,7 @@ xfs_reflink_end_cow( int error; unsigned int resblks; xfs_filblks_t rlen; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; trace_xfs_reflink_end_cow(ip, offset, count); @@ -733,27 +734,22 @@ xfs_reflink_end_cow( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - /* If there is a hole at end_fsb - 1 go to the previous extent */ - if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) || - got.br_startoff > end_fsb) { - /* - * In case of racing, overlapping AIO writes no COW extents - * might be left by the time I/O completes for the loser of - * the race. In that case we are done. - */ - if (idx <= 0) - goto out_cancel; - xfs_iext_get_extent(ifp, --idx, &got); - } + /* + * In case of racing, overlapping AIO writes no COW extents might be + * left by the time I/O completes for the loser of the race. In that + * case we are done. + */ + if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) + goto out_cancel; /* Walk backwards until we're out of the I/O range... */ while (got.br_startoff + got.br_blockcount > offset_fsb) { del = got; xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); - /* Extent delete may have bumped idx forward */ + /* Extent delete may have bumped ext forward */ if (!del.br_blockcount) { - idx--; + xfs_iext_prev(ifp, &icur); goto next_extent; } @@ -765,7 +761,7 @@ xfs_reflink_end_cow( * allocated but have not yet been involved in a write. */ if (got.br_state == XFS_EXT_UNWRITTEN) { - idx--; + xfs_iext_prev(ifp, &icur); goto next_extent; } @@ -796,14 +792,14 @@ xfs_reflink_end_cow( goto out_defer; /* Remove the mapping from the CoW fork. */ - xfs_bmap_del_extent_cow(ip, &idx, &got, &del); + xfs_bmap_del_extent_cow(ip, &icur, &got, &del); xfs_defer_ijoin(&dfops, ip); error = xfs_defer_finish(&tp, &dfops); if (error) goto out_defer; next_extent: - if (!xfs_iext_get_extent(ifp, idx, &got)) + if (!xfs_iext_get_extent(ifp, &icur, &got)) break; } @@ -1433,7 +1429,7 @@ xfs_reflink_inode_has_shared_extents( xfs_extlen_t aglen; xfs_agblock_t rbno; xfs_extlen_t rlen; - xfs_extnum_t idx; + struct xfs_iext_cursor icur; bool found; int error; @@ -1445,7 +1441,7 @@ xfs_reflink_inode_has_shared_extents( } *has_shared = false; - found = xfs_iext_lookup_extent(ip, ifp, 0, &idx, &got); + found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got); while (found) { if (isnullstartblock(got.br_startblock) || got.br_state != XFS_EXT_NORM) @@ -1464,7 +1460,7 @@ xfs_reflink_inode_has_shared_extents( return 0; } next: - found = xfs_iext_get_extent(ifp, ++idx, &got); + found = xfs_iext_next_extent(ifp, &icur, &got); } return 0; diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 79defa722bf1..3f30f846d7f2 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -138,6 +138,7 @@ int xfs_rtalloc_query_range(struct xfs_trans *tp, int xfs_rtalloc_query_all(struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv); +bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); #else # define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS) # define xfs_rtfree_extent(t,b,l) (ENOSYS) @@ -146,6 +147,7 @@ int xfs_rtalloc_query_all(struct xfs_trans *tp, # define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS) # define xfs_rtalloc_query_all(t,f,p) (ENOSYS) # define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS) +# define xfs_verify_rtbno(m, r) (false) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 584cf2d573ba..f663022353c0 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1637,7 +1637,7 @@ xfs_fs_fill_super( /* version 5 superblocks support inode version counters. */ if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) - sb->s_flags |= MS_I_VERSION; + sb->s_flags |= SB_I_VERSION; if (mp->m_flags & XFS_MOUNT_DAX) { xfs_warn(mp, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index bb5514688d47..515ba042d75c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -218,53 +218,15 @@ TRACE_EVENT(xfs_attr_list_node_descend, __entry->bt_before) ); -TRACE_EVENT(xfs_iext_insert, - TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, - struct xfs_bmbt_irec *r, int state, unsigned long caller_ip), - TP_ARGS(ip, idx, r, state, caller_ip), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_extnum_t, idx) - __field(xfs_fileoff_t, startoff) - __field(xfs_fsblock_t, startblock) - __field(xfs_filblks_t, blockcount) - __field(xfs_exntst_t, state) - __field(int, bmap_state) - __field(unsigned long, caller_ip) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->idx = idx; - __entry->startoff = r->br_startoff; - __entry->startblock = r->br_startblock; - __entry->blockcount = r->br_blockcount; - __entry->state = r->br_state; - __entry->bmap_state = state; - __entry->caller_ip = caller_ip; - ), - TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " - "offset %lld block %lld count %lld flag %d caller %ps", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), - (long)__entry->idx, - __entry->startoff, - (int64_t)__entry->startblock, - __entry->blockcount, - __entry->state, - (char *)__entry->caller_ip) -); - DECLARE_EVENT_CLASS(xfs_bmap_class, - TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, + TP_PROTO(struct xfs_inode *ip, struct xfs_iext_cursor *cur, int state, unsigned long caller_ip), - TP_ARGS(ip, idx, state, caller_ip), + TP_ARGS(ip, cur, state, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(xfs_extnum_t, idx) + __field(void *, leaf); + __field(int, pos); __field(xfs_fileoff_t, startoff) __field(xfs_fsblock_t, startblock) __field(xfs_filblks_t, blockcount) @@ -277,10 +239,11 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, struct xfs_bmbt_irec r; ifp = xfs_iext_state_to_fork(ip, state); - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r); + xfs_iext_get_extent(ifp, cur, &r); __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->idx = idx; + __entry->leaf = cur->leaf; + __entry->pos = cur->pos; __entry->startoff = r.br_startoff; __entry->startblock = r.br_startblock; __entry->blockcount = r.br_blockcount; @@ -288,12 +251,13 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, __entry->bmap_state = state; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " + TP_printk("dev %d:%d ino 0x%llx state %s cur 0x%p/%d " "offset %lld block %lld count %lld flag %d caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), - (long)__entry->idx, + __entry->leaf, + __entry->pos, __entry->startoff, (int64_t)__entry->startblock, __entry->blockcount, @@ -303,13 +267,15 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, #define DEFINE_BMAP_EVENT(name) \ DEFINE_EVENT(xfs_bmap_class, name, \ - TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \ + TP_PROTO(struct xfs_inode *ip, struct xfs_iext_cursor *cur, int state, \ unsigned long caller_ip), \ - TP_ARGS(ip, idx, state, caller_ip)) + TP_ARGS(ip, cur, state, caller_ip)) +DEFINE_BMAP_EVENT(xfs_iext_insert); DEFINE_BMAP_EVENT(xfs_iext_remove); DEFINE_BMAP_EVENT(xfs_bmap_pre_update); DEFINE_BMAP_EVENT(xfs_bmap_post_update); -DEFINE_BMAP_EVENT(xfs_extlist); +DEFINE_BMAP_EVENT(xfs_read_extent); +DEFINE_BMAP_EVENT(xfs_write_extent); DECLARE_EVENT_CLASS(xfs_buf_class, TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 354368a906e5..cef89f7127d3 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -25,6 +25,7 @@ #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_trace.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_log.h" @@ -514,11 +515,26 @@ xfsaild( current->flags |= PF_MEMALLOC; set_freezable(); - while (!kthread_should_stop()) { + while (1) { if (tout && tout <= 20) - __set_current_state(TASK_KILLABLE); + set_current_state(TASK_KILLABLE); else - __set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE); + + /* + * Check kthread_should_stop() after we set the task state + * to guarantee that we either see the stop bit and exit or + * the task state is reset to runnable such that it's not + * scheduled out indefinitely and detects the stop bit at + * next iteration. + * + * A memory barrier is included in above task state set to + * serialize again kthread_stop(). + */ + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + break; + } spin_lock(&ailp->xa_lock); |