author     Linus Torvalds <torvalds@linux-foundation.org>  2024-01-10 17:45:22 +0100
committer  Linus Torvalds <torvalds@linux-foundation.org>  2024-01-10 17:45:22 +0100
commit     12958e9c4c8e93ef694c10960c78453edf21526e (patch)
tree       80e4a3a945358d9a546c3f5cda25bc5ceadcdf7c
parent     Merge tag 'fsnotify_for_v6.8-rc1' of git://git.kernel.org/pub/scm/linux/kerne... (diff)
parent     xfs: use the op name in trace_xlog_intent_recovery_failed (diff)

Merge tag 'xfs-6.8-merge-3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
Pull xfs updates from Chandan Babu:
"New features/functionality:

- Online repair:
    - Reserve disk space for online repairs
    - Fix a mis-interaction between the AIL and the btree bulkloader that
      caused a bulk load to fail to queue a buffer for writeback if that
      buffer happened to be on the AIL list
    - Prevent transaction reservation overflows when reaping blocks
      during online repair
    - Whenever possible, the bulkloader now copies multiple records into
      a block
    - Support repairing of
        1. Per-AG free space, inode and refcount btrees
        2. Ondisk inodes
        3. File data and attribute fork mappings
    - Verify the contents of
        1. Inode and data fork of the realtime bitmap file
        2. Quota files
- Introduce MF_MEM_PRE_REMOVE. This will be used to notify tasks about
  a pmem device being removed

Bug fixes:
- Fix memory leak of recovered attri intent items
- Fix UAF during log intent recovery
- Fix realtime geometry integer overflows
- Prevent scrub from livelocking in xchk_iget
- Prevent fs shutdown when removing files during low free disk space
- Prevent transaction reservation overflow when extending an RT
device
- Prevent incorrect warning from being printed when extending a
filesystem
- Fix an off-by-one error in xreap_agextent_binval
- Serialize access to perag radix tree during deletion operation
- Fix perag memory leak during growfs
- Allow allocation of a minlen realtime extent when the maximum-sized
  realtime free extent is minlen in size

Cleanups:
- Remove duplicate boilerplate code spread across functionality
associated with different log items
- Cleanup resblks interfaces
- Pass defer ops pointer to defer helpers instead of an enum
- Initialize di_crc in xfs_log_dinode to prevent KMSAN warnings
- Use static_assert() instead of BUILD_BUG_ON_MSG() to validate the
  size of structures and structure member offsets, so that the code can
  be shared with userspace
- Move XFS documentation under a new directory specific to XFS
- Do not invoke deferred ops' ->create_done callback if the deferred
operation does not have an intent item associated with it
- Remove duplicate inclusion of header files from scrub/health.c
- Refactor Realtime code
- Cleanup attr code"
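
The "Pass defer ops pointer to defer helpers instead of an enum" cleanup above changes the xfs_defer_add() calling convention. The fragment below is lifted from the xfs_defer_agfl_block()/xfs_defer_extent_free() hunks in the diff and is illustrative only, not a standalone compilable unit:

    /* Before: the deferred work type was selected by an enum value. */
    xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);

    /*
     * After: callers pass the work item plus a pointer to the defer ops,
     * and xfs_defer_add() returns the pending item it queued; the new
     * autoreap helpers use that pointer to pause/unpause the work.
     */
    struct xfs_defer_pending *dfp;

    dfp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type);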
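
The BUILD_BUG_ON_MSG() to static_assert() conversion matters because static_assert() is standard C11 and therefore also compiles in userspace, where xfsprogs shares the libxfs sources. A minimal sketch with a made-up structure (example_rec is hypothetical; only the style of the two checks is the point):

    struct example_rec {
            __be32  magic;
            __be16  len;
            __be16  flags;
    };

    /* Before: kernel-only macro; the build breaks when the condition is true. */
    BUILD_BUG_ON_MSG(sizeof(struct example_rec) != 8,
                     "example_rec must be 8 bytes");

    /* After: standard C11; asserts that the condition holds. */
    static_assert(sizeof(struct example_rec) == 8,
                  "example_rec must be 8 bytes");
    static_assert(offsetof(struct example_rec, len) == 4,
                  "len must sit at byte offset 4");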
* tag 'xfs-6.8-merge-3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (123 commits)
xfs: use the op name in trace_xlog_intent_recovery_failed
xfs: fix a use after free in xfs_defer_finish_recovery
xfs: turn the XFS_DA_OP_REPLACE checks in xfs_attr_shortform_addname into asserts
xfs: remove xfs_attr_sf_hdr_t
xfs: remove struct xfs_attr_shortform
xfs: use xfs_attr_sf_findname in xfs_attr_shortform_getvalue
xfs: remove xfs_attr_shortform_lookup
xfs: simplify xfs_attr_sf_findname
xfs: move the xfs_attr_sf_lookup tracepoint
xfs: return if_data from xfs_idata_realloc
xfs: make if_data a void pointer
xfs: fold xfs_rtallocate_extent into xfs_bmap_rtalloc
xfs: simplify and optimize the RT allocation fallback cascade
xfs: reorder the minlen and prod calculations in xfs_bmap_rtalloc
xfs: remove XFS_RTMIN/XFS_RTMAX
xfs: remove rt-wrappers from xfs_format.h
xfs: factor out a xfs_rtalloc_sumlevel helper
xfs: tidy up xfs_rtallocate_extent_exact
xfs: merge the calls to xfs_rtallocate_range in xfs_rtallocate_block
xfs: reflow the tail end of xfs_rtallocate_extent_block
...
146 files changed, 12727 insertions, 2943 deletions
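
Several online-repair hunks below add "autoreap" helpers to fs/xfs/libxfs/xfs_alloc.c (xfs_alloc_schedule_autoreap(), xfs_alloc_cancel_autoreap(), xfs_alloc_commit_autoreap()); the comments added in that file describe the protocol. The sketch below only illustrates the intended call pattern; the wrapper function and the used_space flag are invented for the example:

    /*
     * Sketch only: how a repair-style caller might drive the autoreap
     * helpers.  Everything except the xfs_alloc_*_autoreap() calls is
     * hypothetical.
     */
    static int
    example_autoreap_flow(
            struct xfs_trans        *tp,
            struct xfs_alloc_arg    *args,
            bool                    used_space)
    {
            struct xfs_alloc_autoreap       aarp = { };
            int                             error;

            /*
             * Attach a paused EFI for the freshly allocated extent.  If the
             * system crashes once the EFI is in the log, recovery finds no
             * matching EFD and frees the space automatically.
             */
            error = xfs_alloc_schedule_autoreap(args, false, &aarp);
            if (error)
                    return error;

            if (used_space) {
                    /* New metadata now owns the blocks: keep them mapped. */
                    xfs_alloc_cancel_autoreap(tp, &aarp);
            } else {
                    /* Nothing used the reservation: unpause and free it. */
                    xfs_alloc_commit_autoreap(tp, &aarp);
            }
            return 0;
    }
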
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 09cade7eaefc..e18bc5ae3b35 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -121,8 +121,5 @@ Documentation for filesystem implementations. udf virtiofs vfat - xfs-delayed-logging-design - xfs-maintainer-entry-profile - xfs-self-describing-metadata - xfs-online-fsck-design + xfs/index zonefs diff --git a/Documentation/filesystems/xfs/index.rst b/Documentation/filesystems/xfs/index.rst new file mode 100644 index 000000000000..ab66c57a5d18 --- /dev/null +++ b/Documentation/filesystems/xfs/index.rst @@ -0,0 +1,14 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================ +XFS Filesystem Documentation +============================ + +.. toctree:: + :maxdepth: 2 + :numbered: + + xfs-delayed-logging-design + xfs-maintainer-entry-profile + xfs-self-describing-metadata + xfs-online-fsck-design diff --git a/Documentation/filesystems/xfs-delayed-logging-design.rst b/Documentation/filesystems/xfs/xfs-delayed-logging-design.rst index 6402ab8e370c..6402ab8e370c 100644 --- a/Documentation/filesystems/xfs-delayed-logging-design.rst +++ b/Documentation/filesystems/xfs/xfs-delayed-logging-design.rst diff --git a/Documentation/filesystems/xfs-maintainer-entry-profile.rst b/Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst index 32b6ac4ca9d6..32b6ac4ca9d6 100644 --- a/Documentation/filesystems/xfs-maintainer-entry-profile.rst +++ b/Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst diff --git a/Documentation/filesystems/xfs-online-fsck-design.rst b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst index a0678101a7d0..352516feef6f 100644 --- a/Documentation/filesystems/xfs-online-fsck-design.rst +++ b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst @@ -962,7 +962,7 @@ disk, but these buffer verifiers cannot provide any consistency checking between metadata structures. For more information, please see the documentation for -Documentation/filesystems/xfs-self-describing-metadata.rst +Documentation/filesystems/xfs/xfs-self-describing-metadata.rst Reverse Mapping --------------- diff --git a/Documentation/filesystems/xfs-self-describing-metadata.rst b/Documentation/filesystems/xfs/xfs-self-describing-metadata.rst index a10c4ae6955e..a10c4ae6955e 100644 --- a/Documentation/filesystems/xfs-self-describing-metadata.rst +++ b/Documentation/filesystems/xfs/xfs-self-describing-metadata.rst diff --git a/Documentation/maintainer/maintainer-entry-profile.rst b/Documentation/maintainer/maintainer-entry-profile.rst index 7ad4bfc2cc03..18cee1edaecb 100644 --- a/Documentation/maintainer/maintainer-entry-profile.rst +++ b/Documentation/maintainer/maintainer-entry-profile.rst @@ -105,4 +105,4 @@ to do something different in the near future. 
../driver-api/media/maintainer-entry-profile ../driver-api/vfio-pci-device-specific-driver-acceptance ../nvme/feature-and-quirk-policy - ../filesystems/xfs-maintainer-entry-profile + ../filesystems/xfs/xfs-maintainer-entry-profile diff --git a/MAINTAINERS b/MAINTAINERS index 40c754b4c39c..83240dcd4065 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23846,10 +23846,10 @@ S: Supported W: http://xfs.org/ C: irc://irc.oftc.net/xfs T: git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git -P: Documentation/filesystems/xfs-maintainer-entry-profile.rst +P: Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst F: Documentation/ABI/testing/sysfs-fs-xfs F: Documentation/admin-guide/xfs.rst -F: Documentation/filesystems/xfs-* +F: Documentation/filesystems/xfs/* F: fs/xfs/ F: include/uapi/linux/dqblk_xfs.h F: include/uapi/linux/fsmap.h diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 0da9232ea175..f4b635526345 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -326,7 +326,8 @@ void kill_dax(struct dax_device *dax_dev) return; if (dax_dev->holder_data != NULL) - dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0); + dax_holder_notify_failure(dax_dev, 0, U64_MAX, + MF_MEM_PRE_REMOVE); clear_bit(DAXDEV_ALIVE, &dax_dev->flags); synchronize_srcu(&dax_srcu); diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 7762c01a85cf..fbe3cdc79036 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -145,6 +145,7 @@ ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) xfs-y += $(addprefix scrub/, \ trace.o \ + agb_bitmap.o \ agheader.o \ alloc.o \ attr.o \ @@ -175,14 +176,32 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ rtsummary.o \ ) -xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o +xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \ + dqiterate.o \ + quota.o \ + ) # online repair ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ + alloc_repair.o \ + bmap_repair.o \ + cow_repair.o \ + ialloc_repair.o \ + inode_repair.o \ + newbt.o \ reap.o \ + refcount_repair.o \ repair.o \ ) + +xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ + rtbitmap_repair.o \ + ) + +xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \ + quota_repair.o \ + ) endif endif diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index f9f4d694640d..39d9525270b7 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -332,6 +332,31 @@ xfs_agino_range( return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last); } +/* + * Free perag within the specified AG range, it is only used to free unused + * perags under the error handling path. 
+ */ +void +xfs_free_unused_perag_range( + struct xfs_mount *mp, + xfs_agnumber_t agstart, + xfs_agnumber_t agend) +{ + struct xfs_perag *pag; + xfs_agnumber_t index; + + for (index = agstart; index < agend; index++) { + spin_lock(&mp->m_perag_lock); + pag = radix_tree_delete(&mp->m_perag_tree, index); + spin_unlock(&mp->m_perag_lock); + if (!pag) + break; + xfs_buf_hash_destroy(pag); + xfs_defer_drain_free(&pag->pag_intents_drain); + kmem_free(pag); + } +} + int xfs_initialize_perag( struct xfs_mount *mp, @@ -424,19 +449,14 @@ xfs_initialize_perag( out_remove_pag: xfs_defer_drain_free(&pag->pag_intents_drain); + spin_lock(&mp->m_perag_lock); radix_tree_delete(&mp->m_perag_tree, index); + spin_unlock(&mp->m_perag_lock); out_free_pag: kmem_free(pag); out_unwind_new_pags: /* unwind any prior newly initialized pags */ - for (index = first_initialised; index < agcount; index++) { - pag = radix_tree_delete(&mp->m_perag_tree, index); - if (!pag) - break; - xfs_buf_hash_destroy(pag); - xfs_defer_drain_free(&pag->pag_intents_drain); - kmem_free(pag); - } + xfs_free_unused_perag_range(mp, first_initialised, agcount); return error; } @@ -984,7 +1004,7 @@ xfs_ag_shrink_space( if (err2 != -ENOSPC) goto resv_err; - err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, + err2 = xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, XFS_AG_RESV_NONE, true); if (err2) goto resv_err; diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 2e0aef87d633..4b343c4fac28 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -80,6 +80,16 @@ struct xfs_perag { */ uint16_t pag_checked; uint16_t pag_sick; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Alternate btree heights so that online repair won't trip the write + * verifiers while rebuilding the AG btrees. + */ + uint8_t pagf_repair_levels[XFS_BTNUM_AGF]; + uint8_t pagf_repair_refcount_level; +#endif + spinlock_t pag_state_lock; spinlock_t pagb_lock; /* lock for pagb_tree */ @@ -133,6 +143,8 @@ __XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA) __XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES) __XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET) +void xfs_free_unused_perag_range(struct xfs_mount *mp, xfs_agnumber_t agstart, + xfs_agnumber_t agend); int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount, xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi); int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 7fd1fea95552..da1057bd0e60 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -411,6 +411,8 @@ xfs_ag_resv_free_extent( fallthrough; case XFS_AG_RESV_NONE: xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len); + fallthrough; + case XFS_AG_RESV_IGNORE: return; } diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 100ab5931b31..3bd0a33fee0a 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -246,11 +246,9 @@ xfs_alloc_btrec_to_irec( /* Simple checks for free space records. 
*/ xfs_failaddr_t xfs_alloc_check_irec( - struct xfs_btree_cur *cur, - const struct xfs_alloc_rec_incore *irec) + struct xfs_perag *pag, + const struct xfs_alloc_rec_incore *irec) { - struct xfs_perag *pag = cur->bc_ag.pag; - if (irec->ar_blockcount == 0) return __this_address; @@ -299,7 +297,7 @@ xfs_alloc_get_rec( return error; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur, &irec); + fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); @@ -2514,7 +2512,7 @@ xfs_defer_agfl_block( trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); xfs_extent_free_get_group(mp, xefi); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); + xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type); return 0; } @@ -2522,14 +2520,15 @@ xfs_defer_agfl_block( * Add the extent to the list of extents to be free at transaction end. * The list is maintained sorted (by block number). */ -int -__xfs_free_extent_later( +static int +xfs_defer_extent_free( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, - bool skip_discard) + bool skip_discard, + struct xfs_defer_pending **dfpp) { struct xfs_extent_free_item *xefi; struct xfs_mount *mp = tp->t_mountp; @@ -2577,10 +2576,105 @@ __xfs_free_extent_later( XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); xfs_extent_free_get_group(mp, xefi); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); + *dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type); return 0; } +int +xfs_free_extent_later( + struct xfs_trans *tp, + xfs_fsblock_t bno, + xfs_filblks_t len, + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type, + bool skip_discard) +{ + struct xfs_defer_pending *dontcare = NULL; + + return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard, + &dontcare); +} + +/* + * Set up automatic freeing of unwritten space in the filesystem. + * + * This function attached a paused deferred extent free item to the + * transaction. Pausing means that the EFI will be logged in the next + * transaction commit, but the pending EFI will not be finished until the + * pending item is unpaused. + * + * If the system goes down after the EFI has been persisted to the log but + * before the pending item is unpaused, log recovery will find the EFI, fail to + * find the EFD, and free the space. + * + * If the pending item is unpaused, the next transaction commit will log an EFD + * without freeing the space. + * + * Caller must ensure that the tp, fsbno, len, oinfo, and resv flags of the + * @args structure are set to the relevant values. + */ +int +xfs_alloc_schedule_autoreap( + const struct xfs_alloc_arg *args, + bool skip_discard, + struct xfs_alloc_autoreap *aarp) +{ + int error; + + error = xfs_defer_extent_free(args->tp, args->fsbno, args->len, + &args->oinfo, args->resv, skip_discard, &aarp->dfp); + if (error) + return error; + + xfs_defer_item_pause(args->tp, aarp->dfp); + return 0; +} + +/* + * Cancel automatic freeing of unwritten space in the filesystem. + * + * Earlier, we created a paused deferred extent free item and attached it to + * this transaction so that we could automatically roll back a new space + * allocation if the system went down. Now we want to cancel the paused work + * item by marking the EFI stale so we don't actually free the space, unpausing + * the pending item and logging an EFD. 
+ * + * The caller generally should have already mapped the space into the ondisk + * filesystem. If the reserved space was partially used, the caller must call + * xfs_free_extent_later to create a new EFI to free the unused space. + */ +void +xfs_alloc_cancel_autoreap( + struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp) +{ + struct xfs_defer_pending *dfp = aarp->dfp; + struct xfs_extent_free_item *xefi; + + if (!dfp) + return; + + list_for_each_entry(xefi, &dfp->dfp_work, xefi_list) + xefi->xefi_flags |= XFS_EFI_CANCELLED; + + xfs_defer_item_unpause(tp, dfp); +} + +/* + * Commit automatic freeing of unwritten space in the filesystem. + * + * This unpauses an earlier _schedule_autoreap and commits to freeing the + * allocated space. Call this if none of the reserved space was used. + */ +void +xfs_alloc_commit_autoreap( + struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp) +{ + if (aarp->dfp) + xfs_defer_item_unpause(tp, aarp->dfp); +} + #ifdef DEBUG /* * Check if an AGF has a free extent record whose length is equal to @@ -3848,7 +3942,7 @@ xfs_alloc_query_range_helper( xfs_failaddr_t fa; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur, &irec); + fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 6bb8d295c321..0b956f8b9d5a 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -185,7 +185,7 @@ xfs_alloc_get_rec( union xfs_btree_rec; void xfs_alloc_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_alloc_rec_incore *irec); -xfs_failaddr_t xfs_alloc_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_alloc_check_irec(struct xfs_perag *pag, const struct xfs_alloc_rec_incore *irec); int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, @@ -231,7 +231,7 @@ xfs_buf_to_agfl_bno( return bp->b_addr; } -int __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, +int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, bool skip_discard); @@ -255,18 +255,18 @@ void xfs_extent_free_get_group(struct xfs_mount *mp, #define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */ #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ +#define XFS_EFI_CANCELLED (1U << 3) /* dont actually free the space */ -static inline int -xfs_free_extent_later( - struct xfs_trans *tp, - xfs_fsblock_t bno, - xfs_filblks_t len, - const struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type type) -{ - return __xfs_free_extent_later(tp, bno, len, oinfo, type, false); -} +struct xfs_alloc_autoreap { + struct xfs_defer_pending *dfp; +}; +int xfs_alloc_schedule_autoreap(const struct xfs_alloc_arg *args, + bool skip_discard, struct xfs_alloc_autoreap *aarp); +void xfs_alloc_cancel_autoreap(struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp); +void xfs_alloc_commit_autoreap(struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp); extern struct kmem_cache *xfs_extfree_item_cache; diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index c65228efed4a..a7032bf0cd37 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -323,7 +323,18 @@ xfs_allocbt_verify( if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) btnum = XFS_BTNUM_CNTi; if (pag && xfs_perag_initialised_agf(pag)) { 
- if (level >= pag->pagf_levels[btnum]) + unsigned int maxlevel = pag->pagf_levels[btnum]; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Online repair could be rewriting the free space btrees, so + * we'll validate against the larger of either tree while this + * is going on. + */ + maxlevel = max_t(unsigned int, maxlevel, + pag->pagf_repair_levels[btnum]); +#endif + if (level >= maxlevel) return __this_address; } else if (level >= mp->m_alloc_maxlevels) return __this_address; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index e28d93d232de..9976a00a73f9 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -862,8 +862,11 @@ xfs_attr_lookup( if (!xfs_inode_hasattr(dp)) return -ENOATTR; - if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) - return xfs_attr_sf_findname(args, NULL, NULL); + if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) { + if (xfs_attr_sf_findname(args)) + return -EEXIST; + return -ENOATTR; + } if (xfs_attr_is_leaf(dp)) { error = xfs_attr_leaf_hasname(args, &bp); @@ -880,11 +883,10 @@ xfs_attr_lookup( return error; } -static int -xfs_attr_intent_init( +static void +xfs_attr_defer_add( struct xfs_da_args *args, - unsigned int op_flags, /* op flag (set or remove) */ - struct xfs_attr_intent **attr) /* new xfs_attr_intent */ + unsigned int op_flags) { struct xfs_attr_intent *new; @@ -893,66 +895,22 @@ xfs_attr_intent_init( new->xattri_op_flags = op_flags; new->xattri_da_args = args; - *attr = new; - return 0; -} - -/* Sets an attribute for an inode as a deferred operation */ -static int -xfs_attr_defer_add( - struct xfs_da_args *args) -{ - struct xfs_attr_intent *new; - int error = 0; - - error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new); - if (error) - return error; + switch (op_flags) { + case XFS_ATTRI_OP_FLAGS_SET: + new->xattri_dela_state = xfs_attr_init_add_state(args); + break; + case XFS_ATTRI_OP_FLAGS_REPLACE: + new->xattri_dela_state = xfs_attr_init_replace_state(args); + break; + case XFS_ATTRI_OP_FLAGS_REMOVE: + new->xattri_dela_state = xfs_attr_init_remove_state(args); + break; + default: + ASSERT(0); + } - new->xattri_dela_state = xfs_attr_init_add_state(args); - xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type); trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); - - return 0; -} - -/* Sets an attribute for an inode as a deferred operation */ -static int -xfs_attr_defer_replace( - struct xfs_da_args *args) -{ - struct xfs_attr_intent *new; - int error = 0; - - error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new); - if (error) - return error; - - new->xattri_dela_state = xfs_attr_init_replace_state(args); - xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); - trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp); - - return 0; -} - -/* Removes an attribute for an inode as a deferred operation */ -static int -xfs_attr_defer_remove( - struct xfs_da_args *args) -{ - - struct xfs_attr_intent *new; - int error; - - error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new); - if (error) - return error; - - new->xattri_dela_state = xfs_attr_init_remove_state(args); - xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); - trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp); - - return 0; } /* @@ -1038,16 +996,16 @@ xfs_attr_set( error = xfs_attr_lookup(args); switch (error) { case -EEXIST: - /* if no value, we are performing a remove operation 
*/ if (!args->value) { - error = xfs_attr_defer_remove(args); + /* if no value, we are performing a remove operation */ + xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REMOVE); break; } + /* Pure create fails if the attr already exists */ if (args->attr_flags & XATTR_CREATE) goto out_trans_cancel; - - error = xfs_attr_defer_replace(args); + xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REPLACE); break; case -ENOATTR: /* Can't remove what isn't there. */ @@ -1057,14 +1015,11 @@ xfs_attr_set( /* Pure replace fails if no existing attr to replace. */ if (args->attr_flags & XATTR_REPLACE) goto out_trans_cancel; - - error = xfs_attr_defer_add(args); + xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_SET); break; default: goto out_trans_cancel; } - if (error) - goto out_trans_cancel; /* * If this is a synchronous mount, make sure that the @@ -1097,10 +1052,9 @@ out_trans_cancel: static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) { - struct xfs_attr_shortform *sf; + struct xfs_attr_sf_hdr *sf = dp->i_af.if_data; - sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; - return be16_to_cpu(sf->hdr.totsize); + return be16_to_cpu(sf->totsize); } /* @@ -1112,19 +1066,13 @@ xfs_attr_shortform_addname( struct xfs_da_args *args) { int newsize, forkoff; - int error; trace_xfs_attr_sf_addname(args); - error = xfs_attr_shortform_lookup(args); - switch (error) { - case -ENOATTR: - if (args->op_flags & XFS_DA_OP_REPLACE) - return error; - break; - case -EEXIST: - if (!(args->op_flags & XFS_DA_OP_REPLACE)) - return error; + if (xfs_attr_sf_findname(args)) { + int error; + + ASSERT(args->op_flags & XFS_DA_OP_REPLACE); error = xfs_attr_sf_removename(args); if (error) @@ -1137,11 +1085,8 @@ xfs_attr_shortform_addname( * around. */ args->op_flags &= ~XFS_DA_OP_REPLACE; - break; - case 0: - break; - default: - return error; + } else { + ASSERT(!(args->op_flags & XFS_DA_OP_REPLACE)); } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 2580ae47209a..6374bf107242 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -690,56 +690,32 @@ xfs_attr_shortform_create( ASSERT(ifp->if_bytes == 0); if (ifp->if_format == XFS_DINODE_FMT_EXTENTS) ifp->if_format = XFS_DINODE_FMT_LOCAL; - xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); - hdr = (struct xfs_attr_sf_hdr *)ifp->if_u1.if_data; + + hdr = xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); memset(hdr, 0, sizeof(*hdr)); hdr->totsize = cpu_to_be16(sizeof(*hdr)); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } /* - * Return -EEXIST if attr is found, or -ENOATTR if not - * args: args containing attribute name and namelen - * sfep: If not null, pointer will be set to the last attr entry found on - -EEXIST. On -ENOATTR pointer is left at the last entry in the list - * basep: If not null, pointer is set to the byte offset of the entry in the - * list on -EEXIST. On -ENOATTR, pointer is left at the byte offset of - * the last entry in the list + * Return the entry if the attr in args is found, or NULL if not. 
*/ -int +struct xfs_attr_sf_entry * xfs_attr_sf_findname( - struct xfs_da_args *args, - struct xfs_attr_sf_entry **sfep, - unsigned int *basep) + struct xfs_da_args *args) { - struct xfs_attr_shortform *sf; - struct xfs_attr_sf_entry *sfe; - unsigned int base = sizeof(struct xfs_attr_sf_hdr); - int size = 0; - int end; - int i; + struct xfs_attr_sf_hdr *sf = args->dp->i_af.if_data; + struct xfs_attr_sf_entry *sfe; - sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data; - sfe = &sf->list[0]; - end = sf->hdr.count; - for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe), - base += size, i++) { - size = xfs_attr_sf_entsize(sfe); - if (!xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)) - continue; - break; + for (sfe = xfs_attr_sf_firstentry(sf); + sfe < xfs_attr_sf_endptr(sf); + sfe = xfs_attr_sf_nextentry(sfe)) { + if (xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + return sfe; } - if (sfep != NULL) - *sfep = sfe; - - if (basep != NULL) - *basep = base; - - if (i == end) - return -ENOATTR; - return -EEXIST; + return NULL; } /* @@ -751,38 +727,31 @@ xfs_attr_shortform_add( struct xfs_da_args *args, int forkoff) { - struct xfs_attr_shortform *sf; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_ifork *ifp = &dp->i_af; + struct xfs_attr_sf_hdr *sf = ifp->if_data; struct xfs_attr_sf_entry *sfe; - int offset, size; - struct xfs_mount *mp; - struct xfs_inode *dp; - struct xfs_ifork *ifp; + int size; trace_xfs_attr_sf_add(args); - dp = args->dp; - mp = dp->i_mount; dp->i_forkoff = forkoff; - ifp = &dp->i_af; ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST) - ASSERT(0); + ASSERT(!xfs_attr_sf_findname(args)); - offset = (char *)sfe - (char *)sf; size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); - xfs_idata_realloc(dp, size, XFS_ATTR_FORK); - sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset); + sf = xfs_idata_realloc(dp, size, XFS_ATTR_FORK); + sfe = xfs_attr_sf_endptr(sf); sfe->namelen = args->namelen; sfe->valuelen = args->valuelen; sfe->flags = args->attr_filter; memcpy(sfe->nameval, args->name, args->namelen); memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); - sf->hdr.count++; - be16_add_cpu(&sf->hdr.totsize, size); + sf->count++; + be16_add_cpu(&sf->totsize, size); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); xfs_sbversion_add_attr2(mp, args->trans); @@ -811,48 +780,43 @@ int xfs_attr_sf_removename( struct xfs_da_args *args) { - struct xfs_attr_shortform *sf; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_attr_sf_hdr *sf = dp->i_af.if_data; struct xfs_attr_sf_entry *sfe; - int size = 0, end, totsize; - unsigned int base; - struct xfs_mount *mp; - struct xfs_inode *dp; - int error; + uint16_t totsize = be16_to_cpu(sf->totsize); + void *next, *end; + int size = 0; trace_xfs_attr_sf_remove(args); - dp = args->dp; - mp = dp->i_mount; - sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; - - error = xfs_attr_sf_findname(args, &sfe, &base); - - /* - * If we are recovering an operation, finding nothing to - * remove is not an error - it just means there was nothing - * to clean up. 
- */ - if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY)) - return 0; - if (error != -EEXIST) - return error; - size = xfs_attr_sf_entsize(sfe); + sfe = xfs_attr_sf_findname(args); + if (!sfe) { + /* + * If we are recovering an operation, finding nothing to remove + * is not an error, it just means there was nothing to clean up. + */ + if (args->op_flags & XFS_DA_OP_RECOVERY) + return 0; + return -ENOATTR; + } /* * Fix up the attribute fork data, covering the hole */ - end = base + size; - totsize = be16_to_cpu(sf->hdr.totsize); - if (end != totsize) - memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end); - sf->hdr.count--; - be16_add_cpu(&sf->hdr.totsize, -size); + size = xfs_attr_sf_entsize(sfe); + next = xfs_attr_sf_nextentry(sfe); + end = xfs_attr_sf_endptr(sf); + if (next < end) + memmove(sfe, next, end - next); + sf->count--; + totsize -= size; + sf->totsize = cpu_to_be16(totsize); /* * Fix up the start offset of the attribute fork */ - totsize -= size; - if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) && + if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) { xfs_attr_fork_remove(dp, args->trans); @@ -860,7 +824,7 @@ xfs_attr_sf_removename( xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); dp->i_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); ASSERT(dp->i_forkoff); - ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || + ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) || (args->op_flags & XFS_DA_OP_ADDNAME) || !xfs_has_attr2(mp) || dp->i_df.if_format == XFS_DINODE_FMT_BTREE); @@ -874,33 +838,6 @@ xfs_attr_sf_removename( } /* - * Look up a name in a shortform attribute list structure. - */ -/*ARGSUSED*/ -int -xfs_attr_shortform_lookup(xfs_da_args_t *args) -{ - struct xfs_attr_shortform *sf; - struct xfs_attr_sf_entry *sfe; - int i; - struct xfs_ifork *ifp; - - trace_xfs_attr_sf_lookup(args); - - ifp = &args->dp->i_af; - ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - sfe = &sf->list[0]; - for (i = 0; i < sf->hdr.count; - sfe = xfs_attr_sf_nextentry(sfe), i++) { - if (xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)) - return -EEXIST; - } - return -ENOATTR; -} - -/* * Retrieve the attribute value and length. * * If args->valuelen is zero, only the length needs to be returned. Unlike a @@ -909,23 +846,19 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) */ int xfs_attr_shortform_getvalue( - struct xfs_da_args *args) + struct xfs_da_args *args) { - struct xfs_attr_shortform *sf; - struct xfs_attr_sf_entry *sfe; - int i; + struct xfs_attr_sf_entry *sfe; ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL); - sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data; - sfe = &sf->list[0]; - for (i = 0; i < sf->hdr.count; - sfe = xfs_attr_sf_nextentry(sfe), i++) { - if (xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)) - return xfs_attr_copy_value(args, - &sfe->nameval[args->namelen], sfe->valuelen); - } - return -ENOATTR; + + trace_xfs_attr_sf_lookup(args); + + sfe = xfs_attr_sf_findname(args); + if (!sfe) + return -ENOATTR; + return xfs_attr_copy_value(args, &sfe->nameval[args->namelen], + sfe->valuelen); } /* Convert from using the shortform to the leaf format. 
*/ @@ -933,26 +866,23 @@ int xfs_attr_shortform_to_leaf( struct xfs_da_args *args) { - struct xfs_inode *dp; - struct xfs_attr_shortform *sf; + struct xfs_inode *dp = args->dp; + struct xfs_ifork *ifp = &dp->i_af; + struct xfs_attr_sf_hdr *sf = ifp->if_data; struct xfs_attr_sf_entry *sfe; + int size = be16_to_cpu(sf->totsize); struct xfs_da_args nargs; char *tmpbuffer; - int error, i, size; + int error, i; xfs_dablk_t blkno; struct xfs_buf *bp; - struct xfs_ifork *ifp; trace_xfs_attr_sf_to_leaf(args); - dp = args->dp; - ifp = &dp->i_af; - sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - size = be16_to_cpu(sf->hdr.totsize); tmpbuffer = kmem_alloc(size, 0); ASSERT(tmpbuffer != NULL); - memcpy(tmpbuffer, ifp->if_u1.if_data, size); - sf = (struct xfs_attr_shortform *)tmpbuffer; + memcpy(tmpbuffer, ifp->if_data, size); + sf = (struct xfs_attr_sf_hdr *)tmpbuffer; xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK); @@ -975,8 +905,8 @@ xfs_attr_shortform_to_leaf( nargs.trans = args->trans; nargs.op_flags = XFS_DA_OP_OKNOENT; - sfe = &sf->list[0]; - for (i = 0; i < sf->hdr.count; i++) { + sfe = xfs_attr_sf_firstentry(sf); + for (i = 0; i < sf->count; i++) { nargs.name = sfe->nameval; nargs.namelen = sfe->namelen; nargs.value = &sfe->nameval[nargs.namelen]; @@ -1040,23 +970,16 @@ xfs_attr_shortform_allfit( return xfs_attr_shortform_bytesfit(dp, bytes); } -/* Verify the consistency of an inline attribute fork. */ +/* Verify the consistency of a raw inline attribute fork. */ xfs_failaddr_t xfs_attr_shortform_verify( - struct xfs_inode *ip) + struct xfs_attr_sf_hdr *sfp, + size_t size) { - struct xfs_attr_shortform *sfp; - struct xfs_attr_sf_entry *sfep; + struct xfs_attr_sf_entry *sfep = xfs_attr_sf_firstentry(sfp); struct xfs_attr_sf_entry *next_sfep; char *endp; - struct xfs_ifork *ifp; int i; - int64_t size; - - ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL); - ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK); - sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - size = ifp->if_bytes; /* * Give up if the attribute is way too short. @@ -1067,8 +990,7 @@ xfs_attr_shortform_verify( endp = (char *)sfp + size; /* Check all reported entries */ - sfep = &sfp->list[0]; - for (i = 0; i < sfp->hdr.count; i++) { + for (i = 0; i < sfp->count; i++) { /* * struct xfs_attr_sf_entry has a variable length. * Check the fixed-offset parts of the structure are @@ -1244,14 +1166,10 @@ xfs_attr3_leaf_to_node( if (error) goto out; - /* copy leaf to new buffer, update identifiers */ - xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF); - bp2->b_ops = bp1->b_ops; - memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize); - if (xfs_has_crc(mp)) { - struct xfs_da3_blkinfo *hdr3 = bp2->b_addr; - hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp2)); - } + /* + * Copy leaf to new buffer and log it. 
+ */ + xfs_da_buf_copy(bp2, bp1, args->geo->blksize); xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1); /* diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 368f4d9fa1d5..9b9948639c0f 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -47,16 +47,14 @@ struct xfs_attr3_icleaf_hdr { */ void xfs_attr_shortform_create(struct xfs_da_args *args); void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff); -int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_sf_removename(struct xfs_da_args *args); -int xfs_attr_sf_findname(struct xfs_da_args *args, - struct xfs_attr_sf_entry **sfep, - unsigned int *basep); +struct xfs_attr_sf_entry *xfs_attr_sf_findname(struct xfs_da_args *args); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); -xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip); +xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_attr_sf_hdr *sfp, + size_t size); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); /* diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index 37578b369d9b..bc4422223024 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -7,14 +7,6 @@ #define __XFS_ATTR_SF_H__ /* - * Attribute storage when stored inside the inode. - * - * Small attribute lists are packed as tightly as possible so as - * to fit into the literal area of the inode. - */ -typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t; - -/* * We generate this then sort it, attr_list() must return things in hash-order. */ typedef struct xfs_attr_sf_sort { @@ -41,11 +33,25 @@ static inline int xfs_attr_sf_entsize(struct xfs_attr_sf_entry *sfep) return struct_size(sfep, nameval, sfep->namelen + sfep->valuelen); } -/* next entry in struct */ +/* first entry in the SF attr fork */ +static inline struct xfs_attr_sf_entry * +xfs_attr_sf_firstentry(struct xfs_attr_sf_hdr *hdr) +{ + return (struct xfs_attr_sf_entry *)(hdr + 1); +} + +/* next entry after sfep */ static inline struct xfs_attr_sf_entry * xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep) { return (void *)sfep + xfs_attr_sf_entsize(sfep); } +/* pointer to the space after the last entry, e.g. 
for adding a new one */ +static inline struct xfs_attr_sf_entry * +xfs_attr_sf_endptr(struct xfs_attr_sf_hdr *sf) +{ + return (void *)sf + be16_to_cpu(sf->totsize); +} + #endif /* __XFS_ATTR_SF_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index be62acffad6c..98aaca933bdd 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -575,7 +575,7 @@ xfs_bmap_btree_to_extents( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) return error; @@ -747,7 +747,7 @@ xfs_bmap_local_to_extents_empty( ASSERT(ifp->if_nextents == 0); xfs_bmap_forkoff_reset(ip, whichfork); - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; ifp->if_format = XFS_DINODE_FMT_EXTENTS; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -832,7 +832,7 @@ xfs_bmap_local_to_extents( xfs_bmap_local_to_extents_empty(tp, ip, whichfork); flags |= XFS_ILOG_CORE; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; rec.br_startoff = 0; @@ -3044,7 +3044,8 @@ xfs_bmap_extsize_align( #define XFS_ALLOC_GAP_UNITS 4 -void +/* returns true if ap->blkno was modified */ +bool xfs_bmap_adjacent( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { @@ -3079,13 +3080,14 @@ xfs_bmap_adjacent( if (adjust && ISVALID(ap->blkno + adjust, ap->prev.br_startblock)) ap->blkno += adjust; + return true; } /* * If not at eof, then compare the two neighbor blocks. * Figure out whether either one gives us a good starting point, * and pick the better one. */ - else if (!ap->eof) { + if (!ap->eof) { xfs_fsblock_t gotbno; /* right side block number */ xfs_fsblock_t gotdiff=0; /* right side difference */ xfs_fsblock_t prevbno; /* left side block number */ @@ -3165,14 +3167,21 @@ xfs_bmap_adjacent( * If both valid, pick the better one, else the only good * one, else ap->blkno is already set (to 0 or the inode block). */ - if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) + if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) { ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno; - else if (prevbno != NULLFSBLOCK) + return true; + } + if (prevbno != NULLFSBLOCK) { ap->blkno = prevbno; - else if (gotbno != NULLFSBLOCK) + return true; + } + if (gotbno != NULLFSBLOCK) { ap->blkno = gotbno; + return true; + } } #undef ISVALID + return false; } int @@ -3263,11 +3272,14 @@ xfs_bmap_btalloc_select_lengths( } /* Update all inode and quota accounting for the allocation we just did. */ -static void -xfs_bmap_btalloc_accounting( - struct xfs_bmalloca *ap, - struct xfs_alloc_arg *args) +void +xfs_bmap_alloc_account( + struct xfs_bmalloca *ap) { + bool isrt = XFS_IS_REALTIME_INODE(ap->ip) && + (ap->flags & XFS_BMAPI_ATTRFORK); + uint fld; + if (ap->flags & XFS_BMAPI_COWFORK) { /* * COW fork blocks are in-core only and thus are treated as @@ -3279,7 +3291,7 @@ xfs_bmap_btalloc_accounting( * yet. */ if (ap->wasdel) { - xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len); + xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length); return; } @@ -3291,22 +3303,25 @@ xfs_bmap_btalloc_accounting( * This essentially transfers the transaction quota reservation * to that of a delalloc extent. */ - ap->ip->i_delayed_blks += args->len; - xfs_trans_mod_dquot_byino(ap->tp, ap->ip, XFS_TRANS_DQ_RES_BLKS, - -(long)args->len); + ap->ip->i_delayed_blks += ap->length; + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, isrt ? 
+ XFS_TRANS_DQ_RES_RTBLKS : XFS_TRANS_DQ_RES_BLKS, + -(long)ap->length); return; } /* data/attr fork only */ - ap->ip->i_nblocks += args->len; + ap->ip->i_nblocks += ap->length; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) { - ap->ip->i_delayed_blks -= args->len; - xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len); + ap->ip->i_delayed_blks -= ap->length; + xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length); + fld = isrt ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_DELBCOUNT; + } else { + fld = isrt ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; } - xfs_trans_mod_dquot_byino(ap->tp, ap->ip, - ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT, - args->len); + + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, fld, ap->length); } static int @@ -3380,7 +3395,7 @@ xfs_bmap_process_allocated_extent( ap->offset = orig_offset; else if (ap->offset + ap->length < orig_offset + orig_length) ap->offset = orig_offset + orig_length - ap->length; - xfs_bmap_btalloc_accounting(ap, args); + xfs_bmap_alloc_account(ap); } #ifdef DEBUG @@ -5010,7 +5025,6 @@ xfs_bmap_del_extent_real( xfs_fileoff_t del_endoff; /* first offset past del */ int do_fx; /* free extent at end of routine */ int error; /* error return value */ - int flags = 0;/* inode logging flags */ struct xfs_bmbt_irec got; /* current extent entry */ xfs_fileoff_t got_endoff; /* first offset past got */ int i; /* temp state */ @@ -5023,6 +5037,8 @@ xfs_bmap_del_extent_real( uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_bmbt_irec old; + *logflagsp = 0; + mp = ip->i_mount; XFS_STATS_INC(mp, xs_del_exlist); @@ -5035,7 +5051,6 @@ xfs_bmap_del_extent_real( ASSERT(got_endoff >= del_endoff); ASSERT(!isnullstartblock(got.br_startblock)); qfield = 0; - error = 0; /* * If it's the case where the directory code is running with no block @@ -5051,13 +5066,13 @@ xfs_bmap_del_extent_real( del->br_startoff > got.br_startoff && del_endoff < got_endoff) return -ENOSPC; - flags = XFS_ILOG_CORE; + *logflagsp = XFS_ILOG_CORE; if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { if (!(bflags & XFS_BMAPI_REMAP)) { error = xfs_rtfree_blocks(tp, del->br_startblock, del->br_blockcount); if (error) - goto done; + return error; } do_fx = 0; @@ -5072,11 +5087,9 @@ xfs_bmap_del_extent_real( if (cur) { error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) - goto done; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; - } + return error; + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } if (got.br_startoff == del->br_startoff) @@ -5093,17 +5106,15 @@ xfs_bmap_del_extent_real( xfs_iext_prev(ifp, icur); ifp->if_nextents--; - flags |= XFS_ILOG_CORE; + *logflagsp |= XFS_ILOG_CORE; if (!cur) { - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); break; } if ((error = xfs_btree_delete(cur, &i))) - goto done; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; - } + return error; + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; break; case BMAP_LEFT_FILLING: /* @@ -5114,12 +5125,12 @@ xfs_bmap_del_extent_real( got.br_blockcount -= del->br_blockcount; xfs_iext_update_extent(ip, state, icur, &got); if (!cur) { - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); break; } error = xfs_bmbt_update(cur, &got); if (error) - goto done; + return error; break; case BMAP_RIGHT_FILLING: /* @@ -5128,12 +5139,12 @@ xfs_bmap_del_extent_real( got.br_blockcount -= del->br_blockcount; xfs_iext_update_extent(ip, state, icur, &got); if 
(!cur) { - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); break; } error = xfs_bmbt_update(cur, &got); if (error) - goto done; + return error; break; case 0: /* @@ -5150,18 +5161,18 @@ xfs_bmap_del_extent_real( new.br_state = got.br_state; new.br_startblock = del_endblock; - flags |= XFS_ILOG_CORE; + *logflagsp |= XFS_ILOG_CORE; if (cur) { error = xfs_bmbt_update(cur, &got); if (error) - goto done; + return error; error = xfs_btree_increment(cur, 0, &i); if (error) - goto done; + return error; cur->bc_rec.b = new; error = xfs_btree_insert(cur, &i); if (error && error != -ENOSPC) - goto done; + return error; /* * If get no-space back from btree insert, it tried a * split, and we have a zero block reservation. Fix up @@ -5174,33 +5185,28 @@ xfs_bmap_del_extent_real( */ error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) - goto done; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; - } + return error; + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; /* * Update the btree record back * to the original value. */ error = xfs_bmbt_update(cur, &old); if (error) - goto done; + return error; /* * Reset the extent record back * to the original value. */ xfs_iext_update_extent(ip, state, icur, &old); - flags = 0; - error = -ENOSPC; - goto done; - } - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; + *logflagsp = 0; + return -ENOSPC; } + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } else - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); ifp->if_nextents++; xfs_iext_next(ifp, icur); @@ -5218,13 +5224,13 @@ xfs_bmap_del_extent_real( if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { xfs_refcount_decrease_extent(tp, del); } else { - error = __xfs_free_extent_later(tp, del->br_startblock, + error = xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, XFS_AG_RESV_NONE, ((bflags & XFS_BMAPI_NODISCARD) || del->br_state == XFS_EXT_UNWRITTEN)); if (error) - goto done; + return error; } } @@ -5239,9 +5245,7 @@ xfs_bmap_del_extent_real( if (qfield && !(bflags & XFS_BMAPI_REMAP)) xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); -done: - *logflagsp = flags; - return error; + return 0; } /* @@ -5250,7 +5254,7 @@ done: * that value. If not all extents in the block range can be removed then * *done is set. */ -int /* error */ +static int __xfs_bunmapi( struct xfs_trans *tp, /* transaction pointer */ struct xfs_inode *ip, /* incore inode */ @@ -6102,7 +6106,7 @@ __xfs_bmap_add( bi->bi_bmap = *bmap; xfs_bmap_update_get_group(tp->t_mountp, bi); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list); + xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type); return 0; } @@ -6179,19 +6183,18 @@ xfs_bmap_finish_one( return error; } -/* Check that an inode's extent does not have invalid flags or bad ranges. */ +/* Check that an extent does not have invalid flags or bad ranges. 
*/ xfs_failaddr_t -xfs_bmap_validate_extent( - struct xfs_inode *ip, +xfs_bmap_validate_extent_raw( + struct xfs_mount *mp, + bool rtfile, int whichfork, struct xfs_bmbt_irec *irec) { - struct xfs_mount *mp = ip->i_mount; - if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount)) return __this_address; - if (XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK) { + if (rtfile && whichfork == XFS_DATA_FORK) { if (!xfs_verify_rtbext(mp, irec->br_startblock, irec->br_blockcount)) return __this_address; @@ -6221,3 +6224,53 @@ xfs_bmap_intent_destroy_cache(void) kmem_cache_destroy(xfs_bmap_intent_cache); xfs_bmap_intent_cache = NULL; } + +/* Check that an inode's extent does not have invalid flags or bad ranges. */ +xfs_failaddr_t +xfs_bmap_validate_extent( + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *irec) +{ + return xfs_bmap_validate_extent_raw(ip->i_mount, + XFS_IS_REALTIME_INODE(ip), whichfork, irec); +} + +/* + * Used in xfs_itruncate_extents(). This is the maximum number of extents + * freed from a file in a single transaction. + */ +#define XFS_ITRUNC_MAX_EXTENTS 2 + +/* + * Unmap every extent in part of an inode's fork. We don't do any higher level + * invalidation work at all. + */ +int +xfs_bunmapi_range( + struct xfs_trans **tpp, + struct xfs_inode *ip, + uint32_t flags, + xfs_fileoff_t startoff, + xfs_fileoff_t endoff) +{ + xfs_filblks_t unmap_len = endoff - startoff + 1; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + while (unmap_len > 0) { + ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); + error = __xfs_bunmapi(*tpp, ip, startoff, &unmap_len, flags, + XFS_ITRUNC_MAX_EXTENTS); + if (error) + goto out; + + /* free the just unmapped extents */ + error = xfs_defer_finish(tpp); + if (error) + goto out; + } +out: + return error; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index e33470e39728..f6b73f1bad5f 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -116,6 +116,8 @@ static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags) return XFS_DATA_FORK; } +void xfs_bmap_alloc_account(struct xfs_bmalloca *ap); + /* * Special values for xfs_bmbt_irec_t br_startblock field. 
*/ @@ -190,9 +192,6 @@ int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap); -int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags, - xfs_extnum_t nexts); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done); @@ -263,6 +262,8 @@ static inline uint32_t xfs_bmap_fork_to_state(int whichfork) } } +xfs_failaddr_t xfs_bmap_validate_extent_raw(struct xfs_mount *mp, bool rtfile, + int whichfork, struct xfs_bmbt_irec *irec); xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *irec); int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork, @@ -271,6 +272,8 @@ int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork, int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, uint32_t flags); +int xfs_bunmapi_range(struct xfs_trans **tpp, struct xfs_inode *ip, + uint32_t flags, xfs_fileoff_t startoff, xfs_fileoff_t endoff); extern struct kmem_cache *xfs_bmap_intent_cache; diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index bf3f1b36fdd2..71f2d50f7823 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -15,6 +15,7 @@ #include "xfs_trans.h" #include "xfs_alloc.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_error.h" @@ -272,7 +273,7 @@ xfs_bmbt_free_block( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) return error; @@ -288,10 +289,7 @@ xfs_bmbt_get_minrecs( int level) { if (level == cur->bc_nlevels - 1) { - struct xfs_ifork *ifp; - - ifp = xfs_ifork_ptr(cur->bc_ino.ip, - cur->bc_ino.whichfork); + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0) / 2; @@ -306,10 +304,7 @@ xfs_bmbt_get_maxrecs( int level) { if (level == cur->bc_nlevels - 1) { - struct xfs_ifork *ifp; - - ifp = xfs_ifork_ptr(cur->bc_ino.ip, - cur->bc_ino.whichfork); + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0); @@ -543,23 +538,19 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .keys_contiguous = xfs_bmbt_keys_contiguous, }; -/* - * Allocate a new bmap btree cursor. 
- */ -struct xfs_btree_cur * /* new bmap btree cursor */ -xfs_bmbt_init_cursor( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* inode owning the btree */ - int whichfork) /* data or attr fork */ +static struct xfs_btree_cur * +xfs_bmbt_init_common( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork) { - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; + ASSERT(whichfork != XFS_COW_FORK); cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP, mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache); - cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); cur->bc_ops = &xfs_bmbt_ops; @@ -567,10 +558,30 @@ xfs_bmbt_init_cursor( if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); cur->bc_ino.ip = ip; cur->bc_ino.allocated = 0; cur->bc_ino.flags = 0; + + return cur; +} + +/* + * Allocate a new bmap btree cursor. + */ +struct xfs_btree_cur * +xfs_bmbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + struct xfs_btree_cur *cur; + + cur = xfs_bmbt_init_common(mp, tp, ip, whichfork); + + cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; + cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); cur->bc_ino.whichfork = whichfork; return cur; @@ -588,6 +599,76 @@ xfs_bmbt_block_maxrecs( } /* + * Allocate a new bmap btree cursor for reloading an inode block mapping data + * structure. Note that callers can use the staged cursor to reload extents + * format inode forks if they rebuild the iext tree and commit the staged + * cursor immediately. + */ +struct xfs_btree_cur * +xfs_bmbt_stage_cursor( + struct xfs_mount *mp, + struct xfs_inode *ip, + struct xbtree_ifakeroot *ifake) +{ + struct xfs_btree_cur *cur; + struct xfs_btree_ops *ops; + + /* data fork always has larger maxheight */ + cur = xfs_bmbt_init_common(mp, NULL, ip, XFS_DATA_FORK); + cur->bc_nlevels = ifake->if_levels; + cur->bc_ino.forksize = ifake->if_fork_size; + + /* Don't let anyone think we're attached to the real fork yet. */ + cur->bc_ino.whichfork = -1; + xfs_btree_stage_ifakeroot(cur, ifake, &ops); + ops->update_cursor = NULL; + return cur; +} + +/* + * Swap in the new inode fork root. Once we pass this point the newly rebuilt + * mappings are in place and we have to kill off any old btree blocks. + */ +void +xfs_bmbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + int whichfork) +{ + struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake; + struct xfs_ifork *ifp; + static const short brootflag[2] = {XFS_ILOG_DBROOT, XFS_ILOG_ABROOT}; + static const short extflag[2] = {XFS_ILOG_DEXT, XFS_ILOG_AEXT}; + int flags = XFS_ILOG_CORE; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(whichfork != XFS_COW_FORK); + + /* + * Free any resources hanging off the real fork, then shallow-copy the + * staging fork's contents into the real fork to transfer everything + * we just built. 
+ */ + ifp = xfs_ifork_ptr(cur->bc_ino.ip, whichfork); + xfs_idestroy_fork(ifp); + memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork)); + + switch (ifp->if_format) { + case XFS_DINODE_FMT_EXTENTS: + flags |= extflag[whichfork]; + break; + case XFS_DINODE_FMT_BTREE: + flags |= brootflag[whichfork]; + break; + default: + ASSERT(0); + break; + } + xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); + xfs_btree_commit_ifakeroot(cur, tp, whichfork, &xfs_bmbt_ops); +} + +/* * Calculate number of records in a bmap btree block. */ int diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 3e7a40a83835..151b8491f60e 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -11,6 +11,7 @@ struct xfs_btree_block; struct xfs_mount; struct xfs_inode; struct xfs_trans; +struct xbtree_ifakeroot; /* * Btree block header size depends on a superblock flag. @@ -106,6 +107,10 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); +struct xfs_btree_cur *xfs_bmbt_stage_cursor(struct xfs_mount *mp, + struct xfs_inode *ip, struct xbtree_ifakeroot *ifake); +void xfs_bmbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, int whichfork); extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp, unsigned long long len); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 6a6503ab0cd7..ea8d3659df20 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1330,7 +1330,7 @@ xfs_btree_get_buf_block( * Read in the buffer at the given ptr and return the buffer and * the block pointer within the buffer. */ -STATIC int +int xfs_btree_read_buf_block( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, @@ -5212,3 +5212,29 @@ xfs_btree_destroy_cur_caches(void) xfs_rmapbt_destroy_cur_cache(); xfs_refcountbt_destroy_cur_cache(); } + +/* Move the btree cursor before the first record. 
*/ +int +xfs_btree_goto_left_edge( + struct xfs_btree_cur *cur) +{ + int stat = 0; + int error; + + memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); + error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); + if (error) + return error; + if (!stat) + return 0; + + error = xfs_btree_decrement(cur, 0, &stat); + if (error) + return error; + if (stat != 0) { + ASSERT(0); + return -EFSCORRUPTED; + } + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 4d68a58be160..d906324e25c8 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -700,6 +700,9 @@ void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur, int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, struct xfs_btree_block **block, struct xfs_buf **bpp); +int xfs_btree_read_buf_block(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, int flags, + struct xfs_btree_block **block, struct xfs_buf **bpp); void xfs_btree_set_sibling(struct xfs_btree_cur *cur, struct xfs_btree_block *block, const union xfs_btree_ptr *ptr, int lr); @@ -735,4 +738,6 @@ xfs_btree_alloc_cursor( int __init xfs_btree_init_cur_caches(void); void xfs_btree_destroy_cur_caches(void); +int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index dd75e208b543..e276eba87cb1 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -333,20 +333,41 @@ xfs_btree_commit_ifakeroot( /* * Put a btree block that we're loading onto the ordered list and release it. * The btree blocks will be written to disk when bulk loading is finished. + * If we reach the dirty buffer threshold, flush them to disk before + * continuing. */ -static void +static int xfs_btree_bload_drop_buf( - struct list_head *buffers_list, - struct xfs_buf **bpp) + struct xfs_btree_bload *bbl, + struct list_head *buffers_list, + struct xfs_buf **bpp) { - if (*bpp == NULL) - return; + struct xfs_buf *bp = *bpp; + int error; + + if (!bp) + return 0; - if (!xfs_buf_delwri_queue(*bpp, buffers_list)) - ASSERT(0); + /* + * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent + * xfs_buf_read will not pointlessly reread the contents from the disk. + */ + bp->b_flags |= XBF_DONE; - xfs_buf_relse(*bpp); + xfs_buf_delwri_queue_here(bp, buffers_list); + xfs_buf_relse(bp); *bpp = NULL; + bbl->nr_dirty++; + + if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty) + return 0; + + error = xfs_buf_delwri_submit(buffers_list); + if (error) + return error; + + bbl->nr_dirty = 0; + return 0; } /* @@ -384,7 +405,7 @@ xfs_btree_bload_prep_block( ASSERT(*bpp == NULL); /* Allocate a new incore btree root block. */ - new_size = bbl->iroot_size(cur, nr_this_block, priv); + new_size = bbl->iroot_size(cur, level, nr_this_block, priv); ifp->if_broot = kmem_zalloc(new_size, 0); ifp->if_broot_bytes = (int)new_size; @@ -418,7 +439,10 @@ xfs_btree_bload_prep_block( */ if (*blockp) xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB); - xfs_btree_bload_drop_buf(buffers_list, bpp); + + ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp); + if (ret) + return ret; /* Initialize the new btree block. 
*/ xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block); @@ -436,22 +460,19 @@ STATIC int xfs_btree_bload_leaf( struct xfs_btree_cur *cur, unsigned int recs_this_block, - xfs_btree_bload_get_record_fn get_record, + xfs_btree_bload_get_records_fn get_records, struct xfs_btree_block *block, void *priv) { - unsigned int j; + unsigned int j = 1; int ret; /* Fill the leaf block with records. */ - for (j = 1; j <= recs_this_block; j++) { - union xfs_btree_rec *block_rec; - - ret = get_record(cur, priv); - if (ret) + while (j <= recs_this_block) { + ret = get_records(cur, j, block, recs_this_block - j + 1, priv); + if (ret < 0) return ret; - block_rec = xfs_btree_rec_addr(cur, j, block); - cur->bc_ops->init_rec_from_cur(cur, block_rec); + j += ret; } return 0; @@ -485,7 +506,12 @@ xfs_btree_bload_node( ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr)); - ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block, + /* + * Read the lower-level block in case the buffer for it has + * been reclaimed. LRU refs will be set on the block, which is + * desirable if the new btree commits. + */ + ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block, &child_bp); if (ret) return ret; @@ -570,7 +596,14 @@ xfs_btree_bload_level_geometry( unsigned int desired_npb; unsigned int maxnr; - maxnr = cur->bc_ops->get_maxrecs(cur, level); + /* + * Compute the absolute maximum number of records that we can store in + * the ondisk block or inode root. + */ + if (cur->bc_ops->get_dmaxrecs) + maxnr = cur->bc_ops->get_dmaxrecs(cur, level); + else + maxnr = cur->bc_ops->get_maxrecs(cur, level); /* * Compute the number of blocks we need to fill each block with the @@ -764,6 +797,7 @@ xfs_btree_bload( cur->bc_nlevels = bbl->btree_height; xfs_btree_set_ptr_null(cur, &child_ptr); xfs_btree_set_ptr_null(cur, &ptr); + bbl->nr_dirty = 0; xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, &avg_per_block, &blocks, &blocks_with_extra); @@ -789,7 +823,7 @@ xfs_btree_bload( trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr, nr_this_block); - ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record, + ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records, block, priv); if (ret) goto out; @@ -802,7 +836,10 @@ xfs_btree_bload( xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1); } total_blocks += blocks; - xfs_btree_bload_drop_buf(&buffers_list, &bp); + + ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp); + if (ret) + goto out; /* Populate the internal btree nodes. */ for (level = 1; level < cur->bc_nlevels; level++) { @@ -844,7 +881,11 @@ xfs_btree_bload( xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1); } total_blocks += blocks; - xfs_btree_bload_drop_buf(&buffers_list, &bp); + + ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp); + if (ret) + goto out; + xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1); } diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h index f0d2976050ae..055ea43b1e18 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.h +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -37,12 +37,6 @@ struct xbtree_ifakeroot { /* Number of bytes available for this fork in the inode. */ unsigned int if_fork_size; - - /* Fork format. */ - unsigned int if_format; - - /* Number of records. */ - unsigned int if_extents; }; /* Cursor interactions with fake roots for inode-rooted btrees. 
*/ @@ -53,19 +47,24 @@ void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, int whichfork, const struct xfs_btree_ops *ops); /* Bulk loading of staged btrees. */ -typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv); +typedef int (*xfs_btree_bload_get_records_fn)(struct xfs_btree_cur *cur, + unsigned int idx, struct xfs_btree_block *block, + unsigned int nr_wanted, void *priv); typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, void *priv); typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur, - unsigned int nr_this_level, void *priv); + unsigned int level, unsigned int nr_this_level, void *priv); struct xfs_btree_bload { /* - * This function will be called nr_records times to load records into - * the btree. The function does this by setting the cursor's bc_rec - * field in in-core format. Records must be returned in sort order. + * This function will be called to load @nr_wanted records into the + * btree. The implementation does this by setting the cursor's bc_rec + * field in in-core format and using init_rec_from_cur to set the + * records in the btree block. Records must be returned in sort order. + * The function must return the number of records loaded or the usual + * negative errno. */ - xfs_btree_bload_get_record_fn get_record; + xfs_btree_bload_get_records_fn get_records; /* * This function will be called nr_blocks times to obtain a pointer @@ -113,6 +112,16 @@ struct xfs_btree_bload { * height of the new btree. */ unsigned int btree_height; + + /* + * Flush the new btree block buffer list to disk after this many blocks + * have been formatted. Zero prohibits writing any buffers until all + * blocks have been formatted. + */ + uint16_t max_dirty; + + /* Number of dirty buffers. */ + uint16_t nr_dirty; }; int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur, diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index e576560b46e9..5457188bb4de 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -421,6 +421,25 @@ xfs_da3_node_read_mapped( return xfs_da3_node_set_type(tp, *bpp); } +/* + * Copy src directory/attr leaf/node buffer to the dst. + * For v5 file systems make sure the right blkno is stamped in. + */ +void +xfs_da_buf_copy( + struct xfs_buf *dst, + struct xfs_buf *src, + size_t size) +{ + struct xfs_da3_blkinfo *da3 = dst->b_addr; + + memcpy(dst->b_addr, src->b_addr, size); + dst->b_ops = src->b_ops; + xfs_trans_buf_copy_type(dst, src); + if (xfs_has_crc(dst->b_mount)) + da3->blkno = cpu_to_be64(xfs_buf_daddr(dst)); +} + /*======================================================================== * Routines used for growing the Btree. *========================================================================*/ @@ -690,12 +709,6 @@ xfs_da3_root_split( btree = icnodehdr.btree; size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot); level = icnodehdr.level; - - /* - * we are about to copy oldroot to bp, so set up the type - * of bp while we know exactly what it will be. - */ - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); } else { struct xfs_dir3_icleaf_hdr leafhdr; @@ -707,31 +720,17 @@ xfs_da3_root_split( size = (int)((char *)&leafhdr.ents[leafhdr.count] - (char *)leaf); level = 0; - - /* - * we are about to copy oldroot to bp, so set up the type - * of bp while we know exactly what it will be. 
- */ - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); } /* - * we can copy most of the information in the node from one block to - * another, but for CRC enabled headers we have to make sure that the - * block specific identifiers are kept intact. We update the buffer - * directly for this. + * Copy old root to new buffer and log it. */ - memcpy(node, oldroot, size); - if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || - oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { - struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node; - - node3->hdr.info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); - } + xfs_da_buf_copy(bp, blk1->bp, size); xfs_trans_log_buf(tp, bp, 0, size - 1); - bp->b_ops = blk1->bp->b_ops; - xfs_trans_buf_copy_type(bp, blk1->bp); + /* + * Update blk1 to point to new buffer. + */ blk1->bp = bp; blk1->blkno = blkno; @@ -1220,21 +1219,14 @@ xfs_da3_root_join( xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level); /* - * This could be copying a leaf back into the root block in the case of - * there only being a single leaf block left in the tree. Hence we have - * to update the b_ops pointer as well to match the buffer type change - * that could occur. For dir3 blocks we also need to update the block - * number in the buffer header. + * Copy child to root buffer and log it. */ - memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize); - root_blk->bp->b_ops = bp->b_ops; - xfs_trans_buf_copy_type(root_blk->bp, bp); - if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) { - struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr; - da3->blkno = cpu_to_be64(xfs_buf_daddr(root_blk->bp)); - } + xfs_da_buf_copy(root_blk->bp, bp, args->geo->blksize); xfs_trans_log_buf(args->trans, root_blk->bp, 0, args->geo->blksize - 1); + /* + * Now we can drop the child buffer. + */ error = xfs_da_shrink_inode(args, child, bp); return error; } @@ -2317,9 +2309,10 @@ xfs_da3_swap_lastblock( /* * Copy the last block into the dead buffer and log it. */ - memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize); + xfs_da_buf_copy(dead_buf, last_buf, args->geo->blksize); xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1); dead_info = dead_buf->b_addr; + /* * Get values from the moved block. */ diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index ffa3df5b2893..706baf36e175 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -219,6 +219,8 @@ int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); +void xfs_da_buf_copy(struct xfs_buf *dst, struct xfs_buf *src, + size_t size); uint xfs_da_hashname(const uint8_t *name_string, int name_length); enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index f9015f88eca7..24f9d1461f9a 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -578,20 +578,25 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp) #define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ /* - * Entries are packed toward the top as tight as possible. 
- */ -struct xfs_attr_shortform { - struct xfs_attr_sf_hdr { /* constant-structure header block */ - __be16 totsize; /* total bytes in shortform list */ - __u8 count; /* count of active entries */ - __u8 padding; - } hdr; - struct xfs_attr_sf_entry { - uint8_t namelen; /* actual length of name (no NULL) */ - uint8_t valuelen; /* actual length of value (no NULL) */ - uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ - uint8_t nameval[]; /* name & value bytes concatenated */ - } list[]; /* variable sized array */ + * Attribute storage when stored inside the inode. + * + * Small attribute lists are packed as tightly as possible so as to fit into the + * literal area of the inode. + * + * These "shortform" attribute forks consist of a single xfs_attr_sf_hdr header + * followed by zero or more xfs_attr_sf_entry structures. + */ +struct xfs_attr_sf_hdr { /* constant-structure header block */ + __be16 totsize; /* total bytes in shortform list */ + __u8 count; /* count of active entries */ + __u8 padding; +}; + +struct xfs_attr_sf_entry { + __u8 namelen; /* actual length of name (no NULL) */ + __u8 valuelen; /* actual length of value (no NULL) */ + __u8 flags; /* flags bits (XFS_ATTR_*) */ + __u8 nameval[]; /* name & value bytes concatenated */ }; typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index f71679ce23b9..66a17910d021 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -26,6 +26,7 @@ #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr.h" +#include "xfs_trans_priv.h" static struct kmem_cache *xfs_defer_pending_cache; @@ -181,16 +182,89 @@ static struct kmem_cache *xfs_defer_pending_cache; * Note that the continuation requested between t2 and t3 is likely to * reoccur. */ +STATIC struct xfs_log_item * +xfs_defer_barrier_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return NULL; +} -static const struct xfs_defer_op_type *defer_op_types[] = { - [XFS_DEFER_OPS_TYPE_BMAP] = &xfs_bmap_update_defer_type, - [XFS_DEFER_OPS_TYPE_REFCOUNT] = &xfs_refcount_update_defer_type, - [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type, - [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type, - [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, - [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type, +STATIC void +xfs_defer_barrier_abort_intent( + struct xfs_log_item *intent) +{ + /* empty */ +} + +STATIC struct xfs_log_item * +xfs_defer_barrier_create_done( + struct xfs_trans *tp, + struct xfs_log_item *intent, + unsigned int count) +{ + return NULL; +} + +STATIC int +xfs_defer_barrier_finish_item( + struct xfs_trans *tp, + struct xfs_log_item *done, + struct list_head *item, + struct xfs_btree_cur **state) +{ + ASSERT(0); + return -EFSCORRUPTED; +} + +STATIC void +xfs_defer_barrier_cancel_item( + struct list_head *item) +{ + ASSERT(0); +} + +static const struct xfs_defer_op_type xfs_barrier_defer_type = { + .max_items = 1, + .create_intent = xfs_defer_barrier_create_intent, + .abort_intent = xfs_defer_barrier_abort_intent, + .create_done = xfs_defer_barrier_create_done, + .finish_item = xfs_defer_barrier_finish_item, + .cancel_item = xfs_defer_barrier_cancel_item, }; +/* Create a log intent done item for a log intent item. 
*/ +static inline void +xfs_defer_create_done( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + struct xfs_log_item *lip; + + /* If there is no log intent item, there can be no log done item. */ + if (!dfp->dfp_intent) + return; + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the log intent item and frees the log done item + * 2.) shuts down the filesystem + */ + tp->t_flags |= XFS_TRANS_DIRTY; + lip = dfp->dfp_ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); + if (!lip) + return; + + tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE; + xfs_trans_add_item(tp, lip); + set_bit(XFS_LI_DIRTY, &lip->li_flags); + dfp->dfp_done = lip; +} + /* * Ensure there's a log intent item associated with this deferred work item if * the operation must be restarted on crash. Returns 1 if there's a log item; @@ -202,18 +276,21 @@ xfs_defer_create_intent( struct xfs_defer_pending *dfp, bool sort) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; struct xfs_log_item *lip; if (dfp->dfp_intent) return 1; - lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort); + lip = dfp->dfp_ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, + sort); if (!lip) return 0; if (IS_ERR(lip)) return PTR_ERR(lip); + tp->t_flags |= XFS_TRANS_DIRTY; + xfs_trans_add_item(tp, lip); + set_bit(XFS_LI_DIRTY, &lip->li_flags); dfp->dfp_intent = lip; return 1; } @@ -245,23 +322,50 @@ xfs_defer_create_intents( return ret; } -STATIC void +static inline void xfs_defer_pending_abort( struct xfs_mount *mp, + struct xfs_defer_pending *dfp) +{ + trace_xfs_defer_pending_abort(mp, dfp); + + if (dfp->dfp_intent && !dfp->dfp_done) { + dfp->dfp_ops->abort_intent(dfp->dfp_intent); + dfp->dfp_intent = NULL; + } +} + +static inline void +xfs_defer_pending_cancel_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp) +{ + struct list_head *pwi; + struct list_head *n; + + trace_xfs_defer_cancel_list(mp, dfp); + + list_del(&dfp->dfp_list); + list_for_each_safe(pwi, n, &dfp->dfp_work) { + list_del(pwi); + dfp->dfp_count--; + trace_xfs_defer_cancel_item(mp, dfp, pwi); + dfp->dfp_ops->cancel_item(pwi); + } + ASSERT(dfp->dfp_count == 0); + kmem_cache_free(xfs_defer_pending_cache, dfp); +} + +STATIC void +xfs_defer_pending_abort_list( + struct xfs_mount *mp, struct list_head *dop_list) { struct xfs_defer_pending *dfp; - const struct xfs_defer_op_type *ops; /* Abort intent items that don't have a done item. */ - list_for_each_entry(dfp, dop_list, dfp_list) { - ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_pending_abort(mp, dfp); - if (dfp->dfp_intent && !dfp->dfp_done) { - ops->abort_intent(dfp->dfp_intent); - dfp->dfp_intent = NULL; - } - } + list_for_each_entry(dfp, dop_list, dfp_list) + xfs_defer_pending_abort(mp, dfp); } /* Abort all the intents that were committed. */ @@ -271,7 +375,7 @@ xfs_defer_trans_abort( struct list_head *dop_pending) { trace_xfs_defer_trans_abort(tp, _RET_IP_); - xfs_defer_pending_abort(tp->t_mountp, dop_pending); + xfs_defer_pending_abort_list(tp->t_mountp, dop_pending); } /* @@ -389,27 +493,31 @@ xfs_defer_cancel_list( { struct xfs_defer_pending *dfp; struct xfs_defer_pending *pli; - struct list_head *pwi; - struct list_head *n; - const struct xfs_defer_op_type *ops; /* * Free the pending items. Caller should already have arranged * for the intent items to be released. 
*/ - list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) { - ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_cancel_list(mp, dfp); - list_del(&dfp->dfp_list); - list_for_each_safe(pwi, n, &dfp->dfp_work) { - list_del(pwi); - dfp->dfp_count--; - trace_xfs_defer_cancel_item(mp, dfp, pwi); - ops->cancel_item(pwi); - } - ASSERT(dfp->dfp_count == 0); - kmem_cache_free(xfs_defer_pending_cache, dfp); + list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) + xfs_defer_pending_cancel_work(mp, dfp); +} + +static inline void +xfs_defer_relog_intent( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + struct xfs_log_item *lip; + + xfs_defer_create_done(tp, dfp); + + lip = dfp->dfp_ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done); + if (lip) { + xfs_trans_add_item(tp, lip); + set_bit(XFS_LI_DIRTY, &lip->li_flags); } + dfp->dfp_done = NULL; + dfp->dfp_intent = lip; } /* @@ -417,7 +525,7 @@ xfs_defer_cancel_list( * done item to release the intent item; and then log a new intent item. * The caller should provide a fresh transaction and roll it after we're done. */ -static int +static void xfs_defer_relog( struct xfs_trans **tpp, struct list_head *dfops) @@ -456,31 +564,28 @@ xfs_defer_relog( trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); XFS_STATS_INC((*tpp)->t_mountp, defer_relog); - dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp); - } - if ((*tpp)->t_flags & XFS_TRANS_DIRTY) - return xfs_defer_trans_roll(tpp); - return 0; + xfs_defer_relog_intent(*tpp, dfp); + } } /* * Log an intent-done item for the first pending intent, and finish the work * items. */ -static int +int xfs_defer_finish_one( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + const struct xfs_defer_op_type *ops = dfp->dfp_ops; struct xfs_btree_cur *state = NULL; struct list_head *li, *n; int error; trace_xfs_defer_pending_finish(tp->t_mountp, dfp); - dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); + xfs_defer_create_done(tp, dfp); list_for_each_safe(li, n, &dfp->dfp_work) { list_del(li); dfp->dfp_count--; @@ -517,6 +622,24 @@ out: return error; } +/* Move all paused deferred work from @tp to @paused_list. */ +static void +xfs_defer_isolate_paused( + struct xfs_trans *tp, + struct list_head *paused_list) +{ + struct xfs_defer_pending *dfp; + struct xfs_defer_pending *pli; + + list_for_each_entry_safe(dfp, pli, &tp->t_dfops, dfp_list) { + if (!(dfp->dfp_flags & XFS_DEFER_PAUSED)) + continue; + + list_move_tail(&dfp->dfp_list, paused_list); + trace_xfs_defer_isolate_paused(tp->t_mountp, dfp); + } +} + /* * Finish all the pending work. This involves logging intent items for * any work items that wandered in since the last transaction roll (if @@ -532,6 +655,7 @@ xfs_defer_finish_noroll( struct xfs_defer_pending *dfp = NULL; int error = 0; LIST_HEAD(dop_pending); + LIST_HEAD(dop_paused); ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -550,6 +674,8 @@ xfs_defer_finish_noroll( */ int has_intents = xfs_defer_create_intents(*tp); + xfs_defer_isolate_paused(*tp, &dop_paused); + list_splice_init(&(*tp)->t_dfops, &dop_pending); if (has_intents < 0) { @@ -562,22 +688,33 @@ xfs_defer_finish_noroll( goto out_shutdown; /* Relog intent items to keep the log moving. 
*/ - error = xfs_defer_relog(tp, &dop_pending); - if (error) - goto out_shutdown; + xfs_defer_relog(tp, &dop_pending); + xfs_defer_relog(tp, &dop_paused); + + if ((*tp)->t_flags & XFS_TRANS_DIRTY) { + error = xfs_defer_trans_roll(tp); + if (error) + goto out_shutdown; + } } - dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, - dfp_list); + dfp = list_first_entry_or_null(&dop_pending, + struct xfs_defer_pending, dfp_list); + if (!dfp) + break; error = xfs_defer_finish_one(*tp, dfp); if (error && error != -EAGAIN) goto out_shutdown; } + /* Requeue the paused items in the outgoing transaction. */ + list_splice_tail_init(&dop_paused, &(*tp)->t_dfops); + trace_xfs_defer_finish_done(*tp, _RET_IP_); return 0; out_shutdown: + list_splice_tail_init(&dop_paused, &dop_pending); xfs_defer_trans_abort(*tp, &dop_pending); xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); trace_xfs_defer_finish_error(*tp, error); @@ -590,6 +727,9 @@ int xfs_defer_finish( struct xfs_trans **tp) { +#ifdef DEBUG + struct xfs_defer_pending *dfp; +#endif int error; /* @@ -609,7 +749,10 @@ xfs_defer_finish( } /* Reset LOWMODE now that we've finished all the dfops. */ - ASSERT(list_empty(&(*tp)->t_dfops)); +#ifdef DEBUG + list_for_each_entry(dfp, &(*tp)->t_dfops, dfp_list) + ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); +#endif (*tp)->t_flags &= ~XFS_TRANS_LOWMODE; return 0; } @@ -621,48 +764,165 @@ xfs_defer_cancel( struct xfs_mount *mp = tp->t_mountp; trace_xfs_defer_cancel(tp, _RET_IP_); + xfs_defer_trans_abort(tp, &tp->t_dfops); xfs_defer_cancel_list(mp, &tp->t_dfops); } +/* + * Return the last pending work item attached to this transaction if it matches + * the deferred op type. + */ +static inline struct xfs_defer_pending * +xfs_defer_find_last( + struct xfs_trans *tp, + const struct xfs_defer_op_type *ops) +{ + struct xfs_defer_pending *dfp = NULL; + + /* No dfops at all? */ + if (list_empty(&tp->t_dfops)) + return NULL; + + dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, + dfp_list); + + /* Wrong type? */ + if (dfp->dfp_ops != ops) + return NULL; + return dfp; +} + +/* + * Decide if we can add a deferred work item to the last dfops item attached + * to the transaction. + */ +static inline bool +xfs_defer_can_append( + struct xfs_defer_pending *dfp, + const struct xfs_defer_op_type *ops) +{ + /* Already logged? */ + if (dfp->dfp_intent) + return false; + + /* Paused items cannot absorb more work */ + if (dfp->dfp_flags & XFS_DEFER_PAUSED) + return NULL; + + /* Already full? */ + if (ops->max_items && dfp->dfp_count >= ops->max_items) + return false; + + return true; +} + +/* Create a new pending item at the end of the transaction list. */ +static inline struct xfs_defer_pending * +xfs_defer_alloc( + struct xfs_trans *tp, + const struct xfs_defer_op_type *ops) +{ + struct xfs_defer_pending *dfp; + + dfp = kmem_cache_zalloc(xfs_defer_pending_cache, + GFP_NOFS | __GFP_NOFAIL); + dfp->dfp_ops = ops; + INIT_LIST_HEAD(&dfp->dfp_work); + list_add_tail(&dfp->dfp_list, &tp->t_dfops); + + return dfp; +} + /* Add an item for later deferred processing. 
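 *
 * The new work item is merged into the last pending item on the transaction
 * if that item uses the same op type, has not been logged yet, is not paused
 * and is not yet full; otherwise a fresh pending item is allocated.  A
 * caller-side sketch (the work item and its list head are placeholders; only
 * the op type below is a real one):
 *
 *	dfp = xfs_defer_add(tp, &item->list, &xfs_extent_free_defer_type);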
*/ -void +struct xfs_defer_pending * xfs_defer_add( struct xfs_trans *tp, - enum xfs_defer_ops_type type, - struct list_head *li) + struct list_head *li, + const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp = NULL; - const struct xfs_defer_op_type *ops = defer_op_types[type]; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); - /* - * Add the item to a pending item at the end of the intake list. - * If the last pending item has the same type, reuse it. Else, - * create a new pending item at the end of the intake list. - */ - if (!list_empty(&tp->t_dfops)) { - dfp = list_last_entry(&tp->t_dfops, - struct xfs_defer_pending, dfp_list); - if (dfp->dfp_type != type || - (ops->max_items && dfp->dfp_count >= ops->max_items)) - dfp = NULL; - } - if (!dfp) { - dfp = kmem_cache_zalloc(xfs_defer_pending_cache, - GFP_NOFS | __GFP_NOFAIL); - dfp->dfp_type = type; - dfp->dfp_intent = NULL; - dfp->dfp_done = NULL; - dfp->dfp_count = 0; - INIT_LIST_HEAD(&dfp->dfp_work); - list_add_tail(&dfp->dfp_list, &tp->t_dfops); - } + dfp = xfs_defer_find_last(tp, ops); + if (!dfp || !xfs_defer_can_append(dfp, ops)) + dfp = xfs_defer_alloc(tp, ops); - list_add_tail(li, &dfp->dfp_work); + xfs_defer_add_item(dfp, li); trace_xfs_defer_add_item(tp->t_mountp, dfp, li); - dfp->dfp_count++; + return dfp; +} + +/* + * Add a defer ops barrier to force two otherwise adjacent deferred work items + * to be tracked separately and have separate log items. + */ +void +xfs_defer_add_barrier( + struct xfs_trans *tp) +{ + struct xfs_defer_pending *dfp; + + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + + /* If the last defer op added was a barrier, we're done. */ + dfp = xfs_defer_find_last(tp, &xfs_barrier_defer_type); + if (dfp) + return; + + xfs_defer_alloc(tp, &xfs_barrier_defer_type); + + trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL); +} + +/* + * Create a pending deferred work item to replay the recovered intent item + * and add it to the list. + */ +void +xfs_defer_start_recovery( + struct xfs_log_item *lip, + struct list_head *r_dfops, + const struct xfs_defer_op_type *ops) +{ + struct xfs_defer_pending *dfp; + + dfp = kmem_cache_zalloc(xfs_defer_pending_cache, + GFP_NOFS | __GFP_NOFAIL); + dfp->dfp_ops = ops; + dfp->dfp_intent = lip; + INIT_LIST_HEAD(&dfp->dfp_work); + list_add_tail(&dfp->dfp_list, r_dfops); +} + +/* + * Cancel a deferred work item created to recover a log intent item. @dfp + * will be freed after this function returns. + */ +void +xfs_defer_cancel_recovery( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp) +{ + xfs_defer_pending_abort(mp, dfp); + xfs_defer_pending_cancel_work(mp, dfp); +} + +/* Replay the deferred work item created from a recovered log intent item. 
*/ +int +xfs_defer_finish_recovery( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct list_head *capture_list) +{ + const struct xfs_defer_op_type *ops = dfp->dfp_ops; + int error; + + /* dfp is freed by recover_work and must not be accessed afterwards */ + error = ops->recover_work(dfp, capture_list); + if (error) + trace_xlog_intent_recovery_failed(mp, ops, error); + return error; } /* @@ -769,7 +1029,7 @@ xfs_defer_ops_capture_abort( { unsigned short i; - xfs_defer_pending_abort(mp, &dfc->dfc_dfops); + xfs_defer_pending_abort_list(mp, &dfc->dfc_dfops); xfs_defer_cancel_list(mp, &dfc->dfc_dfops); for (i = 0; i < dfc->dfc_held.dr_bufs; i++) @@ -938,3 +1198,36 @@ xfs_defer_destroy_item_caches(void) xfs_rmap_intent_destroy_cache(); xfs_defer_destroy_cache(); } + +/* + * Mark a deferred work item so that it will be requeued indefinitely without + * being finished. Caller must ensure there are no data dependencies on this + * work item in the meantime. + */ +void +xfs_defer_item_pause( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + ASSERT(!(dfp->dfp_flags & XFS_DEFER_PAUSED)); + + dfp->dfp_flags |= XFS_DEFER_PAUSED; + + trace_xfs_defer_item_pause(tp->t_mountp, dfp); +} + +/* + * Release a paused deferred work item so that it will be finished during the + * next transaction roll. + */ +void +xfs_defer_item_unpause( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); + + dfp->dfp_flags &= ~XFS_DEFER_PAUSED; + + trace_xfs_defer_item_unpause(tp->t_mountp, dfp); +} diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 8788ad5f6a73..18a9fb92dde8 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -11,19 +11,6 @@ struct xfs_defer_op_type; struct xfs_defer_capture; /* - * Header for deferred operation list. - */ -enum xfs_defer_ops_type { - XFS_DEFER_OPS_TYPE_BMAP, - XFS_DEFER_OPS_TYPE_REFCOUNT, - XFS_DEFER_OPS_TYPE_RMAP, - XFS_DEFER_OPS_TYPE_FREE, - XFS_DEFER_OPS_TYPE_AGFL_FREE, - XFS_DEFER_OPS_TYPE_ATTR, - XFS_DEFER_OPS_TYPE_MAX, -}; - -/* * Save a log intent item and a list of extents, so that we can replay * whatever action had to happen to the extent list and file the log done * item. @@ -33,19 +20,35 @@ struct xfs_defer_pending { struct list_head dfp_work; /* work items */ struct xfs_log_item *dfp_intent; /* log intent item */ struct xfs_log_item *dfp_done; /* log done item */ + const struct xfs_defer_op_type *dfp_ops; unsigned int dfp_count; /* # extent items */ - enum xfs_defer_ops_type dfp_type; + unsigned int dfp_flags; }; -void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type, - struct list_head *h); +/* + * Create a log intent item for this deferred item, but don't actually finish + * the work. Caller must clear this before the final transaction commit. 
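 *
 * A sketch of the intended lifecycle (the deferred work item here stands in
 * for whatever the caller has queued):
 *
 *	dfp = xfs_defer_add(tp, &item->list, ops);
 *	xfs_defer_item_pause(tp, dfp);
 *	... transaction rolls carry the paused item along and relog it,
 *	    but never finish it ...
 *	xfs_defer_item_unpause(tp, dfp);
 *	... the next xfs_defer_finish() finishes the work as usual ...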
+ */ +#define XFS_DEFER_PAUSED (1U << 0) + +#define XFS_DEFER_PENDING_STRINGS \ + { XFS_DEFER_PAUSED, "paused" } + +void xfs_defer_item_pause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); +void xfs_defer_item_unpause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); + +struct xfs_defer_pending *xfs_defer_add(struct xfs_trans *tp, struct list_head *h, + const struct xfs_defer_op_type *ops); int xfs_defer_finish_noroll(struct xfs_trans **tp); int xfs_defer_finish(struct xfs_trans **tp); +int xfs_defer_finish_one(struct xfs_trans *tp, struct xfs_defer_pending *dfp); void xfs_defer_cancel(struct xfs_trans *); void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp); /* Description of a deferred type. */ struct xfs_defer_op_type { + const char *name; + unsigned int max_items; struct xfs_log_item *(*create_intent)(struct xfs_trans *tp, struct list_head *items, unsigned int count, bool sort); void (*abort_intent)(struct xfs_log_item *intent); @@ -56,7 +59,11 @@ struct xfs_defer_op_type { void (*finish_cleanup)(struct xfs_trans *tp, struct xfs_btree_cur *state, int error); void (*cancel_item)(struct list_head *item); - unsigned int max_items; + int (*recover_work)(struct xfs_defer_pending *dfp, + struct list_head *capture_list); + struct xfs_log_item *(*relog_intent)(struct xfs_trans *tp, + struct xfs_log_item *intent, + struct xfs_log_item *done_item); }; extern const struct xfs_defer_op_type xfs_bmap_update_defer_type; @@ -125,7 +132,25 @@ void xfs_defer_ops_capture_abort(struct xfs_mount *mp, struct xfs_defer_capture *d); void xfs_defer_resources_rele(struct xfs_defer_resources *dres); +void xfs_defer_start_recovery(struct xfs_log_item *lip, + struct list_head *r_dfops, const struct xfs_defer_op_type *ops); +void xfs_defer_cancel_recovery(struct xfs_mount *mp, + struct xfs_defer_pending *dfp); +int xfs_defer_finish_recovery(struct xfs_mount *mp, + struct xfs_defer_pending *dfp, struct list_head *capture_list); + +static inline void +xfs_defer_add_item( + struct xfs_defer_pending *dfp, + struct list_head *work) +{ + list_add_tail(work, &dfp->dfp_work); + dfp->dfp_count++; +} + int __init xfs_defer_init_item_caches(void); void xfs_defer_destroy_item_caches(void); +void xfs_defer_add_barrier(struct xfs_trans *tp); + #endif /* __XFS_DEFER_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index f5462fd582d5..a76673281514 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -196,7 +196,7 @@ xfs_dir_isempty( return 1; if (dp->i_disk_size > xfs_inode_data_fork_size(dp)) return 0; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; return !sfp->count; } diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 00f960a703b2..3c256d4cc40b 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -1089,7 +1089,7 @@ xfs_dir2_sf_to_block( int newoffset; /* offset from current entry */ unsigned int offset = geo->data_entry_offset; xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */ - xfs_dir2_sf_hdr_t *oldsfp; /* old shortform header */ + struct xfs_dir2_sf_hdr *oldsfp = ifp->if_data; xfs_dir2_sf_hdr_t *sfp; /* shortform header */ __be16 *tagp; /* end of data entry */ struct xfs_name name; @@ -1099,10 +1099,8 @@ xfs_dir2_sf_to_block( ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); - oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data; - ASSERT(ifp->if_bytes == dp->i_disk_size); - ASSERT(ifp->if_u1.if_data != 
NULL); + ASSERT(oldsfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); ASSERT(dp->i_df.if_nextents == 0); diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 7404a9ff1a92..1db2e60ba827 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -175,7 +175,8 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); -extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip); +xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *sfp, int64_t size); int xfs_dir2_sf_entsize(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, int len); void xfs_dir2_sf_put_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 8cd37e6e9d38..e1f83fc7b6ad 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -364,25 +364,23 @@ int /* error */ xfs_dir2_sf_addname( xfs_da_args_t *args) /* operation arguments */ { - xfs_inode_t *dp; /* incore directory inode */ + struct xfs_inode *dp = args->dp; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int error; /* error return value */ int incr_isize; /* total change in size */ int new_isize; /* size after adding name */ int objchange; /* changing to 8-byte inodes */ xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */ int pick; /* which algorithm to use */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */ trace_xfs_dir2_sf_addname(args); ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT); - dp = args->dp; ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Compute entry (and change in) size. @@ -462,20 +460,17 @@ xfs_dir2_sf_addname_easy( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; - int byteoff; /* byte offset in sf dir */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; + int byteoff = (int)((char *)sfep - (char *)sfp); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - byteoff = (int)((char *)sfep - (char *)sfp); /* * Grow the in-inode space. */ - xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen), + sfp = xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen), XFS_DATA_FORK); /* * Need to set up again due to realloc of the inode data. */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff); /* * Fill in the new entry. @@ -528,11 +523,10 @@ xfs_dir2_sf_addname_hard( /* * Copy the old directory to the stack buffer. */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; old_isize = (int)dp->i_disk_size; buf = kmem_alloc(old_isize, 0); oldsfp = (xfs_dir2_sf_hdr_t *)buf; - memcpy(oldsfp, sfp, old_isize); + memcpy(oldsfp, dp->i_df.if_data, old_isize); /* * Loop over the old directory finding the place we're going * to insert the new entry. @@ -556,11 +550,8 @@ xfs_dir2_sf_addname_hard( * the data. 
*/ xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK); - xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK); - /* - * Reset the pointer since the buffer was reallocated. - */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK); + /* * Copy the first part of the directory, including the header. */ @@ -610,11 +601,10 @@ xfs_dir2_sf_addname_pick( int i; /* entry number */ xfs_dir2_data_aoff_t offset; /* data block offset */ xfs_dir2_sf_entry_t *sfep; /* shortform entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int size; /* entry's data size */ int used; /* data bytes used */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; size = xfs_dir2_data_entsize(mp, args->namelen); offset = args->geo->data_first_offset; sfep = xfs_dir2_sf_firstentry(sfp); @@ -673,14 +663,13 @@ xfs_dir2_sf_check( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int i; /* entry number */ int i8count; /* number of big inode#s */ xfs_ino_t ino; /* entry inode number */ int offset; /* data offset */ xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; offset = args->geo->data_first_offset; ino = xfs_dir2_sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; @@ -707,11 +696,10 @@ xfs_dir2_sf_check( /* Verify the consistency of an inline directory. */ xfs_failaddr_t xfs_dir2_sf_verify( - struct xfs_inode *ip) + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *sfp, + int64_t size) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); - struct xfs_dir2_sf_hdr *sfp; struct xfs_dir2_sf_entry *sfep; struct xfs_dir2_sf_entry *next_sfep; char *endp; @@ -719,15 +707,9 @@ xfs_dir2_sf_verify( int i; int i8count; int offset; - int64_t size; int error; uint8_t filetype; - ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - - sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data; - size = ifp->if_bytes; - /* * Give up if the directory is way too short. */ @@ -834,15 +816,13 @@ xfs_dir2_sf_create( ASSERT(dp->i_df.if_bytes == 0); i8count = pino > XFS_DIR2_MAX_SHORT_INUM; size = xfs_dir2_sf_hdr_size(i8count); + /* - * Make a buffer for the data. - */ - xfs_idata_realloc(dp, size, XFS_DATA_FORK); - /* - * Fill in the header, + * Make a buffer for the data and fill in the header. */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = xfs_idata_realloc(dp, size, XFS_DATA_FORK); sfp->i8count = i8count; + /* * Now can put in the inode number, since i8count is set. */ @@ -864,9 +844,9 @@ xfs_dir2_sf_lookup( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int i; /* entry index */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ enum xfs_dacmp cmp; /* comparison result */ xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */ @@ -877,8 +857,7 @@ xfs_dir2_sf_lookup( ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Special case for . 
@@ -940,13 +919,13 @@ xfs_dir2_sf_removename( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int byteoff; /* offset of removed entry */ int entsize; /* this entry's size */ int i; /* shortform entry index */ int newsize; /* new inode size */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ trace_xfs_dir2_sf_removename(args); @@ -954,8 +933,7 @@ xfs_dir2_sf_removename( oldsize = (int)dp->i_disk_size; ASSERT(oldsize >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == oldsize); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Loop over the old directory entries. @@ -992,11 +970,12 @@ xfs_dir2_sf_removename( */ sfp->count--; dp->i_disk_size = newsize; + /* * Reallocate, making it smaller. */ - xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); + /* * Are we changing inode number size? */ @@ -1019,13 +998,12 @@ xfs_dir2_sf_replace_needblock( struct xfs_inode *dp, xfs_ino_t inum) { + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int newsize; - struct xfs_dir2_sf_hdr *sfp; if (dp->i_df.if_format != XFS_DINODE_FMT_LOCAL) return false; - sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; return inum > XFS_DIR2_MAX_SHORT_INUM && @@ -1041,19 +1019,18 @@ xfs_dir2_sf_replace( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int i; /* entry index */ xfs_ino_t ino=0; /* entry old inode number */ int i8elevated; /* sf_toino8 set i8count=1 */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ trace_xfs_dir2_sf_replace(args); ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* @@ -1076,7 +1053,7 @@ xfs_dir2_sf_replace( */ xfs_dir2_sf_toino8(args); i8elevated = 1; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; } else i8elevated = 0; @@ -1157,11 +1134,11 @@ xfs_dir2_sf_toino4( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *oldsfp = dp->i_df.if_data; char *buf; /* old dir's buffer */ int i; /* entry index */ int newsize; /* new inode size */ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ - xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ @@ -1175,7 +1152,6 @@ xfs_dir2_sf_toino4( */ oldsize = dp->i_df.if_bytes; buf = kmem_alloc(oldsize, 0); - oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; ASSERT(oldsfp->i8count == 1); memcpy(buf, oldsfp, oldsize); /* @@ -1188,7 +1164,7 @@ xfs_dir2_sf_toino4( * Reset our pointers, the data has moved. 
*/ oldsfp = (xfs_dir2_sf_hdr_t *)buf; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; /* * Fill in the new header. */ @@ -1230,11 +1206,11 @@ xfs_dir2_sf_toino8( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *oldsfp = dp->i_df.if_data; char *buf; /* old dir's buffer */ int i; /* entry index */ int newsize; /* new inode size */ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ - xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ @@ -1248,7 +1224,6 @@ xfs_dir2_sf_toino8( */ oldsize = dp->i_df.if_bytes; buf = kmem_alloc(oldsize, 0); - oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; ASSERT(oldsfp->i8count == 0); memcpy(buf, oldsfp, oldsize); /* @@ -1261,7 +1236,7 @@ xfs_dir2_sf_toino8( * Reset our pointers, the data has moved. */ oldsfp = (xfs_dir2_sf_hdr_t *)buf; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; /* * Fill in the new header. */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 9a88aba1589f..382ab1e71c0b 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1008,7 +1008,7 @@ enum xfs_dinode_fmt { * Return pointers to the data or attribute forks. */ #define XFS_DFORK_DPTR(dip) \ - ((char *)dip + xfs_dinode_size(dip->di_version)) + ((void *)dip + xfs_dinode_size(dip->di_version)) #define XFS_DFORK_APTR(dip) \ (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) #define XFS_DFORK_PTR(dip,w) \ @@ -1156,20 +1156,6 @@ static inline bool xfs_dinode_has_large_extent_counts( #define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */ #define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ -#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize) -#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask) - -/* - * RT bit manipulation macros. - */ -#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b)) -#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b)) - -#define XFS_RTLOBIT(w) xfs_lowbit32(w) -#define XFS_RTHIBIT(w) xfs_highbit32(w) - -#define XFS_RTBLOCKLOG(b) xfs_highbit64(b) - /* * Dquot and dquot block format definitions */ @@ -1272,6 +1258,9 @@ static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds) #define XFS_DQ_GRACE_MIN ((int64_t)0) #define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX) +/* Maximum id value for a quota record */ +#define XFS_DQ_ID_MAX (U32_MAX) + /* * This is the main portion of the on-disk representation of quota information * for a user. We pad this with some more expansion room to construct the on diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 99e796256c5d..6296993ff8f3 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -68,6 +68,11 @@ struct xfs_fsop_geom; #define XFS_SICK_INO_SYMLINK (1 << 6) /* symbolic link remote target */ #define XFS_SICK_INO_PARENT (1 << 7) /* parent pointers */ +#define XFS_SICK_INO_BMBTD_ZAPPED (1 << 8) /* data fork erased */ +#define XFS_SICK_INO_BMBTA_ZAPPED (1 << 9) /* attr fork erased */ +#define XFS_SICK_INO_DIR_ZAPPED (1 << 10) /* directory erased */ +#define XFS_SICK_INO_SYMLINK_ZAPPED (1 << 11) /* symlink erased */ + /* Primary evidence of health problems in a given group. 
*/ #define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \ XFS_SICK_FS_UQUOTA | \ @@ -97,6 +102,11 @@ struct xfs_fsop_geom; XFS_SICK_INO_SYMLINK | \ XFS_SICK_INO_PARENT) +#define XFS_SICK_INO_ZAPPED (XFS_SICK_INO_BMBTD_ZAPPED | \ + XFS_SICK_INO_BMBTA_ZAPPED | \ + XFS_SICK_INO_DIR_ZAPPED | \ + XFS_SICK_INO_SYMLINK_ZAPPED) + /* These functions must be provided by the xfs implementation. */ void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask); diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index b83e54c70906..2361a22035b0 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -95,18 +95,28 @@ xfs_inobt_btrec_to_irec( irec->ir_free = be64_to_cpu(rec->inobt.ir_free); } +/* Compute the freecount of an incore inode record. */ +uint8_t +xfs_inobt_rec_freecount( + const struct xfs_inobt_rec_incore *irec) +{ + uint64_t realfree = irec->ir_free; + + if (xfs_inobt_issparse(irec->ir_holemask)) + realfree &= xfs_inobt_irec_to_allocmask(irec); + return hweight64(realfree); +} + /* Simple checks for inode records. */ xfs_failaddr_t xfs_inobt_check_irec( - struct xfs_btree_cur *cur, + struct xfs_perag *pag, const struct xfs_inobt_rec_incore *irec) { - uint64_t realfree; - /* Record has to be properly aligned within the AG. */ - if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino)) + if (!xfs_verify_agino(pag, irec->ir_startino)) return __this_address; - if (!xfs_verify_agino(cur->bc_ag.pag, + if (!xfs_verify_agino(pag, irec->ir_startino + XFS_INODES_PER_CHUNK - 1)) return __this_address; if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT || @@ -115,12 +125,7 @@ xfs_inobt_check_irec( if (irec->ir_freecount > XFS_INODES_PER_CHUNK) return __this_address; - /* if there are no holes, return the first available offset */ - if (!xfs_inobt_issparse(irec->ir_holemask)) - realfree = irec->ir_free; - else - realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec); - if (hweight64(realfree) != irec->ir_freecount) + if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount) return __this_address; return NULL; @@ -164,7 +169,7 @@ xfs_inobt_get_rec( return error; xfs_inobt_btrec_to_irec(mp, rec, irec); - fa = xfs_inobt_check_irec(cur, irec); + fa = xfs_inobt_check_irec(cur->bc_ag.pag, irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, irec); @@ -1854,7 +1859,7 @@ xfs_difree_inode_chunk( return xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); } /* holemask is only 16-bits (fits in an unsigned long) */ @@ -1900,7 +1905,8 @@ xfs_difree_inode_chunk( ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); error = xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, - &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE); + &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, + false); if (error) return error; @@ -2739,7 +2745,7 @@ xfs_ialloc_count_inodes_rec( xfs_failaddr_t fa; xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec); - fa = xfs_inobt_check_irec(cur, &irec); + fa = xfs_inobt_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index fe824bb04a09..f1412183bb44 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -79,6 +79,7 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, */ int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_inobt_rec_incore_t *rec, int *stat); +uint8_t xfs_inobt_rec_freecount(const struct 
xfs_inobt_rec_incore *irec); /* * Inode chunk initialisation routine @@ -93,7 +94,7 @@ union xfs_btree_rec; void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, const union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec); -xfs_failaddr_t xfs_inobt_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_inobt_check_irec(struct xfs_perag *pag, const struct xfs_inobt_rec_incore *irec); int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 9258f01c0015..42a5e1f227a0 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -161,7 +161,7 @@ __xfs_inobt_free_block( xfs_inobt_mod_blockcount(cur, -1); fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_INOBT, resv); + &XFS_RMAP_OINFO_INOBT, resv, false); } STATIC int diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 773cf4349428..f4e6b200cdf8 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -158,7 +158,7 @@ static void * xfs_iext_find_first_leaf( struct xfs_ifork *ifp) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height; if (!ifp->if_height) @@ -176,7 +176,7 @@ static void * xfs_iext_find_last_leaf( struct xfs_ifork *ifp) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height, i; if (!ifp->if_height) @@ -306,7 +306,7 @@ xfs_iext_find_level( xfs_fileoff_t offset, int level) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height, i; if (!ifp->if_height) @@ -402,12 +402,12 @@ xfs_iext_grow( int i; if (ifp->if_height == 1) { - struct xfs_iext_leaf *prev = ifp->if_u1.if_root; + struct xfs_iext_leaf *prev = ifp->if_data; node->keys[0] = xfs_iext_leaf_key(prev, 0); node->ptrs[0] = prev; } else { - struct xfs_iext_node *prev = ifp->if_u1.if_root; + struct xfs_iext_node *prev = ifp->if_data; ASSERT(ifp->if_height > 1); @@ -418,7 +418,7 @@ xfs_iext_grow( for (i = 1; i < KEYS_PER_NODE; i++) node->keys[i] = XFS_IEXT_KEY_INVALID; - ifp->if_u1.if_root = node; + ifp->if_data = node; ifp->if_height++; } @@ -430,7 +430,7 @@ xfs_iext_update_node( int level, void *ptr) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height, i; for (height = ifp->if_height; height > level; height--) { @@ -583,11 +583,11 @@ xfs_iext_alloc_root( { ASSERT(ifp->if_bytes == 0); - ifp->if_u1.if_root = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS); + ifp->if_data = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS); ifp->if_height = 1; /* now that we have a node step into it */ - cur->leaf = ifp->if_u1.if_root; + cur->leaf = ifp->if_data; cur->pos = 0; } @@ -603,9 +603,9 @@ xfs_iext_realloc_root( if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF) new_size = NODE_SIZE; - new = krealloc(ifp->if_u1.if_root, new_size, GFP_NOFS | __GFP_NOFAIL); + new = krealloc(ifp->if_data, new_size, GFP_NOFS | __GFP_NOFAIL); memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes); - ifp->if_u1.if_root = new; + ifp->if_data = new; cur->leaf = new; } @@ -622,13 +622,11 @@ static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp) } void -xfs_iext_insert( - struct xfs_inode *ip, +xfs_iext_insert_raw( + struct xfs_ifork *ifp, struct xfs_iext_cursor *cur, - struct xfs_bmbt_irec *irec, 
- int state) + struct xfs_bmbt_irec *irec) { - struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); xfs_fileoff_t offset = irec->br_startoff; struct xfs_iext_leaf *new = NULL; int nr_entries, i; @@ -662,12 +660,23 @@ xfs_iext_insert( xfs_iext_set(cur_rec(cur), irec); ifp->if_bytes += sizeof(struct xfs_iext_rec); - trace_xfs_iext_insert(ip, cur, state, _RET_IP_); - if (new) xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2); } +void +xfs_iext_insert( + struct xfs_inode *ip, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *irec, + int state) +{ + struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); + + xfs_iext_insert_raw(ifp, cur, irec); + trace_xfs_iext_insert(ip, cur, state, _RET_IP_); +} + static struct xfs_iext_node * xfs_iext_rebalance_node( struct xfs_iext_node *parent, @@ -777,8 +786,8 @@ again: * If we are at the root and only one entry is left we can just * free this node and update the root pointer. */ - ASSERT(node == ifp->if_u1.if_root); - ifp->if_u1.if_root = node->ptrs[0]; + ASSERT(node == ifp->if_data); + ifp->if_data = node->ptrs[0]; ifp->if_height--; kmem_free(node); } @@ -854,8 +863,8 @@ xfs_iext_free_last_leaf( struct xfs_ifork *ifp) { ifp->if_height--; - kmem_free(ifp->if_u1.if_root); - ifp->if_u1.if_root = NULL; + kmem_free(ifp->if_data); + ifp->if_data = NULL; } void @@ -872,7 +881,7 @@ xfs_iext_remove( trace_xfs_iext_remove(ip, cur, state, _RET_IP_); ASSERT(ifp->if_height > 0); - ASSERT(ifp->if_u1.if_root != NULL); + ASSERT(ifp->if_data != NULL); ASSERT(xfs_iext_valid(ifp, cur)); xfs_iext_inc_seq(ifp); @@ -1042,9 +1051,9 @@ void xfs_iext_destroy( struct xfs_ifork *ifp) { - xfs_iext_destroy_node(ifp->if_u1.if_root, ifp->if_height); + xfs_iext_destroy_node(ifp->if_data, ifp->if_height); ifp->if_bytes = 0; ifp->if_height = 0; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 5a2e7ddfa76d..f4569e18a8d0 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -50,12 +50,15 @@ xfs_init_local_fork( mem_size++; if (size) { - ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS); - memcpy(ifp->if_u1.if_data, data, size); + char *new_data = kmem_alloc(mem_size, KM_NOFS); + + memcpy(new_data, data, size); if (zero_terminate) - ifp->if_u1.if_data[size] = '\0'; + new_data[size] = '\0'; + + ifp->if_data = new_data; } else { - ifp->if_u1.if_data = NULL; + ifp->if_data = NULL; } ifp->if_bytes = size; @@ -125,7 +128,7 @@ xfs_iformat_extents( } ifp->if_bytes = 0; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; if (size) { dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); @@ -212,7 +215,7 @@ xfs_iformat_btree( ifp->if_broot, size); ifp->if_bytes = 0; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; return 0; } @@ -276,10 +279,9 @@ static uint16_t xfs_dfork_attr_shortform_size( struct xfs_dinode *dip) { - struct xfs_attr_shortform *atp = - (struct xfs_attr_shortform *)XFS_DFORK_APTR(dip); + struct xfs_attr_sf_hdr *sf = XFS_DFORK_APTR(dip); - return be16_to_cpu(atp->hdr.totsize); + return be16_to_cpu(sf->totsize); } void @@ -493,7 +495,7 @@ xfs_iroot_realloc( * byte_diff -- the change in the number of bytes, positive or negative, * requested for the if_data array. 
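 *
 * The (possibly reallocated or freed) if_data pointer is returned so callers
 * no longer have to re-read the fork data pointer by hand afterwards; the
 * shortform directory code earlier in this patch does, for example:
 *
 *	sfp = xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);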
*/ -void +void * xfs_idata_realloc( struct xfs_inode *ip, int64_t byte_diff, @@ -505,21 +507,18 @@ xfs_idata_realloc( ASSERT(new_size >= 0); ASSERT(new_size <= xfs_inode_fork_size(ip, whichfork)); - if (byte_diff == 0) - return; - - if (new_size == 0) { - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = NULL; - ifp->if_bytes = 0; - return; + if (byte_diff) { + ifp->if_data = krealloc(ifp->if_data, new_size, + GFP_NOFS | __GFP_NOFAIL); + if (new_size == 0) + ifp->if_data = NULL; + ifp->if_bytes = new_size; } - ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size, - GFP_NOFS | __GFP_NOFAIL); - ifp->if_bytes = new_size; + return ifp->if_data; } +/* Free all memory and reset a fork back to its initial state. */ void xfs_idestroy_fork( struct xfs_ifork *ifp) @@ -531,8 +530,8 @@ xfs_idestroy_fork( switch (ifp->if_format) { case XFS_DINODE_FMT_LOCAL: - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = NULL; + kmem_free(ifp->if_data); + ifp->if_data = NULL; break; case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: @@ -625,9 +624,9 @@ xfs_iflush_fork( case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { - ASSERT(ifp->if_u1.if_data != NULL); + ASSERT(ifp->if_data != NULL); ASSERT(ifp->if_bytes <= xfs_inode_fork_size(ip, whichfork)); - memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); + memcpy(cp, ifp->if_data, ifp->if_bytes); } break; @@ -702,19 +701,27 @@ xfs_ifork_verify_local_data( xfs_failaddr_t fa = NULL; switch (VFS_I(ip)->i_mode & S_IFMT) { - case S_IFDIR: - fa = xfs_dir2_sf_verify(ip); + case S_IFDIR: { + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_dir2_sf_hdr *sfp = ifp->if_data; + + fa = xfs_dir2_sf_verify(mp, sfp, ifp->if_bytes); break; - case S_IFLNK: - fa = xfs_symlink_shortform_verify(ip); + } + case S_IFLNK: { + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + + fa = xfs_symlink_shortform_verify(ifp->if_data, ifp->if_bytes); break; + } default: break; } if (fa) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork", - ip->i_df.if_u1.if_data, ip->i_df.if_bytes, fa); + ip->i_df.if_data, ip->i_df.if_bytes, fa); return -EFSCORRUPTED; } @@ -729,14 +736,17 @@ xfs_ifork_verify_local_attr( struct xfs_ifork *ifp = &ip->i_af; xfs_failaddr_t fa; - if (!xfs_inode_has_attr_fork(ip)) + if (!xfs_inode_has_attr_fork(ip)) { fa = __this_address; - else - fa = xfs_attr_shortform_verify(ip); + } else { + struct xfs_ifork *ifp = &ip->i_af; + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); + fa = xfs_attr_shortform_verify(ifp->if_data, ifp->if_bytes); + } if (fa) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", - ifp->if_u1.if_data, ifp->if_bytes, fa); + ifp->if_data, ifp->if_bytes, fa); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 96d307784c85..96303249d28a 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -13,14 +13,12 @@ struct xfs_dinode; * File incore extent information, present for each of data & attr forks. 
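The xfs_idata_realloc() change above turns the function from void into one that returns the (possibly reallocated) inline-data pointer, so callers no longer have to re-read the fork by hand after every resize. A minimal sketch of the new calling convention, assuming an attr-fork caller; the helper name below is illustrative and not part of this commit:

/*
 * Hedged sketch only: grow the shortform attr area by @delta bytes and pick
 * up the reallocated pointer straight from the new return value. Locking and
 * transaction context are omitted; "example_grow_attr_sf" is made up.
 */
static struct xfs_attr_sf_hdr *
example_grow_attr_sf(
	struct xfs_inode	*ip,
	int			delta)
{
	/*
	 * Previously: xfs_idata_realloc(ip, delta, XFS_ATTR_FORK); followed
	 * by re-reading ip->i_af.if_u1.if_data by hand.
	 */
	return xfs_idata_realloc(ip, delta, XFS_ATTR_FORK);
}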
*/ struct xfs_ifork { - int64_t if_bytes; /* bytes in if_u1 */ + int64_t if_bytes; /* bytes in if_data */ struct xfs_btree_block *if_broot; /* file's incore btree root */ unsigned int if_seq; /* fork mod counter */ int if_height; /* height of the extent tree */ - union { - void *if_root; /* extent tree root */ - char *if_data; /* inline file data */ - } if_u1; + void *if_data; /* extent tree root or + inline data */ xfs_extnum_t if_nextents; /* # of extents in this fork */ short if_broot_bytes; /* bytes allocated for root */ int8_t if_format; /* format of this fork */ @@ -170,7 +168,7 @@ int xfs_iformat_attr_fork(struct xfs_inode *, struct xfs_dinode *); void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); void xfs_idestroy_fork(struct xfs_ifork *ifp); -void xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, +void * xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, int whichfork); void xfs_iroot_realloc(struct xfs_inode *, int, int); int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); @@ -180,6 +178,9 @@ void xfs_init_local_fork(struct xfs_inode *ip, int whichfork, const void *data, int64_t size); xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp); +void xfs_iext_insert_raw(struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *irec); void xfs_iext_insert(struct xfs_inode *, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *, int); void xfs_iext_remove(struct xfs_inode *, struct xfs_iext_cursor *, diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index a5100a11faf9..9fe7a9564bca 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -11,6 +11,7 @@ * define how recovery should work for that type of log item. */ struct xlog_recover_item; +struct xfs_defer_op_type; /* Sorting hat for log items as they're read in. 
*/ enum xlog_recover_reorder { @@ -153,4 +154,11 @@ xlog_recover_resv(const struct xfs_trans_res *r) return ret; } +struct xfs_defer_pending; + +void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip, + xfs_lsn_t lsn, const struct xfs_defer_op_type *ops); +int xlog_recover_finish_intent(struct xfs_trans *tp, + struct xfs_defer_pending *dfp); + #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 21a7e350b4c5..81885a6a028e 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -7,16 +7,16 @@ #define __XFS_ONDISK_H #define XFS_CHECK_STRUCT_SIZE(structname, size) \ - BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \ - #structname ") is wrong, expected " #size) + static_assert(sizeof(structname) == (size), \ + "XFS: sizeof(" #structname ") is wrong, expected " #size) #define XFS_CHECK_OFFSET(structname, member, off) \ - BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \ + static_assert(offsetof(structname, member) == (off), \ "XFS: offsetof(" #structname ", " #member ") is wrong, " \ "expected " #off) #define XFS_CHECK_VALUE(value, expected) \ - BUILD_BUG_ON_MSG((value) != (expected), \ + static_assert((value) == (expected), \ "XFS: value of " #value " is wrong, expected " #expected) static inline void __init @@ -93,13 +93,13 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32); - XFS_CHECK_STRUCT_SIZE(struct xfs_attr_shortform, 4); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].valuelen, 5); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].flags, 6); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].nameval, 7); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_hdr, 4); + XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, totsize, 0); + XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, count, 2); + XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, namelen, 0); + XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, valuelen, 1); + XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, flags, 2); + XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, nameval, 3); XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 646b3fa362ad..6709a7f8bad5 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -123,11 +123,9 @@ xfs_refcount_btrec_to_irec( /* Simple checks for refcount records. 
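The xfs_ondisk.h hunk above swaps BUILD_BUG_ON_MSG() for C11 static_assert(), so the same size and offset checks can be compiled unchanged by userspace (xfsprogs) without kernel-only helpers. A standalone sketch of the idea; the struct and the expected values here are examples, not from this commit:

#include <assert.h>	/* static_assert in C11 and later */
#include <stddef.h>
#include <stdint.h>

struct example_hdr {
	uint16_t	totsize;
	uint8_t		count;
	uint8_t		pad;
};

#define CHECK_STRUCT_SIZE(s, sz) \
	static_assert(sizeof(s) == (sz), "sizeof(" #s ") is wrong, expected " #sz)
#define CHECK_OFFSET(s, m, off) \
	static_assert(offsetof(s, m) == (off), "offsetof(" #s ", " #m ") is wrong")

CHECK_STRUCT_SIZE(struct example_hdr, 4);
CHECK_OFFSET(struct example_hdr, count, 2);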
*/ xfs_failaddr_t xfs_refcount_check_irec( - struct xfs_btree_cur *cur, + struct xfs_perag *pag, const struct xfs_refcount_irec *irec) { - struct xfs_perag *pag = cur->bc_ag.pag; - if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) return __this_address; @@ -179,7 +177,7 @@ xfs_refcount_get_rec( return error; xfs_refcount_btrec_to_irec(rec, irec); - fa = xfs_refcount_check_irec(cur, irec); + fa = xfs_refcount_check_irec(cur->bc_ag.pag, irec); if (fa) return xfs_refcount_complain_bad_rec(cur, fa, irec); @@ -1153,7 +1151,7 @@ xfs_refcount_adjust_extents( tmp.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, tmp.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_error; } @@ -1215,7 +1213,7 @@ xfs_refcount_adjust_extents( ext.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, ext.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_error; } @@ -1458,7 +1456,7 @@ __xfs_refcount_add( ri->ri_blockcount = blockcount; xfs_refcount_update_get_group(tp->t_mountp, ri); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); + xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type); } /* @@ -1899,7 +1897,7 @@ xfs_refcount_recover_extent( INIT_LIST_HEAD(&rr->rr_list); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - if (xfs_refcount_check_irec(cur, &rr->rr_rrec) != NULL || + if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL || XFS_IS_CORRUPT(cur->bc_mp, rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { kfree(rr); @@ -1985,7 +1983,7 @@ xfs_refcount_recover_cow_leftovers( /* Free the block. */ error = xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_trans; @@ -2033,6 +2031,47 @@ xfs_refcount_has_records( return xfs_btree_has_records(cur, &low, &high, NULL, outcome); } +struct xfs_refcount_query_range_info { + xfs_refcount_query_range_fn fn; + void *priv; +}; + +/* Format btree record and pass to our callback. */ +STATIC int +xfs_refcount_query_range_helper( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_refcount_query_range_info *query = priv; + struct xfs_refcount_irec irec; + xfs_failaddr_t fa; + + xfs_refcount_btrec_to_irec(rec, &irec); + fa = xfs_refcount_check_irec(cur->bc_ag.pag, &irec); + if (fa) + return xfs_refcount_complain_bad_rec(cur, fa, &irec); + + return query->fn(cur, &irec, query->priv); +} + +/* Find all refcount records between two keys. 
*/ +int +xfs_refcount_query_range( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *low_rec, + const struct xfs_refcount_irec *high_rec, + xfs_refcount_query_range_fn fn, + void *priv) +{ + union xfs_btree_irec low_brec = { .rc = *low_rec }; + union xfs_btree_irec high_brec = { .rc = *high_rec }; + struct xfs_refcount_query_range_info query = { .priv = priv, .fn = fn }; + + return xfs_btree_query_range(cur, &low_brec, &high_brec, + xfs_refcount_query_range_helper, &query); +} + int __init xfs_refcount_intent_init_cache(void) { diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 783cd89ca195..9b56768a590c 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -117,7 +117,7 @@ extern int xfs_refcount_has_records(struct xfs_btree_cur *cur, union xfs_btree_rec; extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); -xfs_failaddr_t xfs_refcount_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag, const struct xfs_refcount_irec *irec); extern int xfs_refcount_insert(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); @@ -127,4 +127,14 @@ extern struct kmem_cache *xfs_refcount_intent_cache; int __init xfs_refcount_intent_init_cache(void); void xfs_refcount_intent_destroy_cache(void); +typedef int (*xfs_refcount_query_range_fn)( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv); + +int xfs_refcount_query_range(struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *low_rec, + const struct xfs_refcount_irec *high_rec, + xfs_refcount_query_range_fn fn, void *priv); + #endif /* __XFS_REFCOUNT_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 5c3987d8dc24..0d80bd99147c 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -112,7 +112,7 @@ xfs_refcountbt_free_block( be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, false); } STATIC int @@ -226,7 +226,18 @@ xfs_refcountbt_verify( level = be16_to_cpu(block->bb_level); if (pag && xfs_perag_initialised_agf(pag)) { - if (level >= pag->pagf_refcount_level) + unsigned int maxlevel = pag->pagf_refcount_level; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Online repair could be rewriting the refcount btree, so + * we'll validate against the larger of either tree while this + * is going on. + */ + maxlevel = max_t(unsigned int, maxlevel, + pag->pagf_repair_refcount_level); +#endif + if (level >= maxlevel) return __this_address; } else if (level >= mp->m_refc_maxlevels) return __this_address; diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index fbb0b2637463..76bf7f48cb5a 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2567,7 +2567,7 @@ __xfs_rmap_add( ri->ri_bmap = *bmap; xfs_rmap_update_get_group(tp->t_mountp, ri); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); + xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type); } /* Map an extent into a file. 
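xfs_refcount_query_range() above follows the usual XFS keyspace-query shape: fill in low and high incore records, then have every record in between fed to a typed callback. A hedged usage sketch; the callback and wrapper below are illustrative only and not part of this commit:

/* Count shared-domain refcount records whose start block falls in [start, end]. */
struct example_refc_count {
	unsigned long long		nr;
};

STATIC int
example_refc_count_helper(
	struct xfs_btree_cur		*cur,
	const struct xfs_refcount_irec	*rec,
	void				*priv)
{
	struct example_refc_count	*erc = priv;

	erc->nr++;
	return 0;
}

STATIC int
example_refc_count_range(
	struct xfs_btree_cur		*cur,
	xfs_agblock_t			start,
	xfs_agblock_t			end,
	unsigned long long		*nr)
{
	struct xfs_refcount_irec	low = {
		.rc_domain	= XFS_REFC_DOMAIN_SHARED,
		.rc_startblock	= start,
	};
	struct xfs_refcount_irec	high = {
		.rc_domain	= XFS_REFC_DOMAIN_SHARED,
		.rc_startblock	= end,
	};
	struct example_refc_count	erc = { };
	int				error;

	error = xfs_refcount_query_range(cur, &low, &high,
			example_refc_count_helper, &erc);
	if (error)
		return error;

	*nr = erc.nr;
	return 0;
}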
*/ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index c269d704314d..31100120b2c5 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -184,7 +184,7 @@ xfs_rtfind_back( * Calculate first (leftmost) bit number to look at, * and mask for all the relevant bits in this word. */ - firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0); + firstbit = max_t(xfs_srtblock_t, bit - len + 1, 0); mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) << firstbit; /* @@ -195,7 +195,7 @@ xfs_rtfind_back( /* * Different. Mark where we are and return. */ - i = bit - XFS_RTHIBIT(wdiff); + i = bit - xfs_highbit32(wdiff); *rtx = start - i + 1; return 0; } @@ -233,7 +233,7 @@ xfs_rtfind_back( /* * Different, mark where we are and return. */ - i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + i += XFS_NBWORD - 1 - xfs_highbit32(wdiff); *rtx = start - i + 1; return 0; } @@ -272,7 +272,7 @@ xfs_rtfind_back( /* * Different, mark where we are and return. */ - i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + i += XFS_NBWORD - 1 - xfs_highbit32(wdiff); *rtx = start - i + 1; return 0; } else @@ -338,7 +338,7 @@ xfs_rtfind_forw( * Calculate last (rightmost) bit number to look at, * and mask for all the relevant bits in this word. */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + lastbit = min(bit + len, XFS_NBWORD); mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; /* * Calculate the difference between the value there @@ -348,7 +348,7 @@ xfs_rtfind_forw( /* * Different. Mark where we are and return. */ - i = XFS_RTLOBIT(wdiff) - bit; + i = xfs_lowbit32(wdiff) - bit; *rtx = start + i - 1; return 0; } @@ -386,7 +386,7 @@ xfs_rtfind_forw( /* * Different, mark where we are and return. */ - i += XFS_RTLOBIT(wdiff); + i += xfs_lowbit32(wdiff); *rtx = start + i - 1; return 0; } @@ -423,7 +423,7 @@ xfs_rtfind_forw( /* * Different, mark where we are and return. */ - i += XFS_RTLOBIT(wdiff); + i += xfs_lowbit32(wdiff); *rtx = start + i - 1; return 0; } else @@ -452,71 +452,59 @@ xfs_trans_log_rtsummary( } /* - * Read and/or modify the summary information for a given extent size, - * bitmap block combination. - * Keeps track of a current summary block, so we don't keep reading - * it from the buffer cache. - * - * Summary information is returned in *sum if specified. - * If no delta is specified, returns summary only. + * Modify the summary information for a given extent size, bitmap block + * combination. */ int -xfs_rtmodify_summary_int( +xfs_rtmodify_summary( struct xfs_rtalloc_args *args, int log, /* log2 of extent size */ xfs_fileoff_t bbno, /* bitmap block number */ - int delta, /* change to make to summary info */ - xfs_suminfo_t *sum) /* out: summary info for this block */ + int delta) /* in/out: summary block number */ { struct xfs_mount *mp = args->mp; - int error; - xfs_fileoff_t sb; /* summary fsblock */ - xfs_rtsumoff_t so; /* index into the summary file */ + xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno); unsigned int infoword; + xfs_suminfo_t val; + int error; - /* - * Compute entry number in the summary file. - */ - so = xfs_rtsumoffs(mp, log, bbno); - /* - * Compute the block number in the summary file. - */ - sb = xfs_rtsumoffs_to_block(mp, so); - - error = xfs_rtsummary_read_buf(args, sb); + error = xfs_rtsummary_read_buf(args, xfs_rtsumoffs_to_block(mp, so)); if (error) return error; - /* - * Point to the summary information, modify/log it, and/or copy it out. 
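The xfs_rtbitmap.c conversion above replaces the private XFS_RTLOBIT/XFS_RTHIBIT/XFS_RTMIN/XFS_RTMAX macros with the generic xfs_lowbit32()/xfs_highbit32() helpers and min()/max_t(). A standalone illustration of the bit helpers' semantics (not kernel code); the same fls-style arithmetic is what lets xfs_compute_rextslog(), added further down in this file, operate on full 64-bit extent counts instead of truncating to 32 bits:

#include <stdint.h>
#include <stdio.h>

/* Analogues of xfs_lowbit32()/xfs_highbit32(): bit index, or -1 if no bit set. */
static int lowbit32(uint32_t v)  { return v ? __builtin_ctz(v) : -1; }
static int highbit32(uint32_t v) { return v ? 31 - __builtin_clz(v) : -1; }

int main(void)
{
	uint32_t wdiff = 0x00f0;	/* bits 4..7 differ between bitmap and mask */

	printf("first differing bit: %d\n", lowbit32(wdiff));	/* prints 4 */
	printf("last differing bit:  %d\n", highbit32(wdiff));	/* prints 7 */
	return 0;
}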
- */ infoword = xfs_rtsumoffs_to_infoword(mp, so); - if (delta) { - xfs_suminfo_t val = xfs_suminfo_add(args, infoword, delta); - - if (mp->m_rsum_cache) { - if (val == 0 && log + 1 == mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log; - if (val != 0 && log >= mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log + 1; - } - xfs_trans_log_rtsummary(args, infoword); - if (sum) - *sum = val; - } else if (sum) { - *sum = xfs_suminfo_get(args, infoword); + val = xfs_suminfo_add(args, infoword, delta); + + if (mp->m_rsum_cache) { + if (val == 0 && log + 1 == mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log; + if (val != 0 && log >= mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log + 1; } + + xfs_trans_log_rtsummary(args, infoword); return 0; } +/* + * Read and return the summary information for a given extent size, bitmap block + * combination. + */ int -xfs_rtmodify_summary( +xfs_rtget_summary( struct xfs_rtalloc_args *args, int log, /* log2 of extent size */ xfs_fileoff_t bbno, /* bitmap block number */ - int delta) /* in/out: summary block number */ + xfs_suminfo_t *sum) /* out: summary info for this block */ { - return xfs_rtmodify_summary_int(args, log, bbno, delta, NULL); + struct xfs_mount *mp = args->mp; + xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno); + int error; + + error = xfs_rtsummary_read_buf(args, xfs_rtsumoffs_to_block(mp, so)); + if (!error) + *sum = xfs_suminfo_get(args, xfs_rtsumoffs_to_infoword(mp, so)); + return error; } /* Log rtbitmap block from the word @from to the byte before @next. */ @@ -585,7 +573,7 @@ xfs_rtmodify_range( /* * Compute first bit not changed and mask of relevant bits. */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + lastbit = min(bit + len, XFS_NBWORD); mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; /* * Set/clear the active bits. @@ -720,7 +708,7 @@ xfs_rtfree_range( */ if (preblock < start) { error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(start - preblock), + xfs_highbit64(start - preblock), xfs_rtx_to_rbmblock(mp, preblock), -1); if (error) { return error; @@ -732,7 +720,7 @@ xfs_rtfree_range( */ if (postblock > end) { error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(postblock - end), + xfs_highbit64(postblock - end), xfs_rtx_to_rbmblock(mp, end + 1), -1); if (error) { return error; @@ -743,7 +731,7 @@ xfs_rtfree_range( * (new) free extent. */ return xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(postblock + 1 - preblock), + xfs_highbit64(postblock + 1 - preblock), xfs_rtx_to_rbmblock(mp, preblock), 1); } @@ -799,7 +787,7 @@ xfs_rtcheck_range( /* * Compute first bit not examined. */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + lastbit = min(bit + len, XFS_NBWORD); /* * Mask of relevant bits. */ @@ -812,7 +800,7 @@ xfs_rtcheck_range( /* * Different, compute first wrong bit and return. */ - i = XFS_RTLOBIT(wdiff) - bit; + i = xfs_lowbit32(wdiff) - bit; *new = start + i; *stat = 0; return 0; @@ -851,7 +839,7 @@ xfs_rtcheck_range( /* * Different, compute first wrong bit and return. */ - i += XFS_RTLOBIT(wdiff); + i += xfs_lowbit32(wdiff); *new = start + i; *stat = 0; return 0; @@ -889,7 +877,7 @@ xfs_rtcheck_range( /* * Different, compute first wrong bit and return. */ - i += XFS_RTLOBIT(wdiff); + i += xfs_lowbit32(wdiff); *new = start + i; *stat = 0; return 0; @@ -1131,6 +1119,20 @@ xfs_rtbitmap_blockcount( } /* + * Compute the maximum level number of the realtime summary file, as defined by + * mkfs. 
The historic use of highbit32 on a 64-bit quantity prohibited correct + * use of rt volumes with more than 2^32 extents. + */ +uint8_t +xfs_compute_rextslog( + xfs_rtbxlen_t rtextents) +{ + if (!rtextents) + return 0; + return xfs_highbit64(rtextents); +} + +/* * Compute the number of rtbitmap words needed to populate every block of a * bitmap that is large enough to track the given number of rt extents. */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index c0637057d69c..274dc7dae1fa 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -321,8 +321,8 @@ int xfs_rtfind_forw(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock); int xfs_rtmodify_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxlen_t len, int val); -int xfs_rtmodify_summary_int(struct xfs_rtalloc_args *args, int log, - xfs_fileoff_t bbno, int delta, xfs_suminfo_t *sum); +int xfs_rtget_summary(struct xfs_rtalloc_args *args, int log, + xfs_fileoff_t bbno, xfs_suminfo_t *sum); int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log, xfs_fileoff_t bbno, int delta); int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, @@ -351,6 +351,20 @@ xfs_rtfree_extent( int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno, xfs_filblks_t rtlen); +uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); + +/* Do we support an rt volume having this number of rtextents? */ +static inline bool +xfs_validate_rtextents( + xfs_rtbxlen_t rtextents) +{ + /* No runt rt volumes */ + if (rtextents == 0) + return false; + + return true; +} + xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents); unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp, @@ -369,6 +383,8 @@ unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp, # define xfs_rtsummary_read_buf(a,b) (-ENOSYS) # define xfs_rtbuf_cache_relse(a) (0) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) +# define xfs_compute_rextslog(rtx) (0) +# define xfs_validate_rtextents(rtx) (false) static inline xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 1f74d0cd1618..4a9e8588f4c9 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -25,6 +25,7 @@ #include "xfs_da_format.h" #include "xfs_health.h" #include "xfs_ag.h" +#include "xfs_rtbitmap.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. 
@@ -508,8 +509,9 @@ xfs_validate_sb_common( rbmblocks = howmany_64(sbp->sb_rextents, NBBY * sbp->sb_blocksize); - if (sbp->sb_rextents != rexts || - sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) || + if (!xfs_validate_rtextents(rexts) || + sbp->sb_rextents != rexts || + sbp->sb_rextslog != xfs_compute_rextslog(rexts) || sbp->sb_rbmblocks != rbmblocks) { xfs_notice(mp, "realtime geometry sanity check failed"); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index c4381388c0c1..4220d3584c1b 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -139,7 +139,7 @@ bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, uint32_t size, struct xfs_buf *bp); void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp); -xfs_failaddr_t xfs_symlink_shortform_verify(struct xfs_inode *ip); +xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size); /* Computed inode geometry for the filesystem. */ struct xfs_ino_geometry { diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index bdc777b9ec4a..160aa20aa441 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -175,7 +175,7 @@ xfs_symlink_local_to_remote( if (!xfs_has_crc(mp)) { bp->b_ops = NULL; - memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + memcpy(bp->b_addr, ifp->if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); return; } @@ -191,7 +191,7 @@ xfs_symlink_local_to_remote( buf = bp->b_addr; buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp); - memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes); + memcpy(buf, ifp->if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) + ifp->if_bytes - 1); } @@ -202,15 +202,11 @@ xfs_symlink_local_to_remote( */ xfs_failaddr_t xfs_symlink_shortform_verify( - struct xfs_inode *ip) + void *sfp, + int64_t size) { - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); - char *sfp = (char *)ifp->if_u1.if_data; - int size = ifp->if_bytes; char *endp = sfp + size; - ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - /* * Zero length symlinks should never occur in memory as they are * never allowed to exist on disk. diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 533200c4ccc2..20b5375f2d9c 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -51,7 +51,6 @@ typedef void * xfs_failaddr_t; #define NULLRFSBLOCK ((xfs_rfsblock_t)-1) #define NULLRTBLOCK ((xfs_rtblock_t)-1) #define NULLFILEOFF ((xfs_fileoff_t)-1) -#define NULLRTEXTNO ((xfs_rtxnum_t)-1) #define NULLAGBLOCK ((xfs_agblock_t)-1) #define NULLAGNUMBER ((xfs_agnumber_t)-1) @@ -208,6 +207,13 @@ enum xfs_ag_resv_type { XFS_AG_RESV_AGFL, XFS_AG_RESV_METADATA, XFS_AG_RESV_RMAPBT, + + /* + * Don't increase fdblocks when freeing extent. This is a pony for + * the bnobt repair functions to re-free the free space without + * altering fdblocks. If you think you need this you're wrong. + */ + XFS_AG_RESV_IGNORE, }; /* Results of scanning a btree keyspace to check occupancy. */ diff --git a/fs/xfs/scrub/agb_bitmap.c b/fs/xfs/scrub/agb_bitmap.c new file mode 100644 index 000000000000..573e4e062754 --- /dev/null +++ b/fs/xfs/scrub/agb_bitmap.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "bitmap.h" +#include "scrub/agb_bitmap.h" + +/* + * Record all btree blocks seen while iterating all records of a btree. + * + * We know that the btree query_all function starts at the left edge and walks + * towards the right edge of the tree. Therefore, we know that we can walk up + * the btree cursor towards the root; if the pointer for a given level points + * to the first record/key in that block, we haven't seen this block before; + * and therefore we need to remember that we saw this block in the btree. + * + * So if our btree is: + * + * 4 + * / | \ + * 1 2 3 + * + * Pretend for this example that each leaf block has 100 btree records. For + * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we + * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so + * we record block 4. The list is [1, 4]. + * + * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit + * the loop. The list remains [1, 4]. + * + * For the 101st btree record, we've moved onto leaf block 2. Now + * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that + * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2]. + * + * For the 102nd record, bc_levels[0].ptr == 2, so we continue. + * + * For the 201st record, we've moved on to leaf block 3. + * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3]. + * + * For the 300th record we just exit, with the list being [1, 4, 2, 3]. + */ + +/* Mark a btree block to the agblock bitmap. */ +STATIC int +xagb_bitmap_visit_btblock( + struct xfs_btree_cur *cur, + int level, + void *priv) +{ + struct xagb_bitmap *bitmap = priv; + struct xfs_buf *bp; + xfs_fsblock_t fsbno; + xfs_agblock_t agbno; + + xfs_btree_get_block(cur, level, &bp); + if (!bp) + return 0; + + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); + agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + + return xagb_bitmap_set(bitmap, agbno, 1); +} + +/* Mark all (per-AG) btree blocks in the agblock bitmap. */ +int +xagb_bitmap_set_btblocks( + struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur) +{ + return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock, + XFS_BTREE_VISIT_ALL, bitmap); +} + +/* + * Record all the buffers pointed to by the btree cursor. Callers already + * engaged in a btree walk should call this function to capture the list of + * blocks going from the leaf towards the root. + */ +int +xagb_bitmap_set_btcur_path( + struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur) +{ + int i; + int error; + + for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) { + error = xagb_bitmap_visit_btblock(cur, i, bitmap); + if (error) + return error; + } + + return 0; +} diff --git a/fs/xfs/scrub/agb_bitmap.h b/fs/xfs/scrub/agb_bitmap.h new file mode 100644 index 000000000000..ed08f76ff4f3 --- /dev/null +++ b/fs/xfs/scrub/agb_bitmap.h @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_AGB_BITMAP_H__ +#define __XFS_SCRUB_AGB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_agblock_t */ + +struct xagb_bitmap { + struct xbitmap32 agbitmap; +}; + +static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap) +{ + xbitmap32_init(&bitmap->agbitmap); +} + +static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap) +{ + xbitmap32_destroy(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap32_clear(&bitmap->agbitmap, start, len); +} +static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap32_set(&bitmap->agbitmap, start, len); +} + +static inline bool xagb_bitmap_test(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t *len) +{ + return xbitmap32_test(&bitmap->agbitmap, start, len); +} + +static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap, + struct xagb_bitmap *sub) +{ + return xbitmap32_disunion(&bitmap->agbitmap, &sub->agbitmap); +} + +static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap) +{ + return xbitmap32_hweight(&bitmap->agbitmap); +} +static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap) +{ + return xbitmap32_empty(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, + xbitmap32_walk_fn fn, void *priv) +{ + return xbitmap32_walk(&bitmap->agbitmap, fn, priv); +} + +int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); +int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); + +#endif /* __XFS_SCRUB_AGB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 876a2f41b063..26bd1ff68f1b 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -26,6 +26,7 @@ #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" #include "scrub/reap.h" /* Superblock */ @@ -72,7 +73,7 @@ xrep_superblock( /* Write this to disk. */ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); - return error; + return 0; } /* AGF */ @@ -341,7 +342,7 @@ xrep_agf_commit_new( pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); - return 0; + return xrep_roll_ag_trans(sc); } /* Repair the AGF. v5 filesystems only. */ @@ -494,12 +495,11 @@ xrep_agfl_walk_rmap( /* Strike out the blocks that are cross-linked according to the rmapbt. */ STATIC int xrep_agfl_check_extent( - uint64_t start, - uint64_t len, + uint32_t agbno, + uint32_t len, void *priv) { struct xrep_agfl *ra = priv; - xfs_agblock_t agbno = start; xfs_agblock_t last_agbno = agbno + len - 1; int error; @@ -647,8 +647,8 @@ struct xrep_agfl_fill { /* Fill the AGFL with whatever blocks are in this extent. */ static int xrep_agfl_fill( - uint64_t start, - uint64_t len, + uint32_t start, + uint32_t len, void *priv) { struct xrep_agfl_fill *af = priv; @@ -789,6 +789,9 @@ xrep_agfl( /* Dump any AGFL overflow. 
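The xagb_bitmap wrappers declared above give the repair code a type-checked xfs_agblock_t view of xbitmap32. A hedged sketch of the usual lifecycle (init, collect, subtract, count, destroy); the function below is illustrative and not part of this commit:

/* Count the blocks of the btree behind @cur that are not also in @shared. */
STATIC int
example_count_unshared_btblocks(
	struct xfs_btree_cur	*cur,
	struct xagb_bitmap	*shared,
	uint32_t		*nr)
{
	struct xagb_bitmap	seen;
	int			error;

	xagb_bitmap_init(&seen);

	/* Record every block of the btree... */
	error = xagb_bitmap_set_btblocks(&seen, cur);
	if (error)
		goto out;

	/* ...drop the ones something else already accounts for... */
	error = xagb_bitmap_disunion(&seen, shared);
	if (error)
		goto out;

	/* ...and count what is left. */
	*nr = xagb_bitmap_hweight(&seen);
out:
	xagb_bitmap_destroy(&seen);
	return error;
}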
*/ error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, XFS_AG_RESV_AGFL); + if (error) + goto err; + err: xagb_bitmap_destroy(&agfl_extents); return error; @@ -962,7 +965,7 @@ xrep_agi_commit_new( pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); - return 0; + return xrep_roll_ag_trans(sc); } /* Repair the AGI. */ diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 279af72b1671..d1b8a4997dd2 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -9,13 +9,16 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_btree.h" #include "xfs_alloc.h" #include "xfs_rmap.h" +#include "xfs_ag.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" -#include "xfs_ag.h" +#include "scrub/repair.h" /* * Set us up to scrub free space btrees. @@ -24,10 +27,19 @@ int xchk_setup_ag_allocbt( struct xfs_scrub *sc) { + int error; + if (xchk_need_intent_drain(sc)) xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); - return xchk_setup_ag_btree(sc, false); + error = xchk_setup_ag_btree(sc, false); + if (error) + return error; + + if (xchk_could_repair(sc)) + return xrep_setup_ag_allocbt(sc); + + return 0; } /* Free space btree scrubber. */ @@ -127,7 +139,7 @@ xchk_allocbt_rec( struct xchk_alloc *ca = bs->private; xfs_alloc_btrec_to_irec(rec, &irec); - if (xfs_alloc_check_irec(bs->cur, &irec) != NULL) { + if (xfs_alloc_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -138,31 +150,27 @@ xchk_allocbt_rec( return 0; } -/* Scrub the freespace btrees for some AG. */ -STATIC int +/* Scrub one of the freespace btrees for some AG. */ +int xchk_allocbt( - struct xfs_scrub *sc, - xfs_btnum_t which) + struct xfs_scrub *sc) { struct xchk_alloc ca = { }; struct xfs_btree_cur *cur; - cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur; - return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca); -} - -int -xchk_bnobt( - struct xfs_scrub *sc) -{ - return xchk_allocbt(sc, XFS_BTNUM_BNO); -} + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_BNOBT: + cur = sc->sa.bno_cur; + break; + case XFS_SCRUB_TYPE_CNTBT: + cur = sc->sa.cnt_cur; + break; + default: + ASSERT(0); + return -EIO; + } -int -xchk_cntbt( - struct xfs_scrub *sc) -{ - return xchk_allocbt(sc, XFS_BTNUM_CNT); + return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca); } /* xref check that the extent is not free */ diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c new file mode 100644 index 000000000000..45edda096869 --- /dev/null +++ b/fs/xfs/scrub/alloc_repair.c @@ -0,0 +1,934 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_inode.h" +#include "xfs_refcount.h" +#include "xfs_extent_busy.h" +#include "xfs_health.h" +#include "xfs_bmap.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Free Space Btree Repair + * ======================= + * + * The reverse mappings are supposed to record all space usage for the entire + * AG. Therefore, we can recreate the free extent records in an AG by looking + * for gaps in the physical extents recorded in the rmapbt. These records are + * staged in @free_records. Identifying the gaps is more difficult on a + * reflink filesystem because rmap records are allowed to overlap. + * + * Because the final step of building a new index is to free the space used by + * the old index, repair needs to find that space. Unfortunately, all + * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share + * the same rmapbt owner code (OWN_AG), so this is not straightforward. + * + * The scan of the reverse mapping information records the space used by OWN_AG + * in @old_allocbt_blocks, which (at this stage) is somewhat misnamed. While + * walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to + * record all visited rmap btree blocks and all blocks owned by the AGFL. + * + * After that is where the definitions of old_allocbt_blocks shifts. This + * expression identifies possible former bnobt/cntbt blocks: + * + * (OWN_AG blocks) & ~(rmapbt blocks | agfl blocks); + * + * Substituting from above definitions, that becomes: + * + * old_allocbt_blocks & ~not_allocbt_blocks + * + * The OWN_AG bitmap itself isn't needed after this point, so what we really do + * instead is: + * + * old_allocbt_blocks &= ~not_allocbt_blocks; + * + * After this point, @old_allocbt_blocks is a bitmap of alleged former + * bnobt/cntbt blocks. The xagb_bitmap_disunion operation modifies its first + * parameter in place to avoid copying records around. + * + * Next, some of the space described by @free_records are diverted to the newbt + * reservation and used to format new btree blocks. The remaining records are + * written to the new btree indices. We reconstruct both bnobt and cntbt at + * the same time since we've already done all the work. + * + * We use the prefix 'xrep_abt' here because we regenerate both free space + * allocation btrees at the same time. + */ + +struct xrep_abt { + /* Blocks owned by the rmapbt or the agfl. */ + struct xagb_bitmap not_allocbt_blocks; + + /* All OWN_AG blocks. */ + struct xagb_bitmap old_allocbt_blocks; + + /* + * New bnobt information. All btree block reservations are added to + * the reservation list in new_bnobt. + */ + struct xrep_newbt new_bnobt; + + /* new cntbt information */ + struct xrep_newbt new_cntbt; + + /* Free space extents. 
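The comment block above explains the core trick: free space is whatever the rmapbt does not map, and the former bnobt/cntbt blocks are the OWN_AG extents minus the rmapbt and AGFL blocks. A condensed, standalone illustration of the gap scan only; the real implementation is xrep_abt_walk_rmap()/xrep_abt_stash() below, and the record values here are made up:

#include <stdint.h>
#include <stdio.h>

struct toy_rmap {
	uint32_t	startblock;
	uint32_t	blockcount;
};

/* Print every unmapped (i.e. free) extent implied by sorted rmap records. */
static void find_gaps(const struct toy_rmap *recs, int nr, uint32_t ag_len)
{
	uint32_t	next_agbno = 0;
	int		i;

	for (i = 0; i < nr; i++) {
		if (recs[i].startblock > next_agbno)
			printf("free: %u..%u\n", next_agbno,
					recs[i].startblock - 1);
		/* rmaps can overlap on reflink; only ever push forward. */
		if (recs[i].startblock + recs[i].blockcount > next_agbno)
			next_agbno = recs[i].startblock + recs[i].blockcount;
	}
	if (next_agbno < ag_len)
		printf("free: %u..%u\n", next_agbno, ag_len - 1);
}

int main(void)
{
	const struct toy_rmap	recs[] = {
		{ 0, 10 }, { 10, 4 }, { 12, 6 }, { 30, 5 },
	};

	find_gaps(recs, 4, 64);		/* prints 18..29 and 35..63 */
	return 0;
}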
*/ + struct xfarray *free_records; + + struct xfs_scrub *sc; + + /* Number of non-null records in @free_records. */ + uint64_t nr_real_records; + + /* get_records()'s position in the free space record array. */ + xfarray_idx_t array_cur; + + /* + * Next block we anticipate seeing in the rmap records. If the next + * rmap record is greater than next_agbno, we have found unused space. + */ + xfs_agblock_t next_agbno; + + /* Number of free blocks in this AG. */ + xfs_agblock_t nr_blocks; + + /* Longest free extent we found in the AG. */ + xfs_agblock_t longest; +}; + +/* Set up to repair AG free space btrees. */ +int +xrep_setup_ag_allocbt( + struct xfs_scrub *sc) +{ + unsigned int busy_gen; + + /* + * Make sure the busy extent list is clear because we can't put extents + * on there twice. + */ + busy_gen = READ_ONCE(sc->sa.pag->pagb_gen); + if (xfs_extent_busy_list_empty(sc->sa.pag)) + return 0; + + return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0); +} + +/* Check for any obvious conflicts in the free extent. */ +STATIC int +xrep_abt_check_free_ext( + struct xfs_scrub *sc, + const struct xfs_alloc_rec_incore *rec) +{ + enum xbtree_recpacking outcome; + int error; + + if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->ar_startblock, rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be shared or CoW staging. */ + if (sc->sa.refc_cur) { + error = xfs_refcount_has_records(sc->sa.refc_cur, + XFS_REFC_DOMAIN_SHARED, rec->ar_startblock, + rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + error = xfs_refcount_has_records(sc->sa.refc_cur, + XFS_REFC_DOMAIN_COW, rec->ar_startblock, + rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + } + + return 0; +} + +/* + * Stash a free space record for all the space since the last bno we found + * all the way up to @end. + */ +static int +xrep_abt_stash( + struct xrep_abt *ra, + xfs_agblock_t end) +{ + struct xfs_alloc_rec_incore arec = { + .ar_startblock = ra->next_agbno, + .ar_blockcount = end - ra->next_agbno, + }; + struct xfs_scrub *sc = ra->sc; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + error = xrep_abt_check_free_ext(ra->sc, &arec); + if (error) + return error; + + trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec); + + error = xfarray_append(ra->free_records, &arec); + if (error) + return error; + + ra->nr_blocks += arec.ar_blockcount; + return 0; +} + +/* Record extents that aren't in use from gaps in the rmap records. */ +STATIC int +xrep_abt_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_abt *ra = priv; + int error; + + /* Record all the OWN_AG blocks... */ + if (rec->rm_owner == XFS_RMAP_OWN_AG) { + error = xagb_bitmap_set(&ra->old_allocbt_blocks, + rec->rm_startblock, rec->rm_blockcount); + if (error) + return error; + } + + /* ...and all the rmapbt blocks... */ + error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur); + if (error) + return error; + + /* ...and all the free space. 
*/ + if (rec->rm_startblock > ra->next_agbno) { + error = xrep_abt_stash(ra, rec->rm_startblock); + if (error) + return error; + } + + /* + * rmap records can overlap on reflink filesystems, so project + * next_agbno as far out into the AG space as we currently know about. + */ + ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno, + rec->rm_startblock + rec->rm_blockcount); + return 0; +} + +/* Collect an AGFL block for the not-to-release list. */ +static int +xrep_abt_walk_agfl( + struct xfs_mount *mp, + xfs_agblock_t agbno, + void *priv) +{ + struct xrep_abt *ra = priv; + + return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1); +} + +/* + * Compare two free space extents by block number. We want to sort in order of + * increasing block number. + */ +static int +xrep_bnobt_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_alloc_rec_incore *ap = a; + const struct xfs_alloc_rec_incore *bp = b; + + if (ap->ar_startblock > bp->ar_startblock) + return 1; + else if (ap->ar_startblock < bp->ar_startblock) + return -1; + return 0; +} + +/* + * Re-sort the free extents by block number so that we can put the records into + * the bnobt in the correct order. Make sure the records do not overlap in + * physical space. + */ +STATIC int +xrep_bnobt_sort_records( + struct xrep_abt *ra) +{ + struct xfs_alloc_rec_incore arec; + xfarray_idx_t cur = XFARRAY_CURSOR_INIT; + xfs_agblock_t next_agbno = 0; + int error; + + error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0); + if (error) + return error; + + while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) { + if (arec.ar_startblock < next_agbno) + return -EFSCORRUPTED; + + next_agbno = arec.ar_startblock + arec.ar_blockcount; + } + + return error; +} + +/* + * Compare two free space extents by length and then block number. We want + * to sort first in order of increasing length and then in order of increasing + * block number. + */ +static int +xrep_cntbt_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_alloc_rec_incore *ap = a; + const struct xfs_alloc_rec_incore *bp = b; + + if (ap->ar_blockcount > bp->ar_blockcount) + return 1; + else if (ap->ar_blockcount < bp->ar_blockcount) + return -1; + return xrep_bnobt_extent_cmp(a, b); +} + +/* + * Sort the free extents by length so so that we can put the records into the + * cntbt in the correct order. Don't let userspace kill us if we're resorting + * after allocating btree blocks. + */ +STATIC int +xrep_cntbt_sort_records( + struct xrep_abt *ra, + bool is_resort) +{ + return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp, + is_resort ? 0 : XFARRAY_SORT_KILLABLE); +} + +/* + * Iterate all reverse mappings to find (1) the gaps between rmap records (all + * unowned space), (2) the OWN_AG extents (which encompass the free space + * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL + * blocks. The free space is (1) + (2) - (3) - (4). + */ +STATIC int +xrep_abt_find_freespace( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_buf *agfl_bp; + xfs_agblock_t agend; + int error; + + xagb_bitmap_init(&ra->not_allocbt_blocks); + + xrep_ag_btcur_init(sc, &sc->sa); + + /* + * Iterate all the reverse mappings to find gaps in the physical + * mappings, all the OWN_AG blocks, and all the rmapbt extents. 
+ */ + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra); + if (error) + goto err; + + /* Insert a record for space between the last rmap and EOAG. */ + agend = be32_to_cpu(agf->agf_length); + if (ra->next_agbno < agend) { + error = xrep_abt_stash(ra, agend); + if (error) + goto err; + } + + /* Collect all the AGFL blocks. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + goto err; + + error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra); + if (error) + goto err_agfl; + + /* Compute the old bnobt/cntbt blocks. */ + error = xagb_bitmap_disunion(&ra->old_allocbt_blocks, + &ra->not_allocbt_blocks); + if (error) + goto err_agfl; + + ra->nr_real_records = xfarray_length(ra->free_records); +err_agfl: + xfs_trans_brelse(sc->tp, agfl_bp); +err: + xchk_ag_btcur_free(&sc->sa); + xagb_bitmap_destroy(&ra->not_allocbt_blocks); + return error; +} + +/* + * We're going to use the observed free space records to reserve blocks for the + * new free space btrees, so we play an iterative game where we try to converge + * on the number of blocks we need: + * + * 1. Estimate how many blocks we'll need to store the records. + * 2. If the first free record has more blocks than we need, we're done. + * We will have to re-sort the records prior to building the cntbt. + * 3. If that record has exactly the number of blocks we need, null out the + * record. We're done. + * 4. Otherwise, we still need more blocks. Null out the record, subtract its + * length from the number of blocks we need, and go back to step 1. + * + * Fortunately, we don't have to do any transaction work to play this game, so + * we don't have to tear down the staging cursors. + */ +STATIC int +xrep_abt_reserve_space( + struct xrep_abt *ra, + struct xfs_btree_cur *bno_cur, + struct xfs_btree_cur *cnt_cur, + bool *needs_resort) +{ + struct xfs_scrub *sc = ra->sc; + xfarray_idx_t record_nr; + unsigned int allocated = 0; + int error = 0; + + record_nr = xfarray_length(ra->free_records) - 1; + do { + struct xfs_alloc_rec_incore arec; + uint64_t required; + unsigned int desired; + unsigned int len; + + /* Compute how many blocks we'll need. */ + error = xfs_btree_bload_compute_geometry(cnt_cur, + &ra->new_cntbt.bload, ra->nr_real_records); + if (error) + break; + + error = xfs_btree_bload_compute_geometry(bno_cur, + &ra->new_bnobt.bload, ra->nr_real_records); + if (error) + break; + + /* How many btree blocks do we need to store all records? */ + required = ra->new_bnobt.bload.nr_blocks + + ra->new_cntbt.bload.nr_blocks; + ASSERT(required < INT_MAX); + + /* If we've reserved enough blocks, we're done. */ + if (allocated >= required) + break; + + desired = required - allocated; + + /* We need space but there's none left; bye! */ + if (ra->nr_real_records == 0) { + error = -ENOSPC; + break; + } + + /* Grab the first record from the list. */ + error = xfarray_load(ra->free_records, record_nr, &arec); + if (error) + break; + + ASSERT(arec.ar_blockcount <= UINT_MAX); + len = min_t(unsigned int, arec.ar_blockcount, desired); + + trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno, + arec.ar_startblock, len, XFS_RMAP_OWN_AG); + + error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag, + arec.ar_startblock, len); + if (error) + break; + allocated += len; + ra->nr_blocks -= len; + + if (arec.ar_blockcount > desired) { + /* + * Record has more space than we need. 
The number of + * free records doesn't change, so shrink the free + * record, inform the caller that the records are no + * longer sorted by length, and exit. + */ + arec.ar_startblock += desired; + arec.ar_blockcount -= desired; + error = xfarray_store(ra->free_records, record_nr, + &arec); + if (error) + break; + + *needs_resort = true; + return 0; + } + + /* + * We're going to use up the entire record, so unset it and + * move on to the next one. This changes the number of free + * records (but doesn't break the sorting order), so we must + * go around the loop once more to re-run _bload_init. + */ + error = xfarray_unset(ra->free_records, record_nr); + if (error) + break; + ra->nr_real_records--; + record_nr--; + } while (1); + + return error; +} + +STATIC int +xrep_abt_dispose_one( + struct xrep_abt *ra, + struct xrep_newbt_resv *resv) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_perag *pag = sc->sa.pag; + xfs_agblock_t free_agbno = resv->agbno + resv->used; + xfs_extlen_t free_aglen = resv->len - resv->used; + int error; + + ASSERT(pag == resv->pag); + + /* Add a deferred rmap for each extent we used. */ + if (resv->used > 0) + xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno, + resv->used, XFS_RMAP_OWN_AG); + + /* + * For each reserved btree block we didn't use, add it to the free + * space btree. We didn't touch fdblocks when we reserved them, so + * we don't touch it now. + */ + if (free_aglen == 0) + return 0; + + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, + free_aglen, ra->new_bnobt.oinfo.oi_owner); + + error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen, + &ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true); + if (error) + return error; + + return xrep_defer_finish(sc); +} + +/* + * Deal with all the space we reserved. Blocks that were allocated for the + * free space btrees need to have a (deferred) rmap added for the OWN_AG + * allocation, and blocks that didn't get used can be freed via the usual + * (deferred) means. + */ +STATIC void +xrep_abt_dispose_reservations( + struct xrep_abt *ra, + int error) +{ + struct xrep_newbt_resv *resv, *n; + + if (error) + goto junkit; + + list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) { + error = xrep_abt_dispose_one(ra, resv); + if (error) + goto junkit; + } + +junkit: + list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) { + xfs_perag_put(resv->pag); + list_del(&resv->list); + kfree(resv); + } + + xrep_newbt_cancel(&ra->new_bnobt); + xrep_newbt_cancel(&ra->new_cntbt); +} + +/* Retrieve free space data for bulk load. */ +STATIC int +xrep_abt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_alloc_rec_incore *arec = &cur->bc_rec.a; + struct xrep_abt *ra = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load_next(ra->free_records, &ra->array_cur, + arec); + if (error) + return error; + + ra->longest = max(ra->longest, arec->ar_blockcount); + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. 
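xrep_abt_reserve_space() above plays the converging game its comment describes: every block carved out of a free-space record is one less record to store, so the geometry estimate and the reservation chase each other until they meet. A toy standalone model of that loop, with made-up numbers and a fake geometry function, purely to illustrate why the loop terminates:

#include <stdio.h>

/* Fake geometry: pretend 100 records fit per btree block, plus one root. */
static unsigned int blocks_needed(unsigned int nr_records)
{
	return nr_records / 100 + 1;
}

int main(void)
{
	unsigned int	nr_records = 1000;	/* free-space records to store */
	unsigned int	reserved = 0;		/* blocks reserved so far */

	while (reserved < blocks_needed(nr_records)) {
		/*
		 * Consume one free-space record (pretend it is 4 blocks long)
		 * for the new btree; it no longer needs to be stored, so the
		 * required size shrinks as the reservation grows.
		 */
		reserved += 4;
		nr_records--;
	}

	printf("reserved %u blocks to index %u records\n", reserved, nr_records);
	return 0;
}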
*/ +STATIC int +xrep_abt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_abt *ra = priv; + + return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr); +} + +/* + * Reset the AGF counters to reflect the free space btrees that we just + * rebuilt, then reinitialize the per-AG data. + */ +STATIC int +xrep_abt_reset_counters( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + unsigned int freesp_btreeblks = 0; + + /* + * Compute the contribution to agf_btreeblks for the new free space + * btrees. This is the computed btree size minus anything we didn't + * use. + */ + freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1; + freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1; + + freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt); + freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt); + + /* + * The AGF header contains extra information related to the free space + * btrees, so we must update those fields here. + */ + agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks + + (be32_to_cpu(agf->agf_rmap_blocks) - 1)); + agf->agf_freeblks = cpu_to_be32(ra->nr_blocks); + agf->agf_longest = cpu_to_be32(ra->longest); + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS | + XFS_AGF_LONGEST | + XFS_AGF_FREEBLKS); + + /* + * After we commit the new btree to disk, it is possible that the + * process to reap the old btree blocks will race with the AIL trying + * to checkpoint the old btree blocks into the filesystem. If the new + * tree is shorter than the old one, the allocbt write verifier will + * fail and the AIL will shut down the filesystem. + * + * To avoid this, save the old incore btree height values as the alt + * height values before re-initializing the perag info from the updated + * AGF to capture all the new values. + */ + pag->pagf_repair_levels[XFS_BTNUM_BNOi] = pag->pagf_levels[XFS_BTNUM_BNOi]; + pag->pagf_repair_levels[XFS_BTNUM_CNTi] = pag->pagf_levels[XFS_BTNUM_CNTi]; + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagf(sc); +} + +/* + * Use the collected free space information to stage new free space btrees. + * If this is successful we'll return with the new btree root + * information logged to the repair transaction but not yet committed. + */ +STATIC int +xrep_abt_build_new_trees( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_btree_cur *bno_cur; + struct xfs_btree_cur *cnt_cur; + struct xfs_perag *pag = sc->sa.pag; + bool needs_resort = false; + int error; + + /* + * Sort the free extents by length so that we can set up the free space + * btrees in as few extents as possible. This reduces the amount of + * deferred rmap / free work we have to do at the end. + */ + error = xrep_cntbt_sort_records(ra, false); + if (error) + return error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the AG header. + */ + xrep_newbt_init_bare(&ra->new_bnobt, sc); + xrep_newbt_init_bare(&ra->new_cntbt, sc); + + ra->new_bnobt.bload.get_records = xrep_abt_get_records; + ra->new_cntbt.bload.get_records = xrep_abt_get_records; + + ra->new_bnobt.bload.claim_block = xrep_abt_claim_block; + ra->new_cntbt.bload.claim_block = xrep_abt_claim_block; + + /* Allocate cursors for the staged btrees. 
*/ + bno_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_bnobt.afake, + pag, XFS_BTNUM_BNO); + cnt_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_cntbt.afake, + pag, XFS_BTNUM_CNT); + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* Reserve the space we'll need for the new btrees. */ + error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort); + if (error) + goto err_cur; + + /* + * If we need to re-sort the free extents by length, do so so that we + * can put the records into the cntbt in the correct order. + */ + if (needs_resort) { + error = xrep_cntbt_sort_records(ra, needs_resort); + if (error) + goto err_cur; + } + + /* + * Due to btree slack factors, it's possible for a new btree to be one + * level taller than the old btree. Update the alternate incore btree + * height so that we don't trip the verifiers when writing the new + * btree blocks to disk. + */ + pag->pagf_repair_levels[XFS_BTNUM_BNOi] = + ra->new_bnobt.bload.btree_height; + pag->pagf_repair_levels[XFS_BTNUM_CNTi] = + ra->new_cntbt.bload.btree_height; + + /* Load the free space by length tree. */ + ra->array_cur = XFARRAY_CURSOR_INIT; + ra->longest = 0; + error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra); + if (error) + goto err_levels; + + error = xrep_bnobt_sort_records(ra); + if (error) + return error; + + /* Load the free space by block number tree. */ + ra->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra); + if (error) + goto err_levels; + + /* + * Install the new btrees in the AG header. After this point the old + * btrees are no longer accessible and the new trees are live. + */ + xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(bno_cur, 0); + xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(cnt_cur, 0); + + /* Reset the AGF counters now that we've changed the btree shape. */ + error = xrep_abt_reset_counters(ra); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + xrep_abt_dispose_reservations(ra, error); + + return xrep_roll_ag_trans(sc); + +err_levels: + pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0; + pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0; +err_cur: + xfs_btree_del_cursor(cnt_cur, error); + xfs_btree_del_cursor(bno_cur, error); +err_newbt: + xrep_abt_dispose_reservations(ra, error); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_abt_remove_old_trees( + struct xrep_abt *ra) +{ + struct xfs_perag *pag = ra->sc->sa.pag; + int error; + + /* Free the old btree blocks if they're not in use. */ + error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks, + &XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE); + if (error) + return error; + + /* + * Now that we've zapped all the old allocbt blocks we can turn off + * the alternate height mechanism. + */ + pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0; + pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0; + return 0; +} + +/* Repair the freespace btrees for some AG. */ +int +xrep_allocbt( + struct xfs_scrub *sc) +{ + struct xrep_abt *ra; + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* We require the rmapbt to rebuild anything. 
*/ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS); + if (!ra) + return -ENOMEM; + ra->sc = sc; + + /* We rebuild both data structures. */ + sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT; + + /* + * Make sure the busy extent list is clear because we can't put extents + * on there twice. In theory we cleared this before we started, but + * let's not risk the filesystem. + */ + if (!xfs_extent_busy_list_empty(sc->sa.pag)) { + error = -EDEADLOCK; + goto out_ra; + } + + /* Set up enough storage to handle maximally fragmented free space. */ + descr = xchk_xfile_ag_descr(sc, "free space records"); + error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2, + sizeof(struct xfs_alloc_rec_incore), + &ra->free_records); + kfree(descr); + if (error) + goto out_ra; + + /* Collect the free space data and find the old btree blocks. */ + xagb_bitmap_init(&ra->old_allocbt_blocks); + error = xrep_abt_find_freespace(ra); + if (error) + goto out_bitmap; + + /* Rebuild the free space information. */ + error = xrep_abt_build_new_trees(ra); + if (error) + goto out_bitmap; + + /* Kill the old trees. */ + error = xrep_abt_remove_old_trees(ra); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&ra->old_allocbt_blocks); + xfarray_destroy(ra->free_records); +out_ra: + kfree(ra); + return error; +} + +/* Make sure both btrees are ok after we've rebuilt them. */ +int +xrep_revalidate_allocbt( + struct xfs_scrub *sc) +{ + __u32 old_type = sc->sm->sm_type; + int error; + + /* + * We must update sm_type temporarily so that the tree-to-tree cross + * reference checks will work in the correct direction, and also so + * that tracing will report correctly if there are more errors. + */ + sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT; + error = xchk_allocbt(sc); + if (error) + goto out; + + sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT; + error = xchk_allocbt(sc); +out: + sc->sm->sm_type = old_type; + return error; +} diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 6c16d9530cca..83c7feb38714 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -527,28 +527,23 @@ xchk_xattr_check_sf( struct xfs_scrub *sc) { struct xchk_xattr_buf *ab = sc->buf; - struct xfs_attr_shortform *sf; - struct xfs_attr_sf_entry *sfe; + struct xfs_ifork *ifp = &sc->ip->i_af; + struct xfs_attr_sf_hdr *sf = ifp->if_data; + struct xfs_attr_sf_entry *sfe = xfs_attr_sf_firstentry(sf); struct xfs_attr_sf_entry *next; - struct xfs_ifork *ifp; - unsigned char *end; + unsigned char *end = ifp->if_data + ifp->if_bytes; int i; int error = 0; - ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); - bitmap_zero(ab->usedmap, ifp->if_bytes); - sf = (struct xfs_attr_shortform *)sc->ip->i_af.if_u1.if_data; - end = (unsigned char *)ifp->if_u1.if_data + ifp->if_bytes; - xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(sf->hdr)); + xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(*sf)); - sfe = &sf->list[0]; if ((unsigned char *)sfe > end) { xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); return 0; } - for (i = 0; i < sf->hdr.count; i++) { + for (i = 0; i < sf->count; i++) { unsigned char *name = sfe->nameval; unsigned char *value = &sfe->nameval[sfe->namelen]; diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index e0c89a9a0ca0..1449bb5262d9 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -16,7 +16,9 @@ #include <linux/interval_tree_generic.h> -struct xbitmap_node { +/* u64 bitmap */ + +struct xbitmap64_node { struct rb_node bn_rbnode; /* First set bit of this interval 
and subtree. */ @@ -39,72 +41,72 @@ struct xbitmap_node { * forward-declare them anyway for clarity. */ static inline void -xbitmap_tree_insert(struct xbitmap_node *node, struct rb_root_cached *root); +xbitmap64_tree_insert(struct xbitmap64_node *node, struct rb_root_cached *root); static inline void -xbitmap_tree_remove(struct xbitmap_node *node, struct rb_root_cached *root); +xbitmap64_tree_remove(struct xbitmap64_node *node, struct rb_root_cached *root); -static inline struct xbitmap_node * -xbitmap_tree_iter_first(struct rb_root_cached *root, uint64_t start, +static inline struct xbitmap64_node * +xbitmap64_tree_iter_first(struct rb_root_cached *root, uint64_t start, uint64_t last); -static inline struct xbitmap_node * -xbitmap_tree_iter_next(struct xbitmap_node *node, uint64_t start, +static inline struct xbitmap64_node * +xbitmap64_tree_iter_next(struct xbitmap64_node *node, uint64_t start, uint64_t last); -INTERVAL_TREE_DEFINE(struct xbitmap_node, bn_rbnode, uint64_t, - __bn_subtree_last, START, LAST, static inline, xbitmap_tree) +INTERVAL_TREE_DEFINE(struct xbitmap64_node, bn_rbnode, uint64_t, + __bn_subtree_last, START, LAST, static inline, xbitmap64_tree) /* Iterate each interval of a bitmap. Do not change the bitmap. */ -#define for_each_xbitmap_extent(bn, bitmap) \ +#define for_each_xbitmap64_extent(bn, bitmap) \ for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \ - struct xbitmap_node, bn_rbnode); \ + struct xbitmap64_node, bn_rbnode); \ (bn) != NULL; \ (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \ - struct xbitmap_node, bn_rbnode)) + struct xbitmap64_node, bn_rbnode)) /* Clear a range of this bitmap. */ int -xbitmap_clear( - struct xbitmap *bitmap, +xbitmap64_clear( + struct xbitmap64 *bitmap, uint64_t start, uint64_t len) { - struct xbitmap_node *bn; - struct xbitmap_node *new_bn; + struct xbitmap64_node *bn; + struct xbitmap64_node *new_bn; uint64_t last = start + len - 1; - while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last))) { + while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last))) { if (bn->bn_start < start && bn->bn_last > last) { uint64_t old_last = bn->bn_last; /* overlaps with the entire clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_last = start - 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); /* add an extent */ - new_bn = kmalloc(sizeof(struct xbitmap_node), + new_bn = kmalloc(sizeof(struct xbitmap64_node), XCHK_GFP_FLAGS); if (!new_bn) return -ENOMEM; new_bn->bn_start = last + 1; new_bn->bn_last = old_last; - xbitmap_tree_insert(new_bn, &bitmap->xb_root); + xbitmap64_tree_insert(new_bn, &bitmap->xb_root); } else if (bn->bn_start < start) { /* overlaps with the left side of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_last = start - 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); } else if (bn->bn_last > last) { /* overlaps with the right side of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_start = last + 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); break; } else { /* in the middle of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); kfree(bn); } } @@ -114,59 +116,59 @@ xbitmap_clear( /* 
Set a range of this bitmap. */ int -xbitmap_set( - struct xbitmap *bitmap, +xbitmap64_set( + struct xbitmap64 *bitmap, uint64_t start, uint64_t len) { - struct xbitmap_node *left; - struct xbitmap_node *right; + struct xbitmap64_node *left; + struct xbitmap64_node *right; uint64_t last = start + len - 1; int error; /* Is this whole range already set? */ - left = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + left = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last); if (left && left->bn_start <= start && left->bn_last >= last) return 0; /* Clear out everything in the range we want to set. */ - error = xbitmap_clear(bitmap, start, len); + error = xbitmap64_clear(bitmap, start, len); if (error) return error; /* Do we have a left-adjacent extent? */ - left = xbitmap_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); + left = xbitmap64_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); ASSERT(!left || left->bn_last + 1 == start); /* Do we have a right-adjacent extent? */ - right = xbitmap_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); + right = xbitmap64_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); ASSERT(!right || right->bn_start == last + 1); if (left && right) { /* combine left and right adjacent extent */ - xbitmap_tree_remove(left, &bitmap->xb_root); - xbitmap_tree_remove(right, &bitmap->xb_root); + xbitmap64_tree_remove(left, &bitmap->xb_root); + xbitmap64_tree_remove(right, &bitmap->xb_root); left->bn_last = right->bn_last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); kfree(right); } else if (left) { /* combine with left extent */ - xbitmap_tree_remove(left, &bitmap->xb_root); + xbitmap64_tree_remove(left, &bitmap->xb_root); left->bn_last = last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); } else if (right) { /* combine with right extent */ - xbitmap_tree_remove(right, &bitmap->xb_root); + xbitmap64_tree_remove(right, &bitmap->xb_root); right->bn_start = start; - xbitmap_tree_insert(right, &bitmap->xb_root); + xbitmap64_tree_insert(right, &bitmap->xb_root); } else { /* add an extent */ - left = kmalloc(sizeof(struct xbitmap_node), XCHK_GFP_FLAGS); + left = kmalloc(sizeof(struct xbitmap64_node), XCHK_GFP_FLAGS); if (!left) return -ENOMEM; left->bn_start = start; left->bn_last = last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); } return 0; @@ -174,21 +176,21 @@ xbitmap_set( /* Free everything related to this bitmap. */ void -xbitmap_destroy( - struct xbitmap *bitmap) +xbitmap64_destroy( + struct xbitmap64 *bitmap) { - struct xbitmap_node *bn; + struct xbitmap64_node *bn; - while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) { - xbitmap_tree_remove(bn, &bitmap->xb_root); + while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) { + xbitmap64_tree_remove(bn, &bitmap->xb_root); kfree(bn); } } /* Set up a per-AG block bitmap. */ void -xbitmap_init( - struct xbitmap *bitmap) +xbitmap64_init( + struct xbitmap64 *bitmap) { bitmap->xb_root = RB_ROOT_CACHED; } @@ -208,18 +210,18 @@ xbitmap_init( * This is the logical equivalent of bitmap &= ~sub. 
*/ int -xbitmap_disunion( - struct xbitmap *bitmap, - struct xbitmap *sub) +xbitmap64_disunion( + struct xbitmap64 *bitmap, + struct xbitmap64 *sub) { - struct xbitmap_node *bn; + struct xbitmap64_node *bn; int error; - if (xbitmap_empty(bitmap) || xbitmap_empty(sub)) + if (xbitmap64_empty(bitmap) || xbitmap64_empty(sub)) return 0; - for_each_xbitmap_extent(bn, sub) { - error = xbitmap_clear(bitmap, bn->bn_start, + for_each_xbitmap64_extent(bn, sub) { + error = xbitmap64_clear(bitmap, bn->bn_start, bn->bn_last - bn->bn_start + 1); if (error) return error; @@ -228,88 +230,273 @@ xbitmap_disunion( return 0; } +/* How many bits are set in this bitmap? */ +uint64_t +xbitmap64_hweight( + struct xbitmap64 *bitmap) +{ + struct xbitmap64_node *bn; + uint64_t ret = 0; + + for_each_xbitmap64_extent(bn, bitmap) + ret += bn->bn_last - bn->bn_start + 1; + + return ret; +} + +/* Call a function for every run of set bits in this bitmap. */ +int +xbitmap64_walk( + struct xbitmap64 *bitmap, + xbitmap64_walk_fn fn, + void *priv) +{ + struct xbitmap64_node *bn; + int error = 0; + + for_each_xbitmap64_extent(bn, bitmap) { + error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); + if (error) + break; + } + + return error; +} + +/* Does this bitmap have no bits set at all? */ +bool +xbitmap64_empty( + struct xbitmap64 *bitmap) +{ + return bitmap->xb_root.rb_root.rb_node == NULL; +} + +/* Is the start of the range set or clear? And for how long? */ +bool +xbitmap64_test( + struct xbitmap64 *bitmap, + uint64_t start, + uint64_t *len) +{ + struct xbitmap64_node *bn; + uint64_t last = start + *len - 1; + + bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last); + if (!bn) + return false; + if (bn->bn_start <= start) { + if (bn->bn_last < last) + *len = bn->bn_last - start + 1; + return true; + } + *len = bn->bn_start - start; + return false; +} + +/* u32 bitmap */ + +struct xbitmap32_node { + struct rb_node bn_rbnode; + + /* First set bit of this interval and subtree. */ + uint32_t bn_start; + + /* Last set bit of this interval. */ + uint32_t bn_last; + + /* Last set bit of this subtree. Do not touch this. */ + uint32_t __bn_subtree_last; +}; + +/* Define our own interval tree type with uint32_t parameters. */ + /* - * Record all btree blocks seen while iterating all records of a btree. - * - * We know that the btree query_all function starts at the left edge and walks - * towards the right edge of the tree. Therefore, we know that we can walk up - * the btree cursor towards the root; if the pointer for a given level points - * to the first record/key in that block, we haven't seen this block before; - * and therefore we need to remember that we saw this block in the btree. - * - * So if our btree is: - * - * 4 - * / | \ - * 1 2 3 - * - * Pretend for this example that each leaf block has 100 btree records. For - * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we - * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so - * we record block 4. The list is [1, 4]. - * - * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit - * the loop. The list remains [1, 4]. - * - * For the 101st btree record, we've moved onto leaf block 2. Now - * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that - * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2]. - * - * For the 102nd record, bc_levels[0].ptr == 2, so we continue. - * - * For the 201st record, we've moved on to leaf block 3. 
- * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3]. - * - * For the 300th record we just exit, with the list being [1, 4, 2, 3]. + * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll + * forward-declare them anyway for clarity. */ +static inline void +xbitmap32_tree_insert(struct xbitmap32_node *node, struct rb_root_cached *root); -/* Mark a btree block to the agblock bitmap. */ -STATIC int -xagb_bitmap_visit_btblock( - struct xfs_btree_cur *cur, - int level, - void *priv) +static inline void +xbitmap32_tree_remove(struct xbitmap32_node *node, struct rb_root_cached *root); + +static inline struct xbitmap32_node * +xbitmap32_tree_iter_first(struct rb_root_cached *root, uint32_t start, + uint32_t last); + +static inline struct xbitmap32_node * +xbitmap32_tree_iter_next(struct xbitmap32_node *node, uint32_t start, + uint32_t last); + +INTERVAL_TREE_DEFINE(struct xbitmap32_node, bn_rbnode, uint32_t, + __bn_subtree_last, START, LAST, static inline, xbitmap32_tree) + +/* Iterate each interval of a bitmap. Do not change the bitmap. */ +#define for_each_xbitmap32_extent(bn, bitmap) \ + for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \ + struct xbitmap32_node, bn_rbnode); \ + (bn) != NULL; \ + (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \ + struct xbitmap32_node, bn_rbnode)) + +/* Clear a range of this bitmap. */ +int +xbitmap32_clear( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t len) { - struct xagb_bitmap *bitmap = priv; - struct xfs_buf *bp; - xfs_fsblock_t fsbno; - xfs_agblock_t agbno; + struct xbitmap32_node *bn; + struct xbitmap32_node *new_bn; + uint32_t last = start + len - 1; - xfs_btree_get_block(cur, level, &bp); - if (!bp) - return 0; + while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last))) { + if (bn->bn_start < start && bn->bn_last > last) { + uint32_t old_last = bn->bn_last; - fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); - agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + /* overlaps with the entire clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); - return xagb_bitmap_set(bitmap, agbno, 1); + /* add an extent */ + new_bn = kmalloc(sizeof(struct xbitmap32_node), + XCHK_GFP_FLAGS); + if (!new_bn) + return -ENOMEM; + new_bn->bn_start = last + 1; + new_bn->bn_last = old_last; + xbitmap32_tree_insert(new_bn, &bitmap->xb_root); + } else if (bn->bn_start < start) { + /* overlaps with the left side of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); + } else if (bn->bn_last > last) { + /* overlaps with the right side of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_start = last + 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); + break; + } else { + /* in the middle of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + kfree(bn); + } + } + + return 0; } -/* Mark all (per-AG) btree blocks in the agblock bitmap. */ +/* Set a range of this bitmap. */ int -xagb_bitmap_set_btblocks( - struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur) +xbitmap32_set( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t len) { - return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock, - XFS_BTREE_VISIT_ALL, bitmap); + struct xbitmap32_node *left; + struct xbitmap32_node *right; + uint32_t last = start + len - 1; + int error; + + /* Is this whole range already set? 
*/ + left = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last); + if (left && left->bn_start <= start && left->bn_last >= last) + return 0; + + /* Clear out everything in the range we want to set. */ + error = xbitmap32_clear(bitmap, start, len); + if (error) + return error; + + /* Do we have a left-adjacent extent? */ + left = xbitmap32_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); + ASSERT(!left || left->bn_last + 1 == start); + + /* Do we have a right-adjacent extent? */ + right = xbitmap32_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); + ASSERT(!right || right->bn_start == last + 1); + + if (left && right) { + /* combine left and right adjacent extent */ + xbitmap32_tree_remove(left, &bitmap->xb_root); + xbitmap32_tree_remove(right, &bitmap->xb_root); + left->bn_last = right->bn_last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + kfree(right); + } else if (left) { + /* combine with left extent */ + xbitmap32_tree_remove(left, &bitmap->xb_root); + left->bn_last = last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + } else if (right) { + /* combine with right extent */ + xbitmap32_tree_remove(right, &bitmap->xb_root); + right->bn_start = start; + xbitmap32_tree_insert(right, &bitmap->xb_root); + } else { + /* add an extent */ + left = kmalloc(sizeof(struct xbitmap32_node), XCHK_GFP_FLAGS); + if (!left) + return -ENOMEM; + left->bn_start = start; + left->bn_last = last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + } + + return 0; +} + +/* Free everything related to this bitmap. */ +void +xbitmap32_destroy( + struct xbitmap32 *bitmap) +{ + struct xbitmap32_node *bn; + + while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, 0, -1U))) { + xbitmap32_tree_remove(bn, &bitmap->xb_root); + kfree(bn); + } +} + +/* Set up a per-AG block bitmap. */ +void +xbitmap32_init( + struct xbitmap32 *bitmap) +{ + bitmap->xb_root = RB_ROOT_CACHED; } /* - * Record all the buffers pointed to by the btree cursor. Callers already - * engaged in a btree walk should call this function to capture the list of - * blocks going from the leaf towards the root. + * Remove all the blocks mentioned in @sub from the extents in @bitmap. + * + * The intent is that callers will iterate the rmapbt for all of its records + * for a given owner to generate @bitmap; and iterate all the blocks of the + * metadata structures that are not being rebuilt and have the same rmapbt + * owner to generate @sub. This routine subtracts all the extents + * mentioned in sub from all the extents linked in @bitmap, which leaves + * @bitmap as the list of blocks that are not accounted for, which we assume + * are the dead blocks of the old metadata structure. The blocks mentioned in + * @bitmap can be reaped. + * + * This is the logical equivalent of bitmap &= ~sub. */ int -xagb_bitmap_set_btcur_path( - struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur) +xbitmap32_disunion( + struct xbitmap32 *bitmap, + struct xbitmap32 *sub) { - int i; + struct xbitmap32_node *bn; int error; - for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) { - error = xagb_bitmap_visit_btblock(cur, i, bitmap); + if (xbitmap32_empty(bitmap) || xbitmap32_empty(sub)) + return 0; + + for_each_xbitmap32_extent(bn, sub) { + error = xbitmap32_clear(bitmap, bn->bn_start, + bn->bn_last - bn->bn_start + 1); if (error) return error; } @@ -318,14 +505,14 @@ xagb_bitmap_set_btcur_path( } /* How many bits are set in this bitmap? 
*/ -uint64_t -xbitmap_hweight( - struct xbitmap *bitmap) +uint32_t +xbitmap32_hweight( + struct xbitmap32 *bitmap) { - struct xbitmap_node *bn; - uint64_t ret = 0; + struct xbitmap32_node *bn; + uint32_t ret = 0; - for_each_xbitmap_extent(bn, bitmap) + for_each_xbitmap32_extent(bn, bitmap) ret += bn->bn_last - bn->bn_start + 1; return ret; @@ -333,15 +520,15 @@ xbitmap_hweight( /* Call a function for every run of set bits in this bitmap. */ int -xbitmap_walk( - struct xbitmap *bitmap, - xbitmap_walk_fn fn, +xbitmap32_walk( + struct xbitmap32 *bitmap, + xbitmap32_walk_fn fn, void *priv) { - struct xbitmap_node *bn; + struct xbitmap32_node *bn; int error = 0; - for_each_xbitmap_extent(bn, bitmap) { + for_each_xbitmap32_extent(bn, bitmap) { error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); if (error) break; @@ -352,23 +539,23 @@ xbitmap_walk( /* Does this bitmap have no bits set at all? */ bool -xbitmap_empty( - struct xbitmap *bitmap) +xbitmap32_empty( + struct xbitmap32 *bitmap) { return bitmap->xb_root.rb_root.rb_node == NULL; } /* Is the start of the range set or clear? And for how long? */ bool -xbitmap_test( - struct xbitmap *bitmap, - uint64_t start, - uint64_t *len) +xbitmap32_test( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t *len) { - struct xbitmap_node *bn; - uint64_t last = start + *len - 1; + struct xbitmap32_node *bn; + uint32_t last = start + *len - 1; - bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last); if (!bn) return false; if (bn->bn_start <= start) { diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index 4fe58bad6734..2df8911606d6 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -6,17 +6,19 @@ #ifndef __XFS_SCRUB_BITMAP_H__ #define __XFS_SCRUB_BITMAP_H__ -struct xbitmap { +/* u64 bitmap */ + +struct xbitmap64 { struct rb_root_cached xb_root; }; -void xbitmap_init(struct xbitmap *bitmap); -void xbitmap_destroy(struct xbitmap *bitmap); +void xbitmap64_init(struct xbitmap64 *bitmap); +void xbitmap64_destroy(struct xbitmap64 *bitmap); -int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len); -int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len); -int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub); -uint64_t xbitmap_hweight(struct xbitmap *bitmap); +int xbitmap64_clear(struct xbitmap64 *bitmap, uint64_t start, uint64_t len); +int xbitmap64_set(struct xbitmap64 *bitmap, uint64_t start, uint64_t len); +int xbitmap64_disunion(struct xbitmap64 *bitmap, struct xbitmap64 *sub); +uint64_t xbitmap64_hweight(struct xbitmap64 *bitmap); /* * Return codes for the bitmap iterator functions are 0 to continue iterating, @@ -25,84 +27,39 @@ uint64_t xbitmap_hweight(struct xbitmap *bitmap); * iteration, because neither bitmap iterator ever generates that error code on * its own. Callers must not modify the bitmap while walking it. 
*/ -typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv); -int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn, +typedef int (*xbitmap64_walk_fn)(uint64_t start, uint64_t len, void *priv); +int xbitmap64_walk(struct xbitmap64 *bitmap, xbitmap64_walk_fn fn, void *priv); -bool xbitmap_empty(struct xbitmap *bitmap); -bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len); +bool xbitmap64_empty(struct xbitmap64 *bitmap); +bool xbitmap64_test(struct xbitmap64 *bitmap, uint64_t start, uint64_t *len); -/* Bitmaps, but for type-checked for xfs_agblock_t */ +/* u32 bitmap */ -struct xagb_bitmap { - struct xbitmap agbitmap; +struct xbitmap32 { + struct rb_root_cached xb_root; }; -static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap) -{ - xbitmap_init(&bitmap->agbitmap); -} - -static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap) -{ - xbitmap_destroy(&bitmap->agbitmap); -} - -static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap, - xfs_agblock_t start, xfs_extlen_t len) -{ - return xbitmap_clear(&bitmap->agbitmap, start, len); -} -static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap, - xfs_agblock_t start, xfs_extlen_t len) -{ - return xbitmap_set(&bitmap->agbitmap, start, len); -} - -static inline bool -xagb_bitmap_test( - struct xagb_bitmap *bitmap, - xfs_agblock_t start, - xfs_extlen_t *len) -{ - uint64_t biglen = *len; - bool ret; - - ret = xbitmap_test(&bitmap->agbitmap, start, &biglen); - - if (start + biglen >= UINT_MAX) { - ASSERT(0); - biglen = UINT_MAX - start; - } - - *len = biglen; - return ret; -} - -static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap, - struct xagb_bitmap *sub) -{ - return xbitmap_disunion(&bitmap->agbitmap, &sub->agbitmap); -} +void xbitmap32_init(struct xbitmap32 *bitmap); +void xbitmap32_destroy(struct xbitmap32 *bitmap); -static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap) -{ - return xbitmap_hweight(&bitmap->agbitmap); -} -static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap) -{ - return xbitmap_empty(&bitmap->agbitmap); -} +int xbitmap32_clear(struct xbitmap32 *bitmap, uint32_t start, uint32_t len); +int xbitmap32_set(struct xbitmap32 *bitmap, uint32_t start, uint32_t len); +int xbitmap32_disunion(struct xbitmap32 *bitmap, struct xbitmap32 *sub); +uint32_t xbitmap32_hweight(struct xbitmap32 *bitmap); -static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, - xbitmap_walk_fn fn, void *priv) -{ - return xbitmap_walk(&bitmap->agbitmap, fn, priv); -} +/* + * Return codes for the bitmap iterator functions are 0 to continue iterating, + * and non-zero to stop iterating. Any non-zero value will be passed up to the + * iteration caller. The special value -ECANCELED can be used to stop + * iteration, because neither bitmap iterator ever generates that error code on + * its own. Callers must not modify the bitmap while walking it. 
+ */ +typedef int (*xbitmap32_walk_fn)(uint32_t start, uint32_t len, void *priv); +int xbitmap32_walk(struct xbitmap32 *bitmap, xbitmap32_walk_fn fn, + void *priv); -int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur); -int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur); +bool xbitmap32_empty(struct xbitmap32 *bitmap); +bool xbitmap32_test(struct xbitmap32 *bitmap, uint32_t start, uint32_t *len); #endif /* __XFS_SCRUB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 06d8c1996a33..b169cddde6da 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -19,9 +19,11 @@ #include "xfs_bmap_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "scrub/health.h" #include "xfs_ag.h" /* Set us up with an inode's bmap. */ @@ -48,9 +50,18 @@ xchk_setup_inode_bmap( if (S_ISREG(VFS_I(sc->ip)->i_mode) && sc->sm->sm_type != XFS_SCRUB_TYPE_BMBTA) { struct address_space *mapping = VFS_I(sc->ip)->i_mapping; + bool is_repair = xchk_could_repair(sc); xchk_ilock(sc, XFS_MMAPLOCK_EXCL); + /* Break all our leases, we're going to mess with things. */ + if (is_repair) { + error = xfs_break_layouts(VFS_I(sc->ip), + &sc->ilock_flags, BREAK_WRITE); + if (error) + goto out; + } + inode_dio_wait(VFS_I(sc->ip)); /* @@ -71,6 +82,15 @@ xchk_setup_inode_bmap( error = filemap_fdatawait_keep_errors(mapping); if (error && (error != -ENOSPC && error != -EIO)) goto out; + + /* Drop the page cache if we're repairing block mappings. */ + if (is_repair) { + error = invalidate_inode_pages2( + VFS_I(sc->ip)->i_mapping); + if (error) + goto out; + } + } /* Got the inode, lock it and we're ready to go. */ @@ -78,6 +98,10 @@ xchk_setup_inode_bmap( if (error) goto out; + error = xchk_ino_dqattach(sc); + if (error) + goto out; + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode */ @@ -633,6 +657,82 @@ xchk_bmap_check_ag_rmaps( } /* + * Decide if we want to scan the reverse mappings to determine if the attr + * fork /really/ has zero space mappings. + */ +STATIC bool +xchk_bmap_check_empty_attrfork( + struct xfs_inode *ip) +{ + struct xfs_ifork *ifp = &ip->i_af; + + /* + * If the dinode repair found a bad attr fork, it will reset the fork + * to extents format with zero records and wait for this scrubber + * to reconstruct the block mappings. If the fork is not in this + * state, then the fork cannot have been zapped. + */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0) + return false; + + /* + * Files can have an attr fork in EXTENTS format with zero records for + * several reasons: + * + * a) an attr set created a fork but ran out of space + * b) attr replace deleted an old attr but failed during the set step + * c) the data fork was in btree format when all attrs were deleted, so + * the fork was left in place + * d) the inode repair code zapped the fork + * + * Only in case (d) do we want to scan the rmapbt to see if we need to + * rebuild the attr fork. The fork zap code clears all DAC permission + * bits and zeroes the uid and gid, so avoid the scan if any of those + * three conditions are not met. 
+ */ + if ((VFS_I(ip)->i_mode & 0777) != 0) + return false; + if (!uid_eq(VFS_I(ip)->i_uid, GLOBAL_ROOT_UID)) + return false; + if (!gid_eq(VFS_I(ip)->i_gid, GLOBAL_ROOT_GID)) + return false; + + return true; +} + +/* + * Decide if we want to scan the reverse mappings to determine if the data + * fork /really/ has zero space mappings. + */ +STATIC bool +xchk_bmap_check_empty_datafork( + struct xfs_inode *ip) +{ + struct xfs_ifork *ifp = &ip->i_df; + + /* Don't support realtime rmap checks yet. */ + if (XFS_IS_REALTIME_INODE(ip)) + return false; + + /* + * If the dinode repair found a bad data fork, it will reset the fork + * to extents format with zero records and wait for this scrubber + * to reconstruct the block mappings. If the fork is not in this + * state, then the fork cannot have been zapped. + */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0) + return false; + + /* + * If we encounter an empty data fork along with evidence that the fork + * might not really be empty, we need to scan the reverse mappings to + * decide if we're going to rebuild the fork. Data forks with nonzero + * file size are scanned. + */ + return i_size_read(VFS_I(ip)) != 0; +} + +/* * Decide if we want to walk every rmap btree in the fs to make sure that each * rmap for this file fork has corresponding bmbt entries. */ @@ -641,7 +741,6 @@ xchk_bmap_want_check_rmaps( struct xchk_bmap_info *info) { struct xfs_scrub *sc = info->sc; - struct xfs_ifork *ifp; if (!xfs_has_rmapbt(sc->mp)) return false; @@ -650,28 +749,10 @@ xchk_bmap_want_check_rmaps( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return false; - /* Don't support realtime rmap checks yet. */ - if (info->is_rt) - return false; - - /* - * The inode repair code zaps broken inode forks by resetting them back - * to EXTENTS format and zero extent records. If we encounter a fork - * in this state along with evidence that the fork isn't supposed to be - * empty, we need to scan the reverse mappings to decide if we're going - * to rebuild the fork. Data forks with nonzero file size are scanned. - * xattr forks are never empty of content, so they are always scanned. - */ - ifp = xfs_ifork_ptr(sc->ip, info->whichfork); - if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents == 0) { - if (info->whichfork == XFS_DATA_FORK && - i_size_read(VFS_I(sc->ip)) == 0) - return false; - - return true; - } + if (info->whichfork == XFS_ATTR_FORK) + return xchk_bmap_check_empty_attrfork(sc->ip); - return false; + return xchk_bmap_check_empty_datafork(sc->ip); } /* Make sure each rmap has a corresponding bmbt entry. */ @@ -939,7 +1020,20 @@ int xchk_bmap_data( struct xfs_scrub *sc) { - return xchk_bmap(sc, XFS_DATA_FORK); + int error; + + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTD_ZAPPED)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_bmap(sc, XFS_DATA_FORK); + if (error) + return error; + + /* If the data fork is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTD_ZAPPED); + return 0; } /* Scrub an inode's attr fork. */ @@ -947,7 +1041,27 @@ int xchk_bmap_attr( struct xfs_scrub *sc) { - return xchk_bmap(sc, XFS_ATTR_FORK); + int error; + + /* + * If the attr fork has been zapped, it's possible that forkoff was + * reset to zero and hence sc->ip->i_afp is NULL. We don't want the + * NULL ifp check in xchk_bmap to conclude that the attr fork is ok, + * so short circuit that logic by setting the corruption flag and + * returning immediately. 
+ */ + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTA_ZAPPED)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_bmap(sc, XFS_ATTR_FORK); + if (error) + return error; + + /* If the attr fork is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTA_ZAPPED); + return 0; } /* Scrub an inode's CoW fork. */ diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c new file mode 100644 index 000000000000..a4bb89fdd510 --- /dev/null +++ b/fs/xfs/scrub/bmap_repair.c @@ -0,0 +1,867 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_quota.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "xfs_reflink.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Inode Fork Block Mapping (BMBT) Repair + * ====================================== + * + * Gather all the rmap records for the inode and fork we're fixing, reset the + * incore fork, then recreate the btree. + */ + +enum reflink_scan_state { + RLS_IRRELEVANT = -1, /* not applicable to this file */ + RLS_UNKNOWN, /* shared extent scans required */ + RLS_SET_IFLAG, /* iflag must be set */ +}; + +struct xrep_bmap { + /* Old bmbt blocks */ + struct xfsb_bitmap old_bmbt_blocks; + + /* New fork. */ + struct xrep_newbt new_bmapbt; + + /* List of new bmap records. */ + struct xfarray *bmap_records; + + struct xfs_scrub *sc; + + /* How many blocks did we find allocated to this file? */ + xfs_rfsblock_t nblocks; + + /* How many bmbt blocks did we find for this fork? */ + xfs_rfsblock_t old_bmbt_block_count; + + /* get_records()'s position in the bmap record array. */ + xfarray_idx_t array_cur; + + /* How many real (non-hole, non-delalloc) mappings do we have? */ + uint64_t real_mappings; + + /* Which fork are we fixing? */ + int whichfork; + + /* Should the REFLINK flag be set when the repair is over? */ + enum reflink_scan_state reflink_scan; + + /* Do we allow unwritten extents? */ + bool allow_unwritten; +}; + +/* Is this space extent shared? Flag the inode if it is. 
*/ +STATIC int +xrep_bmap_discover_shared( + struct xrep_bmap *rb, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount) +{ + struct xfs_scrub *sc = rb->sc; + xfs_agblock_t agbno; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int error; + + agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock); + error = xfs_refcount_find_shared(sc->sa.refc_cur, agbno, blockcount, + &fbno, &flen, false); + if (error) + return error; + + if (fbno != NULLAGBLOCK) + rb->reflink_scan = RLS_SET_IFLAG; + + return 0; +} + +/* Remember this reverse-mapping as a series of bmap records. */ +STATIC int +xrep_bmap_from_rmap( + struct xrep_bmap *rb, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + bool unwritten) +{ + struct xfs_bmbt_irec irec = { + .br_startoff = startoff, + .br_startblock = startblock, + .br_state = unwritten ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM, + }; + struct xfs_bmbt_rec rbe; + struct xfs_scrub *sc = rb->sc; + int error = 0; + + /* + * If we're repairing the data fork of a non-reflinked regular file on + * a reflink filesystem, we need to figure out if this space extent is + * shared. + */ + if (rb->reflink_scan == RLS_UNKNOWN && !unwritten) { + error = xrep_bmap_discover_shared(rb, startblock, blockcount); + if (error) + return error; + } + + do { + xfs_failaddr_t fa; + + irec.br_blockcount = min_t(xfs_filblks_t, blockcount, + XFS_MAX_BMBT_EXTLEN); + + fa = xfs_bmap_validate_extent(sc->ip, rb->whichfork, &irec); + if (fa) + return -EFSCORRUPTED; + + xfs_bmbt_disk_set_all(&rbe, &irec); + + trace_xrep_bmap_found(sc->ip, rb->whichfork, &irec); + + if (xchk_should_terminate(sc, &error)) + return error; + + error = xfarray_append(rb->bmap_records, &rbe); + if (error) + return error; + + rb->real_mappings++; + + irec.br_startblock += irec.br_blockcount; + irec.br_startoff += irec.br_blockcount; + blockcount -= irec.br_blockcount; + } while (blockcount > 0); + + return 0; +} + +/* Check for any obvious errors or conflicts in the file mapping. */ +STATIC int +xrep_bmap_check_fork_rmap( + struct xrep_bmap *rb, + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec) +{ + struct xfs_scrub *sc = rb->sc; + enum xbtree_recpacking outcome; + int error; + + /* + * Data extents for rt files are never stored on the data device, but + * everything else (xattrs, bmbt blocks) can be. + */ + if (XFS_IS_REALTIME_INODE(sc->ip) && + !(rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) + return -EFSCORRUPTED; + + /* Check that this is within the AG. */ + if (!xfs_verify_agbext(cur->bc_ag.pag, rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* Check the file offset range. */ + if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && + !xfs_verify_fileext(sc->mp, rec->rm_offset, rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* No contradictory flags. */ + if ((rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)) && + (rec->rm_flags & XFS_RMAP_UNWRITTEN)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock, + rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->rm_startblock, rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Record extents that belong to this inode's fork. 
*/ +STATIC int +xrep_bmap_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_bmap *rb = priv; + struct xfs_mount *mp = cur->bc_mp; + xfs_fsblock_t fsbno; + int error = 0; + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + if (rec->rm_owner != rb->sc->ip->i_ino) + return 0; + + error = xrep_bmap_check_fork_rmap(rb, cur, rec); + if (error) + return error; + + /* + * Record all blocks allocated to this file even if the extent isn't + * for the fork we're rebuilding so that we can reset di_nblocks later. + */ + rb->nblocks += rec->rm_blockcount; + + /* If this rmap isn't for the fork we want, we're done. */ + if (rb->whichfork == XFS_DATA_FORK && + (rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + if (rb->whichfork == XFS_ATTR_FORK && + !(rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + + /* Reject unwritten extents if we don't allow those. */ + if ((rec->rm_flags & XFS_RMAP_UNWRITTEN) && !rb->allow_unwritten) + return -EFSCORRUPTED; + + fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, + rec->rm_startblock); + + if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) { + rb->old_bmbt_block_count += rec->rm_blockcount; + return xfsb_bitmap_set(&rb->old_bmbt_blocks, fsbno, + rec->rm_blockcount); + } + + return xrep_bmap_from_rmap(rb, rec->rm_offset, fsbno, + rec->rm_blockcount, + rec->rm_flags & XFS_RMAP_UNWRITTEN); +} + +/* + * Compare two block mapping records. We want to sort in order of increasing + * file offset. + */ +static int +xrep_bmap_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_bmbt_rec *ba = a; + const struct xfs_bmbt_rec *bb = b; + xfs_fileoff_t ao = xfs_bmbt_disk_get_startoff(ba); + xfs_fileoff_t bo = xfs_bmbt_disk_get_startoff(bb); + + if (ao > bo) + return 1; + else if (ao < bo) + return -1; + return 0; +} + +/* + * Sort the bmap extents by fork offset or else the records will be in the + * wrong order. Ensure there are no overlaps in the file offset ranges. + */ +STATIC int +xrep_bmap_sort_records( + struct xrep_bmap *rb) +{ + struct xfs_bmbt_irec irec; + xfs_fileoff_t next_off = 0; + xfarray_idx_t array_cur; + int error; + + error = xfarray_sort(rb->bmap_records, xrep_bmap_extent_cmp, + XFARRAY_SORT_KILLABLE); + if (error) + return error; + + foreach_xfarray_idx(rb->bmap_records, array_cur) { + struct xfs_bmbt_rec rec; + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + error = xfarray_load(rb->bmap_records, array_cur, &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, &irec); + + if (irec.br_startoff < next_off) + return -EFSCORRUPTED; + + next_off = irec.br_startoff + irec.br_blockcount; + } + + return 0; +} + +/* Scan one AG for reverse mappings that we can turn into extent maps. */ +STATIC int +xrep_bmap_scan_ag( + struct xrep_bmap *rb, + struct xfs_perag *pag) +{ + struct xfs_scrub *sc = rb->sc; + int error; + + error = xrep_ag_init(sc, pag, &sc->sa); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_bmap_walk_rmap, rb); + xchk_ag_free(sc, &sc->sa); + return error; +} + +/* Find the delalloc extents from the old incore extent tree. */ +STATIC int +xrep_bmap_find_delalloc( + struct xrep_bmap *rb) +{ + struct xfs_bmbt_irec irec; + struct xfs_iext_cursor icur; + struct xfs_bmbt_rec rbe; + struct xfs_inode *ip = rb->sc->ip; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, rb->whichfork); + int error = 0; + + /* + * Skip this scan if we don't expect to find delayed allocation + * reservations in this fork. 
+ */ + if (rb->whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0) + return 0; + + for_each_xfs_iext(ifp, &icur, &irec) { + if (!isnullstartblock(irec.br_startblock)) + continue; + + xfs_bmbt_disk_set_all(&rbe, &irec); + + trace_xrep_bmap_found(ip, rb->whichfork, &irec); + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + error = xfarray_append(rb->bmap_records, &rbe); + if (error) + return error; + } + + return 0; +} + +/* + * Collect block mappings for this fork of this inode and decide if we have + * enough space to rebuild. Caller is responsible for cleaning up the list if + * anything goes wrong. + */ +STATIC int +xrep_bmap_find_mappings( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xfs_perag *pag; + xfs_agnumber_t agno; + int error = 0; + + /* Iterate the rmaps for extents. */ + for_each_perag(sc->mp, agno, pag) { + error = xrep_bmap_scan_ag(rb, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + return xrep_bmap_find_delalloc(rb); +} + +/* Retrieve real extent mappings for bulk loading the bmap btree. */ +STATIC int +xrep_bmap_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_bmbt_rec rec; + struct xfs_bmbt_irec *irec = &cur->bc_rec.b; + struct xrep_bmap *rb = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + do { + error = xfarray_load(rb->bmap_records, rb->array_cur++, + &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, irec); + } while (isnullstartblock(irec->br_startblock)); + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_bmap_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_bmap *rb = priv; + + return xrep_newbt_claim_block(cur, &rb->new_bmapbt, ptr); +} + +/* Figure out how much space we need to create the incore btree root block. */ +STATIC size_t +xrep_bmap_iroot_size( + struct xfs_btree_cur *cur, + unsigned int level, + unsigned int nr_this_level, + void *priv) +{ + ASSERT(level > 0); + + return XFS_BMAP_BROOT_SPACE_CALC(cur->bc_mp, nr_this_level); +} + +/* Update the inode counters. */ +STATIC int +xrep_bmap_reset_counters( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake; + int64_t delta; + + if (rb->reflink_scan == RLS_SET_IFLAG) + sc->ip->i_diflags2 |= XFS_DIFLAG2_REFLINK; + + /* + * Update the inode block counts to reflect the extents we found in the + * rmapbt. + */ + delta = ifake->if_blocks - rb->old_bmbt_block_count; + sc->ip->i_nblocks = rb->nblocks + delta; + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + + /* + * Adjust the quota counts by the difference in size between the old + * and new bmbt. + */ + xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT, delta); + return 0; +} + +/* + * Create a new iext tree and load it with block mappings. If the inode is + * in extents format, that's all we need to do to commit the new mappings. + * If it is in btree format, this takes care of preloading the incore tree. 
+ */ +STATIC int +xrep_bmap_extents_load( + struct xrep_bmap *rb) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec irec; + struct xfs_ifork *ifp = rb->new_bmapbt.ifake.if_fork; + xfarray_idx_t array_cur; + int error; + + ASSERT(ifp->if_bytes == 0); + + /* Add all the mappings (incl. delalloc) to the incore extent tree. */ + xfs_iext_first(ifp, &icur); + foreach_xfarray_idx(rb->bmap_records, array_cur) { + struct xfs_bmbt_rec rec; + + error = xfarray_load(rb->bmap_records, array_cur, &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, &irec); + + xfs_iext_insert_raw(ifp, &icur, &irec); + if (!isnullstartblock(irec.br_startblock)) + ifp->if_nextents++; + + xfs_iext_next(ifp, &icur); + } + + return xrep_ino_ensure_extent_count(rb->sc, rb->whichfork, + ifp->if_nextents); +} + +/* + * Reserve new btree blocks, bulk load the bmap records into the ondisk btree, + * and load the incore extent tree. + */ +STATIC int +xrep_bmap_btree_load( + struct xrep_bmap *rb, + struct xfs_btree_cur *bmap_cur) +{ + struct xfs_scrub *sc = rb->sc; + int error; + + /* Compute how many blocks we'll need. */ + error = xfs_btree_bload_compute_geometry(bmap_cur, + &rb->new_bmapbt.bload, rb->real_mappings); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * Guess how many blocks we're going to need to rebuild an entire bmap + * from the number of extents we found, and pump up our transaction to + * have sufficient block reservation. We're allowed to exceed file + * quota to repair inconsistent metadata. + */ + error = xfs_trans_reserve_more_inode(sc->tp, sc->ip, + rb->new_bmapbt.bload.nr_blocks, 0, true); + if (error) + return error; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rb->new_bmapbt, + rb->new_bmapbt.bload.nr_blocks); + if (error) + return error; + + /* Add all observed bmap records. */ + rb->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(bmap_cur, &rb->new_bmapbt.bload, rb); + if (error) + return error; + + /* + * Load the new bmap records into the new incore extent tree to + * preserve delalloc reservations for regular files. The directory + * code loads the extent tree during xfs_dir_open and assumes + * thereafter that it remains loaded, so we must not violate that + * assumption. + */ + return xrep_bmap_extents_load(rb); +} + +/* + * Use the collected bmap information to stage a new bmap fork. If this is + * successful we'll return with the new fork information logged to the repair + * transaction but not yet committed. The caller must ensure that the inode + * is joined to the transaction; the inode will be joined to a clean + * transaction when the function returns. + */ +STATIC int +xrep_bmap_build_new_fork( + struct xrep_bmap *rb) +{ + struct xfs_owner_info oinfo; + struct xfs_scrub *sc = rb->sc; + struct xfs_btree_cur *bmap_cur; + struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake; + int error; + + error = xrep_bmap_sort_records(rb); + if (error) + return error; + + /* + * Prepare to construct the new fork by initializing the new btree + * structure and creating a fake ifork in the ifakeroot structure. 
+ */ + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork); + error = xrep_newbt_init_inode(&rb->new_bmapbt, sc, rb->whichfork, + &oinfo); + if (error) + return error; + + rb->new_bmapbt.bload.get_records = xrep_bmap_get_records; + rb->new_bmapbt.bload.claim_block = xrep_bmap_claim_block; + rb->new_bmapbt.bload.iroot_size = xrep_bmap_iroot_size; + bmap_cur = xfs_bmbt_stage_cursor(sc->mp, sc->ip, ifake); + + /* + * Figure out the size and format of the new fork, then fill it with + * all the bmap records we've found. Join the inode to the transaction + * so that we can roll the transaction while holding the inode locked. + */ + if (rb->real_mappings <= XFS_IFORK_MAXEXT(sc->ip, rb->whichfork)) { + ifake->if_fork->if_format = XFS_DINODE_FMT_EXTENTS; + error = xrep_bmap_extents_load(rb); + } else { + ifake->if_fork->if_format = XFS_DINODE_FMT_BTREE; + error = xrep_bmap_btree_load(rb, bmap_cur); + } + if (error) + goto err_cur; + + /* + * Install the new fork in the inode. After this point the old mapping + * data are no longer accessible and the new tree is live. We delete + * the cursor immediately after committing the staged root because the + * staged fork might be in extents format. + */ + xfs_bmbt_commit_staged_btree(bmap_cur, sc->tp, rb->whichfork); + xfs_btree_del_cursor(bmap_cur, 0); + + /* Reset the inode counters now that we've changed the fork. */ + error = xrep_bmap_reset_counters(rb); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rb->new_bmapbt); + if (error) + return error; + + return xrep_roll_trans(sc); + +err_cur: + if (bmap_cur) + xfs_btree_del_cursor(bmap_cur, error); +err_newbt: + xrep_newbt_cancel(&rb->new_bmapbt); + return error; +} + +/* + * Now that we've logged the new inode btree, invalidate all of the old blocks + * and free them, if there were any. + */ +STATIC int +xrep_bmap_remove_old_tree( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xfs_owner_info oinfo; + + /* Free the old bmbt blocks if they're not in use. */ + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork); + return xrep_reap_fsblocks(sc, &rb->old_bmbt_blocks, &oinfo); +} + +/* Check for garbage inputs. Returns -ECANCELED if there's nothing to do. */ +STATIC int +xrep_bmap_check_inputs( + struct xfs_scrub *sc, + int whichfork) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); + + ASSERT(whichfork == XFS_DATA_FORK || whichfork == XFS_ATTR_FORK); + + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + + /* No fork means nothing to rebuild. */ + if (!ifp) + return -ECANCELED; + + /* + * We only know how to repair extent mappings, which is to say that we + * only support extents and btree fork format. Repairs to a local + * format fork require a higher level repair function, so we do not + * have any work to do here. + */ + switch (ifp->if_format) { + case XFS_DINODE_FMT_DEV: + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_UUID: + return -ECANCELED; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + break; + default: + return -EFSCORRUPTED; + } + + if (whichfork == XFS_ATTR_FORK) + return 0; + + /* Only files, symlinks, and directories get to have data forks. */ + switch (VFS_I(sc->ip)->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + /* ok */ + break; + default: + return -EINVAL; + } + + /* Don't know how to rebuild realtime data forks. 
*/ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return -EOPNOTSUPP; + + return 0; +} + +/* Set up the initial state of the reflink scan. */ +static inline enum reflink_scan_state +xrep_bmap_init_reflink_scan( + struct xfs_scrub *sc, + int whichfork) +{ + /* cannot share on non-reflink filesystem */ + if (!xfs_has_reflink(sc->mp)) + return RLS_IRRELEVANT; + + /* preserve flag if it's already set */ + if (xfs_is_reflink_inode(sc->ip)) + return RLS_SET_IFLAG; + + /* can only share regular files */ + if (!S_ISREG(VFS_I(sc->ip)->i_mode)) + return RLS_IRRELEVANT; + + /* cannot share attr fork extents */ + if (whichfork != XFS_DATA_FORK) + return RLS_IRRELEVANT; + + /* cannot share realtime extents */ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return RLS_IRRELEVANT; + + return RLS_UNKNOWN; +} + +/* Repair an inode fork. */ +int +xrep_bmap( + struct xfs_scrub *sc, + int whichfork, + bool allow_unwritten) +{ + struct xrep_bmap *rb; + char *descr; + unsigned int max_bmbt_recs; + bool large_extcount; + int error = 0; + + error = xrep_bmap_check_inputs(sc, whichfork); + if (error == -ECANCELED) + return 0; + if (error) + return error; + + rb = kzalloc(sizeof(struct xrep_bmap), XCHK_GFP_FLAGS); + if (!rb) + return -ENOMEM; + rb->sc = sc; + rb->whichfork = whichfork; + rb->reflink_scan = xrep_bmap_init_reflink_scan(sc, whichfork); + rb->allow_unwritten = allow_unwritten; + + /* Set up enough storage to handle the max records for this fork. */ + large_extcount = xfs_has_large_extent_counts(sc->mp); + max_bmbt_recs = xfs_iext_max_nextents(large_extcount, whichfork); + descr = xchk_xfile_ino_descr(sc, "%s fork mapping records", + whichfork == XFS_DATA_FORK ? "data" : "attr"); + error = xfarray_create(descr, max_bmbt_recs, + sizeof(struct xfs_bmbt_rec), &rb->bmap_records); + kfree(descr); + if (error) + goto out_rb; + + /* Collect all reverse mappings for this fork's extents. */ + xfsb_bitmap_init(&rb->old_bmbt_blocks); + error = xrep_bmap_find_mappings(rb); + if (error) + goto out_bitmap; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Rebuild the bmap information. */ + error = xrep_bmap_build_new_fork(rb); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_bmap_remove_old_tree(rb); + if (error) + goto out_bitmap; + +out_bitmap: + xfsb_bitmap_destroy(&rb->old_bmbt_blocks); + xfarray_destroy(rb->bmap_records); +out_rb: + kfree(rb); + return error; +} + +/* Repair an inode's data fork. */ +int +xrep_bmap_data( + struct xfs_scrub *sc) +{ + return xrep_bmap(sc, XFS_DATA_FORK, true); +} + +/* Repair an inode's attr fork. */ +int +xrep_bmap_attr( + struct xfs_scrub *sc) +{ + return xrep_bmap(sc, XFS_ATTR_FORK, false); +} diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index de24532fe083..81f2b96bb5a7 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -25,6 +25,7 @@ #include "xfs_trans_priv.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" +#include "xfs_dir2_priv.h" #include "xfs_attr.h" #include "xfs_reflink.h" #include "xfs_ag.h" @@ -604,6 +605,7 @@ xchk_ag_free( struct xchk_ag *sa) { xchk_ag_btcur_free(sa); + xrep_reset_perag_resv(sc); if (sa->agf_bp) { xfs_trans_brelse(sc->tp, sa->agf_bp); sa->agf_bp = NULL; @@ -733,6 +735,8 @@ xchk_iget( xfs_ino_t inum, struct xfs_inode **ipp) { + ASSERT(sc->tp != NULL); + return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp); } @@ -816,6 +820,26 @@ again: return 0; } +#ifdef CONFIG_XFS_QUOTA +/* + * Try to attach dquots to this inode if we think we might want to repair it. + * Callers must not hold any ILOCKs. 
If the dquots are broken and cannot be + * attached, a quotacheck will be scheduled. + */ +int +xchk_ino_dqattach( + struct xfs_scrub *sc) +{ + ASSERT(sc->tp != NULL); + ASSERT(sc->ip != NULL); + + if (!xchk_could_repair(sc)) + return 0; + + return xrep_ino_dqattach(sc); +} +#endif + /* Install an inode that we opened by handle for scrubbing. */ int xchk_install_handle_inode( @@ -882,8 +906,8 @@ xchk_iget_for_scrubbing( if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) return -ENOENT; - /* Try a regular untrusted iget. */ - error = xchk_iget(sc, sc->sm->sm_ino, &ip); + /* Try a safe untrusted iget. */ + error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip); if (!error) return xchk_install_handle_inode(sc, ip); if (error == -ENOENT) @@ -1027,6 +1051,11 @@ xchk_setup_inode_contents( error = xchk_trans_alloc(sc, resblks); if (error) goto out; + + error = xchk_ino_dqattach(sc); + if (error) + goto out; + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode for us */ @@ -1132,6 +1161,7 @@ xchk_metadata_inode_subtype( unsigned int scrub_type) { __u32 smtype = sc->sm->sm_type; + unsigned int sick_mask = sc->sick_mask; int error; sc->sm->sm_type = scrub_type; @@ -1149,6 +1179,7 @@ xchk_metadata_inode_subtype( break; } + sc->sick_mask = sick_mask; sc->sm->sm_type = smtype; return error; } diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index cabdc0e16838..da09580b454a 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -103,9 +103,15 @@ xchk_setup_rtsummary(struct xfs_scrub *sc) } #endif #ifdef CONFIG_XFS_QUOTA +int xchk_ino_dqattach(struct xfs_scrub *sc); int xchk_setup_quota(struct xfs_scrub *sc); #else static inline int +xchk_ino_dqattach(struct xfs_scrub *sc) +{ + return 0; +} +static inline int xchk_setup_quota(struct xfs_scrub *sc) { return -ENOENT; @@ -151,6 +157,11 @@ void xchk_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags); void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp); +/* + * Grab the inode at @inum. The caller must have created a scrub transaction + * so that we can confirm the inumber by walking the inobt and not deadlock on + * a loop in the inobt. + */ int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp); int xchk_iget_agi(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_buf **agi_bpp, struct xfs_inode **ipp); @@ -158,6 +169,26 @@ void xchk_irele(struct xfs_scrub *sc, struct xfs_inode *ip); int xchk_install_handle_inode(struct xfs_scrub *sc, struct xfs_inode *ip); /* + * Safe version of (untrusted) xchk_iget that uses an empty transaction to + * avoid deadlocking on loops in the inobt. This should only be used in a + * scrub or repair setup routine, and only prior to grabbing a transaction. + */ +static inline int +xchk_iget_safe(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp) +{ + int error; + + ASSERT(sc->tp == NULL); + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + error = xchk_iget(sc, inum, ipp); + xchk_trans_cancel(sc); + return error; +} + +/* * Don't bother cross-referencing if we already found corruption or cross * referencing discrepancies. */ @@ -167,6 +198,8 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) XFS_SCRUB_OFLAG_XCORRUPT); } +bool xchk_dir_looks_zapped(struct xfs_inode *dp); + #ifdef CONFIG_XFS_ONLINE_REPAIR /* Decide if a repair is required. 
*/ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) @@ -175,8 +208,21 @@ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) XFS_SCRUB_OFLAG_XCORRUPT | XFS_SCRUB_OFLAG_PREEN); } + +/* + * "Should we prepare for a repair?" + * + * Return true if the caller permits us to repair metadata and we're not + * setting up for a post-repair evaluation. + */ +static inline bool xchk_could_repair(const struct xfs_scrub *sc) +{ + return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && + !(sc->flags & XREP_ALREADY_FIXED); +} #else # define xchk_needs_repair(sc) (false) +# define xchk_could_repair(sc) (false) #endif /* CONFIG_XFS_ONLINE_REPAIR */ int xchk_metadata_inode_forks(struct xfs_scrub *sc); @@ -188,6 +234,16 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc); #define xchk_xfile_descr(sc, fmt, ...) \ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): " fmt, \ (sc)->mp->m_super->s_id, ##__VA_ARGS__) +#define xchk_xfile_ag_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): AG 0x%x " fmt, \ + (sc)->mp->m_super->s_id, \ + (sc)->sa.pag ? (sc)->sa.pag->pag_agno : (sc)->sm->sm_agno, \ + ##__VA_ARGS__) +#define xchk_xfile_ino_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): inode 0x%llx " fmt, \ + (sc)->mp->m_super->s_id, \ + (sc)->ip ? (sc)->ip->i_ino : (sc)->sm->sm_ino, \ + ##__VA_ARGS__) /* * Setting up a hook to wait for intents to drain is costly -- we have to take diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c new file mode 100644 index 000000000000..1e82c727af8e --- /dev/null +++ b/fs/xfs/scrub/cow_repair.c @@ -0,0 +1,614 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_rmap.h" +#include "xfs_refcount.h" +#include "xfs_quota.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "xfs_error.h" +#include "xfs_errortag.h" +#include "xfs_icache.h" +#include "xfs_refcount_btree.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/off_bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/reap.h" + +/* + * CoW Fork Mapping Repair + * ======================= + * + * Although CoW staging extents are owned by incore CoW inode forks, on disk + * they are owned by the refcount btree. The ondisk metadata does not record + * any ownership information, which limits what we can do to repair the + * mappings in the CoW fork. At most, we can replace ifork mappings that lack + * an entry in the refcount btree or are described by a reverse mapping record + * whose owner is not OWN_COW. + * + * Replacing extents is also tricky -- we can't touch written CoW fork extents + * since they are undergoing writeback, and delalloc extents do not require + * repair since they only exist incore. Hence the most we can do is find the + * bad parts of unwritten mappings, allocate a replacement set of blocks, and + * replace the incore mapping. We use the regular reaping process to unmap + * or free the discarded blocks, as appropriate. 
+ */ +struct xrep_cow { + struct xfs_scrub *sc; + + /* Bitmap of file offset ranges that need replacing. */ + struct xoff_bitmap bad_fileoffs; + + /* Bitmap of fsblocks that were removed from the CoW fork. */ + struct xfsb_bitmap old_cowfork_fsblocks; + + /* CoW fork mappings used to scan for bad CoW staging extents. */ + struct xfs_bmbt_irec irec; + + /* refcount btree block number of irec.br_startblock */ + unsigned int irec_startbno; + + /* refcount btree block number of the next refcount record we expect */ + unsigned int next_bno; +}; + +/* CoW staging extent. */ +struct xrep_cow_extent { + xfs_fsblock_t fsbno; + xfs_extlen_t len; +}; + +/* + * Mark the part of the file range that corresponds to the given physical + * space. Caller must ensure that the physical range is within xc->irec. + */ +STATIC int +xrep_cow_mark_file_range( + struct xrep_cow *xc, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount) +{ + xfs_fileoff_t startoff; + + startoff = xc->irec.br_startoff + + (startblock - xc->irec.br_startblock); + + trace_xrep_cow_mark_file_range(xc->sc->ip, startblock, startoff, + blockcount); + + return xoff_bitmap_set(&xc->bad_fileoffs, startoff, blockcount); +} + +/* + * Trim @src to fit within the CoW fork mapping being examined, and put the + * result in @dst. + */ +static inline void +xrep_cow_trim_refcount( + struct xrep_cow *xc, + struct xfs_refcount_irec *dst, + const struct xfs_refcount_irec *src) +{ + unsigned int adj; + + memcpy(dst, src, sizeof(*dst)); + + if (dst->rc_startblock < xc->irec_startbno) { + adj = xc->irec_startbno - dst->rc_startblock; + dst->rc_blockcount -= adj; + dst->rc_startblock += adj; + } + + if (dst->rc_startblock + dst->rc_blockcount > + xc->irec_startbno + xc->irec.br_blockcount) { + adj = (dst->rc_startblock + dst->rc_blockcount) - + (xc->irec_startbno + xc->irec.br_blockcount); + dst->rc_blockcount -= adj; + } +} + +/* Mark any shared CoW staging extents. */ +STATIC int +xrep_cow_mark_shared_staging( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv) +{ + struct xrep_cow *xc = priv; + struct xfs_refcount_irec rrec; + xfs_fsblock_t fsbno; + + if (!xfs_refcount_check_domain(rec) || + rec->rc_domain != XFS_REFC_DOMAIN_SHARED) + return -EFSCORRUPTED; + + xrep_cow_trim_refcount(xc, &rrec, rec); + + fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, + rrec.rc_startblock); + return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount); +} + +/* + * Mark any portion of the CoW fork file offset range where there is not a CoW + * staging extent record in the refcountbt, and keep a record of where we did + * find correct refcountbt records. Staging records are always cleaned out at + * mount time, so any two inodes trying to map the same staging area would have + * already taken the fs down due to refcount btree verifier errors. Hence this + * inode should be the sole creator of the staging extent records ondisk. 
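
xrep_cow_trim_refcount() above clips a refcount record so that it fits inside the CoW fork mapping being examined. The same clipping, reduced to plain interval arithmetic, looks roughly like this standalone sketch (struct span and clip_to_window are invented names, not from the patch):

// Illustrative sketch only, not part of the patch: clip a record that may
// hang over either end of the window being examined, mirroring what
// xrep_cow_trim_refcount() does with agblock-based refcount records.
#include <stdio.h>
#include <stdint.h>

struct span { uint64_t start; uint64_t len; };

// Clip *rec so it lies entirely inside [win->start, win->start + win->len).
// Callers must ensure the spans overlap, which the patch's callers do
// because the records come from a range query limited to the window.
static void clip_to_window(struct span *rec, const struct span *win)
{
	if (rec->start < win->start) {
		uint64_t adj = win->start - rec->start;

		rec->len -= adj;
		rec->start += adj;
	}
	if (rec->start + rec->len > win->start + win->len)
		rec->len = win->start + win->len - rec->start;
}

int main(void)
{
	struct span win = { .start = 100, .len = 50 };
	struct span rec = { .start = 90, .len = 80 };	// overhangs both ends

	clip_to_window(&rec, &win);
	printf("start=%llu len=%llu\n",
	       (unsigned long long)rec.start, (unsigned long long)rec.len);
	return 0;	// prints start=100 len=50
}
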
+ */ +STATIC int +xrep_cow_mark_missing_staging( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv) +{ + struct xrep_cow *xc = priv; + struct xfs_refcount_irec rrec; + int error; + + if (!xfs_refcount_check_domain(rec) || + rec->rc_domain != XFS_REFC_DOMAIN_COW) + return -EFSCORRUPTED; + + xrep_cow_trim_refcount(xc, &rrec, rec); + + if (xc->next_bno >= rrec.rc_startblock) + goto next; + + error = xrep_cow_mark_file_range(xc, + XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, + xc->next_bno), + rrec.rc_startblock - xc->next_bno); + if (error) + return error; + +next: + xc->next_bno = rrec.rc_startblock + rrec.rc_blockcount; + return 0; +} + +/* + * Mark any area that does not correspond to a CoW staging rmap. These are + * cross-linked areas that must be avoided. + */ +STATIC int +xrep_cow_mark_missing_staging_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_cow *xc = priv; + xfs_fsblock_t fsbno; + xfs_agblock_t rec_bno; + xfs_extlen_t rec_len; + unsigned int adj; + + if (rec->rm_owner == XFS_RMAP_OWN_COW) + return 0; + + rec_bno = rec->rm_startblock; + rec_len = rec->rm_blockcount; + if (rec_bno < xc->irec_startbno) { + adj = xc->irec_startbno - rec_bno; + rec_len -= adj; + rec_bno += adj; + } + + if (rec_bno + rec_len > xc->irec_startbno + xc->irec.br_blockcount) { + adj = (rec_bno + rec_len) - + (xc->irec_startbno + xc->irec.br_blockcount); + rec_len -= adj; + } + + fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno); + return xrep_cow_mark_file_range(xc, fsbno, rec_len); +} + +/* + * Find any part of the CoW fork mapping that isn't a single-owner CoW staging + * extent and mark the corresponding part of the file range in the bitmap. + */ +STATIC int +xrep_cow_find_bad( + struct xrep_cow *xc) +{ + struct xfs_refcount_irec rc_low = { 0 }; + struct xfs_refcount_irec rc_high = { 0 }; + struct xfs_rmap_irec rm_low = { 0 }; + struct xfs_rmap_irec rm_high = { 0 }; + struct xfs_perag *pag; + struct xfs_scrub *sc = xc->sc; + xfs_agnumber_t agno; + int error; + + agno = XFS_FSB_TO_AGNO(sc->mp, xc->irec.br_startblock); + xc->irec_startbno = XFS_FSB_TO_AGBNO(sc->mp, xc->irec.br_startblock); + + pag = xfs_perag_get(sc->mp, agno); + if (!pag) + return -EFSCORRUPTED; + + error = xrep_ag_init(sc, pag, &sc->sa); + if (error) + goto out_pag; + + /* Mark any CoW fork extents that are shared. */ + rc_low.rc_startblock = xc->irec_startbno; + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED; + error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high, + xrep_cow_mark_shared_staging, xc); + if (error) + goto out_sa; + + /* Make sure there are CoW staging extents for the whole mapping. */ + rc_low.rc_startblock = xc->irec_startbno; + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW; + xc->next_bno = xc->irec_startbno; + error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high, + xrep_cow_mark_missing_staging, xc); + if (error) + goto out_sa; + + if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) { + error = xrep_cow_mark_file_range(xc, + XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, + xc->next_bno), + xc->irec_startbno + xc->irec.br_blockcount - + xc->next_bno); + if (error) + goto out_sa; + } + + /* Mark any area has an rmap that isn't a COW staging extent. 
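
xrep_cow_mark_missing_staging() tracks next_bno so that any gap between the end of the previous CoW staging record and the start of the next one gets marked as missing. A standalone sketch of that gap-walking bookkeeping, with invented names and toy data (not from the patch):

// Illustrative sketch only, not part of the patch: walk sorted, clipped
// records and report the holes between them, the same bookkeeping that
// xrep_cow_mark_missing_staging() performs with its next_bno cursor.
#include <stdio.h>
#include <stdint.h>

struct rec { uint64_t start; uint64_t len; };

// Report every hole in [win_start, win_end) not covered by recs[], which
// must be sorted, non-overlapping, and already clipped to the window.
static void report_gaps(const struct rec *recs, int nr,
			uint64_t win_start, uint64_t win_end)
{
	uint64_t next = win_start;
	int i;

	for (i = 0; i < nr; i++) {
		if (next < recs[i].start)
			printf("gap [%llu, %llu)\n",
			       (unsigned long long)next,
			       (unsigned long long)recs[i].start);
		next = recs[i].start + recs[i].len;
	}
	if (next < win_end)
		printf("gap [%llu, %llu)\n",
		       (unsigned long long)next,
		       (unsigned long long)win_end);
}

int main(void)
{
	struct rec recs[] = { { 10, 5 }, { 20, 4 } };

	report_gaps(recs, 2, 8, 30);	// gaps: [8,10) [15,20) [24,30)
	return 0;
}
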
*/ + rm_low.rm_startblock = xc->irec_startbno; + memset(&rm_high, 0xFF, sizeof(rm_high)); + rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + error = xfs_rmap_query_range(sc->sa.rmap_cur, &rm_low, &rm_high, + xrep_cow_mark_missing_staging_rmap, xc); + if (error) + goto out_sa; + + /* + * If userspace is forcing us to rebuild the CoW fork or someone turned + * on the debugging knob, replace everything in the CoW fork. + */ + if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || + XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, + xc->irec.br_blockcount); + if (error) + return error; + } + +out_sa: + xchk_ag_free(sc, &sc->sa); +out_pag: + xfs_perag_put(pag); + return 0; +} + +/* + * Allocate a replacement CoW staging extent of up to the given number of + * blocks, and fill out the mapping. + */ +STATIC int +xrep_cow_alloc( + struct xfs_scrub *sc, + xfs_extlen_t maxlen, + struct xrep_cow_extent *repl) +{ + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = sc->mp, + .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE, + .minlen = 1, + .maxlen = maxlen, + .prod = 1, + .resv = XFS_AG_RESV_NONE, + .datatype = XFS_ALLOC_USERDATA, + }; + int error; + + error = xfs_trans_reserve_more(sc->tp, maxlen, 0); + if (error) + return error; + + error = xfs_alloc_vextent_start_ag(&args, + XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino)); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len); + + repl->fsbno = args.fsbno; + repl->len = args.len; + return 0; +} + +/* + * Look up the current CoW fork mapping so that we only allocate enough to + * replace a single mapping. If we don't find a mapping that covers the start + * of the file range, or we find a delalloc or written extent, something is + * seriously wrong, since we didn't drop the ILOCK. + */ +static inline int +xrep_cow_find_mapping( + struct xrep_cow *xc, + struct xfs_iext_cursor *icur, + xfs_fileoff_t startoff, + struct xfs_bmbt_irec *got) +{ + struct xfs_inode *ip = xc->sc->ip; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); + + if (!xfs_iext_lookup_extent(ip, ifp, startoff, icur, got)) + goto bad; + + if (got->br_startoff > startoff) + goto bad; + + if (got->br_blockcount == 0) + goto bad; + + if (isnullstartblock(got->br_startblock)) + goto bad; + + if (xfs_bmap_is_written_extent(got)) + goto bad; + + return 0; +bad: + ASSERT(0); + return -EFSCORRUPTED; +} + +#define REPLACE_LEFT_SIDE (1U << 0) +#define REPLACE_RIGHT_SIDE (1U << 1) + +/* + * Given a CoW fork mapping @got and a replacement mapping @repl, remap the + * beginning of @got with the space described by @rep. + */ +static inline void +xrep_cow_replace_mapping( + struct xfs_inode *ip, + struct xfs_iext_cursor *icur, + const struct xfs_bmbt_irec *got, + const struct xrep_cow_extent *repl) +{ + struct xfs_bmbt_irec new = *got; /* struct copy */ + + ASSERT(repl->len > 0); + ASSERT(!isnullstartblock(got->br_startblock)); + + trace_xrep_cow_replace_mapping(ip, got, repl->fsbno, repl->len); + + if (got->br_blockcount == repl->len) { + /* + * The new extent is a complete replacement for the existing + * extent. Update the COW fork record. + */ + new.br_startblock = repl->fsbno; + xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new); + return; + } + + /* + * The new extent can replace the beginning of the COW fork record. + * Move the left side of @got upwards, then insert the new record. 
+ */ + new.br_startoff += repl->len; + new.br_startblock += repl->len; + new.br_blockcount -= repl->len; + xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new); + + new.br_startoff = got->br_startoff; + new.br_startblock = repl->fsbno; + new.br_blockcount = repl->len; + xfs_iext_insert(ip, icur, &new, BMAP_COWFORK); +} + +/* + * Replace the unwritten CoW staging extent backing the given file range with a + * new space extent that isn't as problematic. + */ +STATIC int +xrep_cow_replace_range( + struct xrep_cow *xc, + xfs_fileoff_t startoff, + xfs_extlen_t *blockcount) +{ + struct xfs_iext_cursor icur; + struct xrep_cow_extent repl; + struct xfs_bmbt_irec got; + struct xfs_scrub *sc = xc->sc; + xfs_fileoff_t nextoff; + xfs_extlen_t alloc_len; + int error; + + /* + * Put the existing CoW fork mapping in @got. If @got ends before + * @rep, truncate @rep so we only replace one extent mapping at a time. + */ + error = xrep_cow_find_mapping(xc, &icur, startoff, &got); + if (error) + return error; + nextoff = min(startoff + *blockcount, + got.br_startoff + got.br_blockcount); + + /* + * Allocate a replacement extent. If we don't fill all the blocks, + * shorten the quantity that will be deleted in this step. + */ + alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN, + nextoff - startoff); + error = xrep_cow_alloc(sc, alloc_len, &repl); + if (error) + return error; + + /* + * Replace the old mapping with the new one, and commit the metadata + * changes made so far. + */ + xrep_cow_replace_mapping(sc->ip, &icur, &got, &repl); + + xfs_inode_set_cowblocks_tag(sc->ip); + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + + /* Note the old CoW staging extents; we'll reap them all later. */ + error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock, + repl.len); + if (error) + return error; + + *blockcount = repl.len; + return 0; +} + +/* + * Replace a bad part of an unwritten CoW staging extent with a fresh delalloc + * reservation. + */ +STATIC int +xrep_cow_replace( + uint64_t startoff, + uint64_t blockcount, + void *priv) +{ + struct xrep_cow *xc = priv; + int error = 0; + + while (blockcount > 0) { + xfs_extlen_t len = min_t(xfs_filblks_t, blockcount, + XFS_MAX_BMBT_EXTLEN); + + error = xrep_cow_replace_range(xc, startoff, &len); + if (error) + break; + + blockcount -= len; + startoff += len; + } + + return error; +} + +/* + * Repair an inode's CoW fork. The CoW fork is an in-core structure, so + * there's no btree to rebuid. Instead, we replace any mappings that are + * cross-linked or lack ondisk CoW fork records in the refcount btree. + */ +int +xrep_bmap_cow( + struct xfs_scrub *sc) +{ + struct xrep_cow *xc; + struct xfs_iext_cursor icur; + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_COW_FORK); + int error; + + if (!xfs_has_rmapbt(sc->mp) || !xfs_has_reflink(sc->mp)) + return -EOPNOTSUPP; + + if (!ifp) + return 0; + + /* realtime files aren't supported yet */ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return -EOPNOTSUPP; + + /* + * If we're somehow not in extents format, then reinitialize it to + * an empty extent mapping fork and exit. 
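
The two cases in xrep_cow_replace_mapping() above, replacing the whole extent versus carving a new head off the front and shifting the remainder, can be modelled with plain struct arithmetic. A rough standalone sketch follows; the names and the one-or-two-outputs convention are my own, and the kernel performs the update and insert through the in-core extent cursor, which this sketch does not model:

// Illustrative sketch only, not part of the patch: replace the first
// repl_len blocks of a (fileoff, physblock, len) mapping with new space,
// producing either one updated record or a new head plus a shifted tail,
// mirroring the two cases in xrep_cow_replace_mapping().
#include <stdio.h>
#include <stdint.h>

struct mapping { uint64_t off; uint64_t block; uint64_t len; };

// Returns the number of output mappings written to out[] (1 or 2).
static int replace_head(const struct mapping *old, uint64_t new_block,
			uint64_t repl_len, struct mapping out[2])
{
	if (repl_len == old->len) {
		// Whole extent replaced: same file range, new disk blocks.
		out[0] = (struct mapping){ old->off, new_block, old->len };
		return 1;
	}

	// New head covering the replaced blocks...
	out[0] = (struct mapping){ old->off, new_block, repl_len };
	// ...followed by the surviving tail of the old mapping.
	out[1] = (struct mapping){ old->off + repl_len,
				   old->block + repl_len,
				   old->len - repl_len };
	return 2;
}

int main(void)
{
	struct mapping old = { .off = 200, .block = 5000, .len = 8 };
	struct mapping out[2];
	int n = replace_head(&old, 9000, 3, out);

	for (int i = 0; i < n; i++)
		printf("off=%llu block=%llu len=%llu\n",
		       (unsigned long long)out[i].off,
		       (unsigned long long)out[i].block,
		       (unsigned long long)out[i].len);
	return 0;
}
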
+ */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) { + ifp->if_format = XFS_DINODE_FMT_EXTENTS; + ifp->if_nextents = 0; + return 0; + } + + xc = kzalloc(sizeof(struct xrep_cow), XCHK_GFP_FLAGS); + if (!xc) + return -ENOMEM; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + xc->sc = sc; + xoff_bitmap_init(&xc->bad_fileoffs); + xfsb_bitmap_init(&xc->old_cowfork_fsblocks); + + for_each_xfs_iext(ifp, &icur, &xc->irec) { + if (xchk_should_terminate(sc, &error)) + goto out_bitmap; + + /* + * delalloc reservations only exist incore, so there is no + * ondisk metadata that we can examine. Hence we leave them + * alone. + */ + if (isnullstartblock(xc->irec.br_startblock)) + continue; + + /* + * COW fork extents are only in the written state if writeback + * is actively writing to disk. We cannot restart the write + * at a different disk address since we've already issued the + * IO, so we leave these alone and hope for the best. + */ + if (xfs_bmap_is_written_extent(&xc->irec)) + continue; + + error = xrep_cow_find_bad(xc); + if (error) + goto out_bitmap; + } + + /* Replace any bad unwritten mappings with fresh reservations. */ + error = xoff_bitmap_walk(&xc->bad_fileoffs, xrep_cow_replace, xc); + if (error) + goto out_bitmap; + + /* + * Reap as many of the old CoW blocks as we can. They are owned ondisk + * by the refcount btree, not the inode, so it is correct to treat them + * like inode metadata. + */ + error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks, + &XFS_RMAP_OINFO_COW); + if (error) + goto out_bitmap; + +out_bitmap: + xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); + xoff_bitmap_destroy(&xc->bad_fileoffs); + kmem_free(xc); + return error; +} diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 0b491784b759..d86ab51af928 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -15,10 +15,12 @@ #include "xfs_icache.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" +#include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/dabtree.h" #include "scrub/readdir.h" +#include "scrub/health.h" /* Set us up to scrub directories. */ int @@ -760,6 +762,11 @@ xchk_directory( if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) return -ENOENT; + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_DIR_ZAPPED)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + /* Plausible size? */ if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); @@ -784,7 +791,36 @@ xchk_directory( /* Look up every name in this directory by hash. */ error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL); - if (error == -ECANCELED) - error = 0; - return error; + if (error && error != -ECANCELED) + return error; + + /* If the dir is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_DIR_ZAPPED); + return 0; +} + +/* + * Decide if this directory has been zapped to satisfy the inode and ifork + * verifiers. Checking and repairing should be postponed until the directory + * is fixed. + */ +bool +xchk_dir_looks_zapped( + struct xfs_inode *dp) +{ + /* Repair zapped this dir's data fork a short time ago */ + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return true; + + /* + * If the dinode repair found a bad data fork, it will reset the fork + * to extents format with zero records and wait for the bmapbtd + * scrubber to reconstruct the block mappings. Directories always + * contain some content, so this is a clear sign of a zapped directory. 
+ * The state checked by xfs_ifork_zapped is not persisted, so this is + * the secondary strategy if repairs are interrupted by a crash or an + * unmount. + */ + return dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS && + dp->i_df.if_nextents == 0; } diff --git a/fs/xfs/scrub/dqiterate.c b/fs/xfs/scrub/dqiterate.c new file mode 100644 index 000000000000..20c4daedd48d --- /dev/null +++ b/fs/xfs/scrub/dqiterate.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_bmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/quota.h" +#include "scrub/trace.h" + +/* Initialize a dquot iteration cursor. */ +void +xchk_dqiter_init( + struct xchk_dqiter *cursor, + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + cursor->sc = sc; + cursor->bmap.br_startoff = NULLFILEOFF; + cursor->dqtype = dqtype & XFS_DQTYPE_REC_MASK; + cursor->quota_ip = xfs_quota_inode(sc->mp, cursor->dqtype); + cursor->id = 0; +} + +/* + * Ensure that the cached data fork mapping for the dqiter cursor is fresh and + * covers the dquot pointed to by the scan cursor. + */ +STATIC int +xchk_dquot_iter_revalidate_bmap( + struct xchk_dqiter *cursor) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip, + XFS_DATA_FORK); + xfs_fileoff_t fileoff; + xfs_dqid_t this_id = cursor->id; + int nmaps = 1; + int error; + + fileoff = this_id / qi->qi_dqperchunk; + + /* + * If we have a mapping for cursor->id and it's still fresh, there's + * no need to reread the bmbt. + */ + if (cursor->bmap.br_startoff != NULLFILEOFF && + cursor->if_seq == ifp->if_seq && + cursor->bmap.br_startoff + cursor->bmap.br_blockcount > fileoff) + return 0; + + /* Look up the data fork mapping for the dquot id of interest. */ + error = xfs_bmapi_read(cursor->quota_ip, fileoff, + XFS_MAX_FILEOFF - fileoff, &cursor->bmap, &nmaps, 0); + if (error) + return error; + if (!nmaps) { + ASSERT(nmaps > 0); + return -EFSCORRUPTED; + } + if (cursor->bmap.br_startoff > fileoff) { + ASSERT(cursor->bmap.br_startoff == fileoff); + return -EFSCORRUPTED; + } + + cursor->if_seq = ifp->if_seq; + trace_xchk_dquot_iter_revalidate_bmap(cursor, cursor->id); + return 0; +} + +/* Advance the dqiter cursor to the next non-sparse region of the quota file. */ +STATIC int +xchk_dquot_iter_advance_bmap( + struct xchk_dqiter *cursor, + uint64_t *next_ondisk_id) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip, + XFS_DATA_FORK); + xfs_fileoff_t fileoff; + uint64_t next_id; + int nmaps = 1; + int error; + + /* Find the dquot id for the next non-hole mapping. */ + do { + fileoff = cursor->bmap.br_startoff + cursor->bmap.br_blockcount; + if (fileoff > XFS_DQ_ID_MAX / qi->qi_dqperchunk) { + /* The hole goes beyond the max dquot id, we're done */ + *next_ondisk_id = -1ULL; + return 0; + } + + error = xfs_bmapi_read(cursor->quota_ip, fileoff, + XFS_MAX_FILEOFF - fileoff, &cursor->bmap, + &nmaps, 0); + if (error) + return error; + if (!nmaps) { + /* Must have reached the end of the mappings. 
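
The dquot iterator above converts between dquot ids and quota file block offsets with qi_dqperchunk, and jumps ahead when it lands in a hole. A standalone sketch of just that arithmetic; the 30-records-per-block figure is only an example, not the real quota file geometry:

// Illustrative sketch only, not part of the patch: the id <-> quota file
// offset arithmetic used by the dquot iterator.  dqperchunk is the number
// of dquot records that fit in one quota file block.
#include <stdio.h>
#include <stdint.h>

static uint64_t dqid_to_fileoff(uint64_t id, uint64_t dqperchunk)
{
	return id / dqperchunk;
}

static uint64_t fileoff_to_first_dqid(uint64_t fileoff, uint64_t dqperchunk)
{
	return fileoff * dqperchunk;
}

int main(void)
{
	// assume 30 dquot records per quota file block for illustration
	uint64_t dqperchunk = 30;
	uint64_t id = 95;
	uint64_t fileoff = dqid_to_fileoff(id, dqperchunk);

	printf("id %llu lives in quota file block %llu\n",
	       (unsigned long long)id, (unsigned long long)fileoff);

	// If that block is a hole and the next mapped block starts at
	// offset 7, the iterator would jump ahead to the first id there.
	printf("next candidate id: %llu\n",
	       (unsigned long long)fileoff_to_first_dqid(7, dqperchunk));
	return 0;
}
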
*/ + *next_ondisk_id = -1ULL; + return 0; + } + if (cursor->bmap.br_startoff > fileoff) { + ASSERT(cursor->bmap.br_startoff == fileoff); + return -EFSCORRUPTED; + } + } while (!xfs_bmap_is_real_extent(&cursor->bmap)); + + next_id = cursor->bmap.br_startoff * qi->qi_dqperchunk; + if (next_id > XFS_DQ_ID_MAX) { + /* The hole goes beyond the max dquot id, we're done */ + *next_ondisk_id = -1ULL; + return 0; + } + + /* Propose jumping forward to the dquot in the next allocated block. */ + *next_ondisk_id = next_id; + cursor->if_seq = ifp->if_seq; + trace_xchk_dquot_iter_advance_bmap(cursor, *next_ondisk_id); + return 0; +} + +/* + * Find the id of the next highest incore dquot. Normally this will correspond + * exactly with the quota file block mappings, but repair might have erased a + * mapping because it was crosslinked; in that case, we need to re-allocate the + * space so that we can reset q_blkno. + */ +STATIC void +xchk_dquot_iter_advance_incore( + struct xchk_dqiter *cursor, + uint64_t *next_incore_id) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, cursor->dqtype); + struct xfs_dquot *dq; + unsigned int nr_found; + + *next_incore_id = -1ULL; + + mutex_lock(&qi->qi_tree_lock); + nr_found = radix_tree_gang_lookup(tree, (void **)&dq, cursor->id, 1); + if (nr_found) + *next_incore_id = dq->q_id; + mutex_unlock(&qi->qi_tree_lock); + + trace_xchk_dquot_iter_advance_incore(cursor, *next_incore_id); +} + +/* + * Walk all incore dquots of this filesystem. Caller must set *@cursorp to + * zero before the first call, and must not hold the quota file ILOCK. + * Returns 1 and a valid *@dqpp; 0 and *@dqpp == NULL when there are no more + * dquots to iterate; or a negative errno. + */ +int +xchk_dquot_iter( + struct xchk_dqiter *cursor, + struct xfs_dquot **dqpp) +{ + struct xfs_mount *mp = cursor->sc->mp; + struct xfs_dquot *dq = NULL; + uint64_t next_ondisk, next_incore = -1ULL; + unsigned int lock_mode; + int error = 0; + + if (cursor->id > XFS_DQ_ID_MAX) + return 0; + next_ondisk = cursor->id; + + /* Revalidate and/or advance the cursor. */ + lock_mode = xfs_ilock_data_map_shared(cursor->quota_ip); + error = xchk_dquot_iter_revalidate_bmap(cursor); + if (!error && !xfs_bmap_is_real_extent(&cursor->bmap)) + error = xchk_dquot_iter_advance_bmap(cursor, &next_ondisk); + xfs_iunlock(cursor->quota_ip, lock_mode); + if (error) + return error; + + if (next_ondisk > cursor->id) + xchk_dquot_iter_advance_incore(cursor, &next_incore); + + /* Pick the next dquot in the sequence and return it. */ + cursor->id = min(next_ondisk, next_incore); + if (cursor->id > XFS_DQ_ID_MAX) + return 0; + + trace_xchk_dquot_iter(cursor, cursor->id); + + error = xfs_qm_dqget(mp, cursor->id, cursor->dqtype, false, &dq); + if (error) + return error; + + cursor->id = dq->q_id + 1; + *dqpp = dq; + return 1; +} diff --git a/fs/xfs/scrub/fsb_bitmap.h b/fs/xfs/scrub/fsb_bitmap.h new file mode 100644 index 000000000000..40b462c1dd0d --- /dev/null +++ b/fs/xfs/scrub/fsb_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_FSB_BITMAP_H__ +#define __XFS_SCRUB_FSB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_fsblock_t */ + +struct xfsb_bitmap { + struct xbitmap64 fsbitmap; +}; + +static inline void xfsb_bitmap_init(struct xfsb_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->fsbitmap); +} + +static inline void xfsb_bitmap_destroy(struct xfsb_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->fsbitmap); +} + +static inline int xfsb_bitmap_set(struct xfsb_bitmap *bitmap, + xfs_fsblock_t start, xfs_filblks_t len) +{ + return xbitmap64_set(&bitmap->fsbitmap, start, len); +} + +static inline int xfsb_bitmap_walk(struct xfsb_bitmap *bitmap, + xbitmap64_walk_fn fn, void *priv) +{ + return xbitmap64_walk(&bitmap->fsbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_FSB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 5e2b09ed6e29..531006910ca9 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -10,8 +10,6 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_btree.h" -#include "xfs_trans_resv.h" -#include "xfs_mount.h" #include "xfs_ag.h" #include "xfs_health.h" #include "scrub/scrub.h" @@ -118,6 +116,38 @@ xchk_health_mask_for_scrub_type( } /* + * If the scrub state is clean, add @mask to the scrub sick mask to clear + * additional sick flags from the metadata object's sick state. + */ +void +xchk_mark_healthy_if_clean( + struct xfs_scrub *sc, + unsigned int mask) +{ + if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT))) + sc->sick_mask |= mask; +} + +/* + * If we're scrubbing a piece of file metadata for the first time, does it look + * like it has been zapped? Skip the check if we just repaired the metadata + * and are revalidating it. + */ +bool +xchk_file_looks_zapped( + struct xfs_scrub *sc, + unsigned int mask) +{ + ASSERT((mask & ~XFS_SICK_INO_ZAPPED) == 0); + + if (sc->flags & XREP_ALREADY_FIXED) + return false; + + return xfs_inode_has_sickness(sc->ip, mask); +} + +/* * Update filesystem health assessments based on what we found and did. * * If the scrubber finds errors, we mark sick whatever's mentioned in diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h index 66a273f8585b..a731b2467399 100644 --- a/fs/xfs/scrub/health.h +++ b/fs/xfs/scrub/health.h @@ -10,5 +10,7 @@ unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type); void xchk_update_health(struct xfs_scrub *sc); bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag, xfs_btnum_t btnum); +void xchk_mark_healthy_if_clean(struct xfs_scrub *sc, unsigned int mask); +bool xchk_file_looks_zapped(struct xfs_scrub *sc, unsigned int mask); #endif /* __XFS_SCRUB_HEALTH_H__ */ diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index fb7bbf47ae5d..a720fc62262a 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -585,7 +585,7 @@ xchk_iallocbt_rec( uint16_t holemask; xfs_inobt_btrec_to_irec(mp, rec, &irec); - if (xfs_inobt_check_irec(bs->cur, &irec) != NULL) { + if (xfs_inobt_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -708,11 +708,10 @@ xchk_iallocbt_xref_rmap_inodes( xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); } -/* Scrub the inode btrees for some AG. */ -STATIC int +/* Scrub one of the inode btrees for some AG. 
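
struct xfsb_bitmap above is a thin wrapper that embeds the generic 64-bit bitmap, so helpers that take an xfsb_bitmap cannot be handed a bitmap tracking a different block-number space. A toy standalone version of the same wrapper pattern; all names are invented and the underlying bitmap is reduced to a fixed 256-bit array:

// Illustrative sketch only, not part of the patch: a thin type-checked
// wrapper around a generic bitmap, in the spirit of struct xfsb_bitmap
// wrapping struct xbitmap64.
#include <stdio.h>
#include <stdint.h>
#include <string.h>

typedef uint64_t fsblock_t;	// stand-in for a whole-fs block number

struct u64_bitmap { uint64_t words[4]; };	// toy generic bitmap, 256 bits

static void u64_bitmap_init(struct u64_bitmap *b)
{
	memset(b, 0, sizeof(*b));
}

static void u64_bitmap_set(struct u64_bitmap *b, uint64_t bit)
{
	b->words[bit / 64] |= 1ULL << (bit % 64);
}

// Wrapper type: helpers below only accept a bitmap of fsblock_t values,
// so a bitmap meant for some other block-number space cannot be passed in.
struct fsb_bitmap { struct u64_bitmap fsbitmap; };

static void fsb_bitmap_init(struct fsb_bitmap *b)
{
	u64_bitmap_init(&b->fsbitmap);
}

static void fsb_bitmap_set(struct fsb_bitmap *b, fsblock_t fsb)
{
	u64_bitmap_set(&b->fsbitmap, fsb);
}

int main(void)
{
	struct fsb_bitmap old_blocks;

	fsb_bitmap_init(&old_blocks);
	fsb_bitmap_set(&old_blocks, 42);
	printf("word0=%llx\n",
	       (unsigned long long)old_blocks.fsbitmap.words[0]);
	return 0;
}
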
*/ +int xchk_iallocbt( - struct xfs_scrub *sc, - xfs_btnum_t which) + struct xfs_scrub *sc) { struct xfs_btree_cur *cur; struct xchk_iallocbt iabt = { @@ -720,9 +719,23 @@ xchk_iallocbt( .next_startino = NULLAGINO, .next_cluster_ino = NULLAGINO, }; + xfs_btnum_t which; int error; - cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur; + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_INOBT: + cur = sc->sa.ino_cur; + which = XFS_BTNUM_INO; + break; + case XFS_SCRUB_TYPE_FINOBT: + cur = sc->sa.fino_cur; + which = XFS_BTNUM_FINO; + break; + default: + ASSERT(0); + return -EIO; + } + error = xchk_btree(sc, cur, xchk_iallocbt_rec, &XFS_RMAP_OINFO_INOBT, &iabt); if (error) @@ -743,20 +756,6 @@ xchk_iallocbt( return error; } -int -xchk_inobt( - struct xfs_scrub *sc) -{ - return xchk_iallocbt(sc, XFS_BTNUM_INO); -} - -int -xchk_finobt( - struct xfs_scrub *sc) -{ - return xchk_iallocbt(sc, XFS_BTNUM_FINO); -} - /* See if an inode btree has (or doesn't have) an inode chunk record. */ static inline void xchk_xref_inode_check( diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c new file mode 100644 index 000000000000..b3f7182dd2f5 --- /dev/null +++ b/fs/xfs/scrub/ialloc_repair.c @@ -0,0 +1,884 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_log.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_health.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Inode Btree Repair + * ================== + * + * A quick refresher of inode btrees on a v5 filesystem: + * + * - Inode records are read into memory in units of 'inode clusters'. However + * many inodes fit in a cluster buffer is the smallest number of inodes that + * can be allocated or freed. Clusters are never smaller than one fs block + * though they can span multiple blocks. The size (in fs blocks) is + * computed with xfs_icluster_size_fsb(). The fs block alignment of a + * cluster is computed with xfs_ialloc_cluster_alignment(). + * + * - Each inode btree record can describe a single 'inode chunk'. The chunk + * size is defined to be 64 inodes. If sparse inodes are enabled, every + * inobt record must be aligned to the chunk size; if not, every record must + * be aligned to the start of a cluster. It is possible to construct an XFS + * geometry where one inobt record maps to multiple inode clusters; it is + * also possible to construct a geometry where multiple inobt records map to + * different parts of one inode cluster. + * + * - If sparse inodes are not enabled, the smallest unit of allocation for + * inode records is enough to contain one inode chunk's worth of inodes. 
+ * + * - If sparse inodes are enabled, the holemask field will be active. Each + * bit of the holemask represents 4 potential inodes; if set, the + * corresponding space does *not* contain inodes and must be left alone. + * Clusters cannot be smaller than 4 inodes. The smallest unit of allocation + * of inode records is one inode cluster. + * + * So what's the rebuild algorithm? + * + * Iterate the reverse mapping records looking for OWN_INODES and OWN_INOBT + * records. The OWN_INOBT records are the old inode btree blocks and will be + * cleared out after we've rebuilt the tree. Each possible inode cluster + * within an OWN_INODES record will be read in; for each possible inobt record + * associated with that cluster, compute the freemask calculated from the + * i_mode data in the inode chunk. For sparse inodes the holemask will be + * calculated by creating the properly aligned inobt record and punching out + * any chunk that's missing. Inode allocations and frees grab the AGI first, + * so repair protects itself from concurrent access by locking the AGI. + * + * Once we've reconstructed all the inode records, we can create new inode + * btree roots and reload the btrees. We rebuild both inode trees at the same + * time because they have the same rmap owner and it would be more complex to + * figure out if the other tree isn't in need of a rebuild and which OWN_INOBT + * blocks it owns. We have all the data we need to build both, so dump + * everything and start over. + * + * We use the prefix 'xrep_ibt' because we rebuild both inode btrees at once. + */ + +struct xrep_ibt { + /* Record under construction. */ + struct xfs_inobt_rec_incore rie; + + /* new inobt information */ + struct xrep_newbt new_inobt; + + /* new finobt information */ + struct xrep_newbt new_finobt; + + /* Old inode btree blocks we found in the rmap. */ + struct xagb_bitmap old_iallocbt_blocks; + + /* Reconstructed inode records. */ + struct xfarray *inode_records; + + struct xfs_scrub *sc; + + /* Number of inodes assigned disk space. */ + unsigned int icount; + + /* Number of inodes in use. */ + unsigned int iused; + + /* Number of finobt records needed. */ + unsigned int finobt_recs; + + /* get_records()'s position in the inode record array. */ + xfarray_idx_t array_cur; +}; + +/* + * Is this inode in use? If the inode is in memory we can tell from i_mode, + * otherwise we have to check di_mode in the on-disk buffer. We only care + * that the high (i.e. non-permission) bits of _mode are zero. This should be + * safe because repair keeps all AG headers locked until the end, and process + * trying to perform an inode allocation/free must lock the AGI. + * + * @cluster_ag_base is the inode offset of the cluster within the AG. + * @cluster_bp is the cluster buffer. + * @cluster_index is the inode offset within the inode cluster. 
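
Given the chunk and holemask layout described above (64 inodes per record, one holemask bit per 4 inodes, a set bit meaning no inodes), the rebuild loop effectively clears hole bits for space that exists and free bits for inodes that are in use. A standalone sketch of that mask construction, with invented helpers standing in for the cluster-buffer checks (not from the patch):

// Illustrative sketch only, not part of the patch: derive a 64-inode
// chunk's holemask (one bit per 4 inodes; set means "no inodes here") and
// free mask (one bit per inode), in the spirit of xrep_ibt_cluster_record().
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define INODES_PER_CHUNK	64
#define INODES_PER_HOLEMASK_BIT	4

struct chunk_masks { uint16_t holemask; uint64_t free; };

// present(i): does ondisk space exist for inode i of the chunk?
// in_use(i): is inode i allocated (nonzero mode)?
static struct chunk_masks build_masks(bool (*present)(unsigned int),
				      bool (*in_use)(unsigned int))
{
	struct chunk_masks m = { .holemask = 0xFFFF, .free = ~0ULL };
	unsigned int i;

	for (i = 0; i < INODES_PER_CHUNK; i++) {
		if (!present(i))
			continue;
		// Space exists, so clear the hole bit covering this inode.
		m.holemask &= ~(1U << (i / INODES_PER_HOLEMASK_BIT));
		if (in_use(i))
			m.free &= ~(1ULL << i);	// allocated, so not free
	}
	return m;
}

// Example layout: only the first 32 inodes of the chunk exist (a sparse
// chunk), and the first 4 of those are in use.
static bool ex_present(unsigned int i) { return i < 32; }
static bool ex_in_use(unsigned int i) { return i < 4; }

int main(void)
{
	struct chunk_masks m = build_masks(ex_present, ex_in_use);

	printf("holemask=0x%04x free=0x%016llx\n",
	       m.holemask, (unsigned long long)m.free);
	return 0;	// holemask=0xff00 free=0xfffffffffffffff0
}
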
+ */ +STATIC int +xrep_ibt_check_ifree( + struct xrep_ibt *ri, + xfs_agino_t cluster_ag_base, + struct xfs_buf *cluster_bp, + unsigned int cluster_index, + bool *inuse) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_dinode *dip; + xfs_ino_t fsino; + xfs_agino_t agino; + xfs_agnumber_t agno = ri->sc->sa.pag->pag_agno; + unsigned int cluster_buf_base; + unsigned int offset; + int error; + + agino = cluster_ag_base + cluster_index; + fsino = XFS_AGINO_TO_INO(mp, agno, agino); + + /* Inode uncached or half assembled, read disk buffer */ + cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base); + offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize; + if (offset >= BBTOB(cluster_bp->b_length)) + return -EFSCORRUPTED; + dip = xfs_buf_offset(cluster_bp, offset); + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) + return -EFSCORRUPTED; + + if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino) + return -EFSCORRUPTED; + + /* Will the in-core inode tell us if it's in use? */ + error = xchk_inode_is_allocated(sc, agino, inuse); + if (!error) + return 0; + + *inuse = dip->di_mode != 0; + return 0; +} + +/* Stash the accumulated inobt record for rebuilding. */ +STATIC int +xrep_ibt_stash( + struct xrep_ibt *ri) +{ + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + ri->rie.ir_freecount = xfs_inobt_rec_freecount(&ri->rie); + if (xfs_inobt_check_irec(ri->sc->sa.pag, &ri->rie) != NULL) + return -EFSCORRUPTED; + + if (ri->rie.ir_freecount > 0) + ri->finobt_recs++; + + trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie); + + error = xfarray_append(ri->inode_records, &ri->rie); + if (error) + return error; + + ri->rie.ir_startino = NULLAGINO; + return 0; +} + +/* + * Given an extent of inodes and an inode cluster buffer, calculate the + * location of the corresponding inobt record (creating it if necessary), + * then update the parts of the holemask and freemask of that record that + * correspond to the inode extent we were given. + * + * @cluster_ir_startino is the AG inode number of an inobt record that we're + * proposing to create for this inode cluster. If sparse inodes are enabled, + * we must round down to a chunk boundary to find the actual sparse record. + * @cluster_bp is the buffer of the inode cluster. + * @nr_inodes is the number of inodes to check from the cluster. + */ +STATIC int +xrep_ibt_cluster_record( + struct xrep_ibt *ri, + xfs_agino_t cluster_ir_startino, + struct xfs_buf *cluster_bp, + unsigned int nr_inodes) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + xfs_agino_t ir_startino; + unsigned int cluster_base; + unsigned int cluster_index; + int error = 0; + + ir_startino = cluster_ir_startino; + if (xfs_has_sparseinodes(mp)) + ir_startino = rounddown(ir_startino, XFS_INODES_PER_CHUNK); + cluster_base = cluster_ir_startino - ir_startino; + + /* + * If the accumulated inobt record doesn't map this cluster, add it to + * the list and reset it. + */ + if (ri->rie.ir_startino != NULLAGINO && + ri->rie.ir_startino + XFS_INODES_PER_CHUNK <= ir_startino) { + error = xrep_ibt_stash(ri); + if (error) + return error; + } + + if (ri->rie.ir_startino == NULLAGINO) { + ri->rie.ir_startino = ir_startino; + ri->rie.ir_free = XFS_INOBT_ALL_FREE; + ri->rie.ir_holemask = 0xFFFF; + ri->rie.ir_count = 0; + } + + /* Record the whole cluster. 
*/ + ri->icount += nr_inodes; + ri->rie.ir_count += nr_inodes; + ri->rie.ir_holemask &= ~xfs_inobt_maskn( + cluster_base / XFS_INODES_PER_HOLEMASK_BIT, + nr_inodes / XFS_INODES_PER_HOLEMASK_BIT); + + /* Which inodes within this cluster are free? */ + for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) { + bool inuse = false; + + error = xrep_ibt_check_ifree(ri, cluster_ir_startino, + cluster_bp, cluster_index, &inuse); + if (error) + return error; + if (!inuse) + continue; + ri->iused++; + ri->rie.ir_free &= ~XFS_INOBT_MASK(cluster_base + + cluster_index); + } + return 0; +} + +/* + * For each inode cluster covering the physical extent recorded by the rmapbt, + * we must calculate the properly aligned startino of that cluster, then + * iterate each cluster to fill in used and filled masks appropriately. We + * then use the (startino, used, filled) information to construct the + * appropriate inode records. + */ +STATIC int +xrep_ibt_process_cluster( + struct xrep_ibt *ri, + xfs_agblock_t cluster_bno) +{ + struct xfs_imap imap; + struct xfs_buf *cluster_bp; + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agino_t cluster_ag_base; + xfs_agino_t irec_index; + unsigned int nr_inodes; + int error; + + nr_inodes = min_t(unsigned int, igeo->inodes_per_cluster, + XFS_INODES_PER_CHUNK); + + /* + * Grab the inode cluster buffer. This is safe to do with a broken + * inobt because imap_to_bp directly maps the buffer without touching + * either inode btree. + */ + imap.im_blkno = XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno, cluster_bno); + imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster); + imap.im_boffset = 0; + error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp); + if (error) + return error; + + /* + * Record the contents of each possible inobt record mapping this + * cluster. + */ + cluster_ag_base = XFS_AGB_TO_AGINO(mp, cluster_bno); + for (irec_index = 0; + irec_index < igeo->inodes_per_cluster; + irec_index += XFS_INODES_PER_CHUNK) { + error = xrep_ibt_cluster_record(ri, + cluster_ag_base + irec_index, cluster_bp, + nr_inodes); + if (error) + break; + + } + + xfs_trans_brelse(sc->tp, cluster_bp); + return error; +} + +/* Check for any obvious conflicts in the inode chunk extent. */ +STATIC int +xrep_ibt_check_inode_ext( + struct xfs_scrub *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agino_t agino; + enum xbtree_recpacking outcome; + int error; + + /* Inode records must be within the AG. */ + if (!xfs_verify_agbext(sc->sa.pag, agbno, len)) + return -EFSCORRUPTED; + + /* The entire record must align to the inode cluster size. */ + if (!IS_ALIGNED(agbno, igeo->blocks_per_cluster) || + !IS_ALIGNED(agbno + len, igeo->blocks_per_cluster)) + return -EFSCORRUPTED; + + /* + * The entire record must also adhere to the inode cluster alignment + * size if sparse inodes are not enabled. + */ + if (!xfs_has_sparseinodes(mp) && + (!IS_ALIGNED(agbno, igeo->cluster_align) || + !IS_ALIGNED(agbno + len, igeo->cluster_align))) + return -EFSCORRUPTED; + + /* + * On a sparse inode fs, this cluster could be part of a sparse chunk. + * Sparse clusters must be aligned to sparse chunk alignment. + */ + if (xfs_has_sparseinodes(mp) && + (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) || + !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align))) + return -EFSCORRUPTED; + + /* Make sure the entire range of blocks are valid AG inodes. 
*/ + agino = XFS_AGB_TO_AGINO(mp, agbno); + if (!xfs_verify_agino(sc->sa.pag, agino)) + return -EFSCORRUPTED; + + agino = XFS_AGB_TO_AGINO(mp, agbno + len) - 1; + if (!xfs_verify_agino(sc->sa.pag, agino)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Found a fragment of the old inode btrees; dispose of them later. */ +STATIC int +xrep_ibt_record_old_btree_blocks( + struct xrep_ibt *ri, + const struct xfs_rmap_irec *rec) +{ + if (!xfs_verify_agbext(ri->sc->sa.pag, rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + return xagb_bitmap_set(&ri->old_iallocbt_blocks, rec->rm_startblock, + rec->rm_blockcount); +} + +/* Record extents that belong to inode cluster blocks. */ +STATIC int +xrep_ibt_record_inode_blocks( + struct xrep_ibt *ri, + const struct xfs_rmap_irec *rec) +{ + struct xfs_mount *mp = ri->sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agblock_t cluster_base; + int error; + + error = xrep_ibt_check_inode_ext(ri->sc, rec->rm_startblock, + rec->rm_blockcount); + if (error) + return error; + + trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno, + rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, + rec->rm_offset, rec->rm_flags); + + /* + * Record the free/hole masks for each inode cluster that could be + * mapped by this rmap record. + */ + for (cluster_base = 0; + cluster_base < rec->rm_blockcount; + cluster_base += igeo->blocks_per_cluster) { + error = xrep_ibt_process_cluster(ri, + rec->rm_startblock + cluster_base); + if (error) + return error; + } + + return 0; +} + +STATIC int +xrep_ibt_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_ibt *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + switch (rec->rm_owner) { + case XFS_RMAP_OWN_INOBT: + return xrep_ibt_record_old_btree_blocks(ri, rec); + case XFS_RMAP_OWN_INODES: + return xrep_ibt_record_inode_blocks(ri, rec); + } + return 0; +} + +/* + * Iterate all reverse mappings to find the inodes (OWN_INODES) and the inode + * btrees (OWN_INOBT). Figure out if we have enough free space to reconstruct + * the inode btrees. The caller must clean up the lists if anything goes + * wrong. + */ +STATIC int +xrep_ibt_find_inodes( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + ri->rie.ir_startino = NULLAGINO; + + /* Collect all reverse mappings for inode blocks. */ + xrep_ag_btcur_init(sc, &sc->sa); + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_ibt_walk_rmap, ri); + xchk_ag_btcur_free(&sc->sa); + if (error) + return error; + + /* If we have a record ready to go, add it to the array. */ + if (ri->rie.ir_startino != NULLAGINO) + return xrep_ibt_stash(ri); + + return 0; +} + +/* Update the AGI counters. */ +STATIC int +xrep_ibt_reset_counters( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + unsigned int freecount = ri->icount - ri->iused; + + /* Trigger inode count recalculation */ + xfs_force_summary_recalc(sc->mp); + + /* + * The AGI header contains extra information related to the inode + * btrees, so we must update those fields here. 
+ */ + agi->agi_count = cpu_to_be32(ri->icount); + agi->agi_freecount = cpu_to_be32(freecount); + xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, + XFS_AGI_COUNT | XFS_AGI_FREECOUNT); + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagi(sc); +} + +/* Retrieve finobt data for bulk load. */ +STATIC int +xrep_fibt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i; + struct xrep_ibt *ri = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + do { + error = xfarray_load(ri->inode_records, + ri->array_cur++, irec); + } while (error == 0 && xfs_inobt_rec_freecount(irec) == 0); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Retrieve inobt data for bulk load. */ +STATIC int +xrep_ibt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i; + struct xrep_ibt *ri = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load(ri->inode_records, ri->array_cur++, irec); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new inobt blocks to the bulk loader. */ +STATIC int +xrep_ibt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_ibt *ri = priv; + + return xrep_newbt_claim_block(cur, &ri->new_inobt, ptr); +} + +/* Feed one of the new finobt blocks to the bulk loader. */ +STATIC int +xrep_fibt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_ibt *ri = priv; + + return xrep_newbt_claim_block(cur, &ri->new_finobt, ptr); +} + +/* Make sure the records do not overlap in inumber address space. */ +STATIC int +xrep_ibt_check_overlap( + struct xrep_ibt *ri) +{ + struct xfs_inobt_rec_incore irec; + xfarray_idx_t cur; + xfs_agino_t next_agino = 0; + int error = 0; + + foreach_xfarray_idx(ri->inode_records, cur) { + if (xchk_should_terminate(ri->sc, &error)) + return error; + + error = xfarray_load(ri->inode_records, cur, &irec); + if (error) + return error; + + if (irec.ir_startino < next_agino) + return -EFSCORRUPTED; + + next_agino = irec.ir_startino + XFS_INODES_PER_CHUNK; + } + + return error; +} + +/* Build new inode btrees and dispose of the old one. */ +STATIC int +xrep_ibt_build_new_trees( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_btree_cur *ino_cur; + struct xfs_btree_cur *fino_cur = NULL; + xfs_fsblock_t fsbno; + bool need_finobt; + int error; + + need_finobt = xfs_has_finobt(sc->mp); + + /* + * Create new btrees for staging all the inobt records we collected + * earlier. The records were collected in order of increasing agino, + * so we do not have to sort them. Ensure there are no overlapping + * records. + */ + error = xrep_ibt_check_overlap(ri); + if (error) + return error; + + /* + * The new inode btrees will not be rooted in the AGI until we've + * successfully rebuilt the tree. + * + * Start by setting up the inobt staging cursor. 
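
The staging cursors set up below are sized with xfs_btree_bload_compute_geometry() before any blocks are reserved. As a rough standalone model of that sizing, ignoring the slack factors and per-level record limits the real code applies, repeatedly divide the record count by the per-block capacity until a single root block remains:

// Illustrative sketch only, not part of the patch: a simplified model of
// bulk-load geometry, counting how many blocks a new btree needs for a
// given number of records.
#include <stdio.h>
#include <stdint.h>

static uint64_t div_round_up(uint64_t n, uint64_t d)
{
	return (n + d - 1) / d;
}

// Returns total blocks needed; *levels is set to the btree height.
static uint64_t bload_geometry(uint64_t nr_records, uint64_t leaf_maxrecs,
			       uint64_t node_maxrecs, unsigned int *levels)
{
	uint64_t blocks;
	uint64_t this_level = div_round_up(nr_records, leaf_maxrecs);

	*levels = 1;
	blocks = this_level;
	while (this_level > 1) {
		this_level = div_round_up(this_level, node_maxrecs);
		blocks += this_level;
		(*levels)++;
	}
	return blocks;
}

int main(void)
{
	unsigned int levels;
	uint64_t blocks = bload_geometry(10000, 100, 120, &levels);

	printf("blocks=%llu levels=%u\n",
	       (unsigned long long)blocks, levels);
	return 0;	// 10000 recs: 100 leaves + 1 root = 101 blocks, 2 levels
}
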
+ */ + fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_IBT_BLOCK(sc->mp)), + xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno, + XFS_AG_RESV_NONE); + ri->new_inobt.bload.claim_block = xrep_ibt_claim_block; + ri->new_inobt.bload.get_records = xrep_ibt_get_records; + + ino_cur = xfs_inobt_stage_cursor(sc->sa.pag, &ri->new_inobt.afake, + XFS_BTNUM_INO); + error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload, + xfarray_length(ri->inode_records)); + if (error) + goto err_inocur; + + /* Set up finobt staging cursor. */ + if (need_finobt) { + enum xfs_ag_resv_type resv = XFS_AG_RESV_METADATA; + + if (sc->mp->m_finobt_nores) + resv = XFS_AG_RESV_NONE; + + fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_FIBT_BLOCK(sc->mp)), + xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT, + fsbno, resv); + ri->new_finobt.bload.claim_block = xrep_fibt_claim_block; + ri->new_finobt.bload.get_records = xrep_fibt_get_records; + + fino_cur = xfs_inobt_stage_cursor(sc->sa.pag, + &ri->new_finobt.afake, XFS_BTNUM_FINO); + error = xfs_btree_bload_compute_geometry(fino_cur, + &ri->new_finobt.bload, ri->finobt_recs); + if (error) + goto err_finocur; + } + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_finocur; + + /* Reserve all the space we need to build the new btrees. */ + error = xrep_newbt_alloc_blocks(&ri->new_inobt, + ri->new_inobt.bload.nr_blocks); + if (error) + goto err_finocur; + + if (need_finobt) { + error = xrep_newbt_alloc_blocks(&ri->new_finobt, + ri->new_finobt.bload.nr_blocks); + if (error) + goto err_finocur; + } + + /* Add all inobt records. */ + ri->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(ino_cur, &ri->new_inobt.bload, ri); + if (error) + goto err_finocur; + + /* Add all finobt records. */ + if (need_finobt) { + ri->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(fino_cur, &ri->new_finobt.bload, ri); + if (error) + goto err_finocur; + } + + /* + * Install the new btrees in the AG header. After this point the old + * btrees are no longer accessible and the new trees are live. + */ + xfs_inobt_commit_staged_btree(ino_cur, sc->tp, sc->sa.agi_bp); + xfs_btree_del_cursor(ino_cur, 0); + + if (fino_cur) { + xfs_inobt_commit_staged_btree(fino_cur, sc->tp, sc->sa.agi_bp); + xfs_btree_del_cursor(fino_cur, 0); + } + + /* Reset the AGI counters now that we've changed the inode roots. */ + error = xrep_ibt_reset_counters(ri); + if (error) + goto err_finobt; + + /* Free unused blocks and bitmap. */ + if (need_finobt) { + error = xrep_newbt_commit(&ri->new_finobt); + if (error) + goto err_inobt; + } + error = xrep_newbt_commit(&ri->new_inobt); + if (error) + return error; + + return xrep_roll_ag_trans(sc); + +err_finocur: + if (need_finobt) + xfs_btree_del_cursor(fino_cur, error); +err_inocur: + xfs_btree_del_cursor(ino_cur, error); +err_finobt: + if (need_finobt) + xrep_newbt_cancel(&ri->new_finobt); +err_inobt: + xrep_newbt_cancel(&ri->new_inobt); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_ibt_remove_old_trees( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + /* + * Free the old inode btree blocks if they're not in use. It's ok to + * reap with XFS_AG_RESV_NONE even if the finobt had a per-AG + * reservation because we reset the reservation before releasing the + * AGI and AGF header buffer locks. 
+ */ + error = xrep_reap_agblocks(sc, &ri->old_iallocbt_blocks, + &XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE); + if (error) + return error; + + /* + * If the finobt is enabled and has a per-AG reservation, make sure we + * reinitialize the per-AG reservations. + */ + if (xfs_has_finobt(sc->mp) && !sc->mp->m_finobt_nores) + sc->flags |= XREP_RESET_PERAG_RESV; + + return 0; +} + +/* Repair both inode btrees. */ +int +xrep_iallocbt( + struct xfs_scrub *sc) +{ + struct xrep_ibt *ri; + struct xfs_mount *mp = sc->mp; + char *descr; + xfs_agino_t first_agino, last_agino; + int error = 0; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + ri = kzalloc(sizeof(struct xrep_ibt), XCHK_GFP_FLAGS); + if (!ri) + return -ENOMEM; + ri->sc = sc; + + /* We rebuild both inode btrees. */ + sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT; + + /* Set up enough storage to handle an AG with nothing but inodes. */ + xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino); + last_agino /= XFS_INODES_PER_CHUNK; + descr = xchk_xfile_ag_descr(sc, "inode index records"); + error = xfarray_create(descr, last_agino, + sizeof(struct xfs_inobt_rec_incore), + &ri->inode_records); + kfree(descr); + if (error) + goto out_ri; + + /* Collect the inode data and find the old btree blocks. */ + xagb_bitmap_init(&ri->old_iallocbt_blocks); + error = xrep_ibt_find_inodes(ri); + if (error) + goto out_bitmap; + + /* Rebuild the inode indexes. */ + error = xrep_ibt_build_new_trees(ri); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_ibt_remove_old_trees(ri); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&ri->old_iallocbt_blocks); + xfarray_destroy(ri->inode_records); +out_ri: + kfree(ri); + return error; +} + +/* Make sure both btrees are ok after we've rebuilt them. */ +int +xrep_revalidate_iallocbt( + struct xfs_scrub *sc) +{ + __u32 old_type = sc->sm->sm_type; + int error; + + /* + * We must update sm_type temporarily so that the tree-to-tree cross + * reference checks will work in the correct direction, and also so + * that tracing will report correctly if there are more errors. + */ + sc->sm->sm_type = XFS_SCRUB_TYPE_INOBT; + error = xchk_iallocbt(sc); + if (error) + goto out; + + if (xfs_has_finobt(sc->mp)) { + sc->sm->sm_type = XFS_SCRUB_TYPE_FINOBT; + error = xchk_iallocbt(sc); + } + +out: + sc->sm->sm_type = old_type; + return error; +} diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 889f556bc98f..6e2fe2d6250b 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -25,6 +25,7 @@ #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/trace.h" +#include "scrub/repair.h" /* Prepare the attached inode for scrubbing. */ static inline int @@ -39,6 +40,10 @@ xchk_prepare_iscrub( if (error) return error; + error = xchk_ino_dqattach(sc); + if (error) + return error; + xchk_ilock(sc, XFS_ILOCK_EXCL); return 0; } @@ -95,8 +100,8 @@ xchk_setup_inode( if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) return -ENOENT; - /* Try a regular untrusted iget. */ - error = xchk_iget(sc, sc->sm->sm_ino, &ip); + /* Try a safe untrusted iget. */ + error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip); if (!error) return xchk_install_handle_iscrub(sc, ip); if (error == -ENOENT) @@ -181,8 +186,11 @@ xchk_setup_inode( * saying the inode is allocated and the icache being unable to load * the inode until we can flag the corruption in xchk_inode. 
The * scrub function has to note the corruption, since we're not really - * supposed to do that from the setup function. + * supposed to do that from the setup function. Save the mapping to + * make repairs to the ondisk inode buffer. */ + if (xchk_could_repair(sc)) + xrep_setup_inode(sc, &imap); return 0; out_cancel: @@ -338,6 +346,10 @@ xchk_inode_flags2( if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp)) goto bad; + /* no large extent counts without the filesystem feature */ + if ((flags2 & XFS_DIFLAG2_NREXT64) && !xfs_has_large_extent_counts(mp)) + goto bad; + return; bad: xchk_ino_set_corrupt(sc, ino); @@ -548,7 +560,7 @@ xchk_dinode( } /* di_forkoff */ - if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) + if (XFS_DFORK_BOFF(dip) >= mp->m_sb.sb_inodesize) xchk_ino_set_corrupt(sc, ino); if (naextents != 0 && dip->di_forkoff == 0) xchk_ino_set_corrupt(sc, ino); diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c new file mode 100644 index 000000000000..0ca62d59f84a --- /dev/null +++ b/fs/xfs/scrub/inode_repair.c @@ -0,0 +1,1525 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_inode_buf.h" +#include "xfs_inode_fork.h" +#include "xfs_ialloc.h" +#include "xfs_da_format.h" +#include "xfs_reflink.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_bmap_util.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_quota_defs.h" +#include "xfs_quota.h" +#include "xfs_ag.h" +#include "xfs_rtbitmap.h" +#include "xfs_attr_leaf.h" +#include "xfs_log_priv.h" +#include "xfs_health.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" + +/* + * Inode Record Repair + * =================== + * + * Roughly speaking, inode problems can be classified based on whether or not + * they trip the dinode verifiers. If those trip, then we won't be able to + * xfs_iget ourselves the inode. + * + * Therefore, the xrep_dinode_* functions fix anything that will cause the + * inode buffer verifier or the dinode verifier. The xrep_inode_* functions + * fix things on live incore inodes. The inode repair functions make decisions + * with security and usability implications when reviving a file: + * + * - Files with zero di_mode or a garbage di_mode are converted to regular file + * that only root can read. This file may not actually contain user data, + * if the file was not previously a regular file. Setuid and setgid bits + * are cleared. + * + * - Zero-size directories can be truncated to look empty. It is necessary to + * run the bmapbtd and directory repair functions to fully rebuild the + * directory. + * + * - Zero-size symbolic link targets can be truncated to '?'. It is necessary + * to run the bmapbtd and symlink repair functions to salvage the symlink. + * + * - Invalid extent size hints will be removed. 
+ * + * - Quotacheck will be scheduled if we repaired an inode that was so badly + * damaged that the ondisk inode had to be rebuilt. + * + * - Invalid user, group, or project IDs (aka -1U) will be reset to zero. + * Setuid and setgid bits are cleared. + * + * - Data and attr forks are reset to extents format with zero extents if the + * fork data is inconsistent. It is necessary to run the bmapbtd or bmapbta + * repair functions to recover the space mapping. + * + * - ACLs will not be recovered if the attr fork is zapped or the extended + * attribute structure itself requires salvaging. + * + * - If the attr fork is zapped, the user and group ids are reset to root and + * the setuid and setgid bits are removed. + */ + +/* + * All the information we need to repair the ondisk inode if we can't iget the + * incore inode. We don't allocate this buffer unless we're going to perform + * a repair to the ondisk inode cluster buffer. + */ +struct xrep_inode { + /* Inode mapping that we saved from the initial lookup attempt. */ + struct xfs_imap imap; + + struct xfs_scrub *sc; + + /* Blocks in use on the data device by data extents or bmbt blocks. */ + xfs_rfsblock_t data_blocks; + + /* Blocks in use on the rt device. */ + xfs_rfsblock_t rt_blocks; + + /* Blocks in use by the attr fork. */ + xfs_rfsblock_t attr_blocks; + + /* Number of data device extents for the data fork. */ + xfs_extnum_t data_extents; + + /* + * Number of realtime device extents for the data fork. If + * data_extents and rt_extents indicate that the data fork has extents + * on both devices, we'll just back away slowly. + */ + xfs_extnum_t rt_extents; + + /* Number of (data device) extents for the attr fork. */ + xfs_aextnum_t attr_extents; + + /* Sick state to set after zapping parts of the inode. */ + unsigned int ino_sick_mask; + + /* Must we remove all access from this file? */ + bool zap_acls; +}; + +/* + * Setup function for inode repair. @imap contains the ondisk inode mapping + * information so that we can correct the ondisk inode cluster buffer if + * necessary to make iget work. + */ +int +xrep_setup_inode( + struct xfs_scrub *sc, + const struct xfs_imap *imap) +{ + struct xrep_inode *ri; + + sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + + ri = sc->buf; + memcpy(&ri->imap, imap, sizeof(struct xfs_imap)); + ri->sc = sc; + return 0; +} + +/* + * Make sure this ondisk inode can pass the inode buffer verifier. This is + * not the same as the dinode verifier. 
+ */ +STATIC void +xrep_dinode_buf_core( + struct xfs_scrub *sc, + struct xfs_buf *bp, + unsigned int ioffset) +{ + struct xfs_dinode *dip = xfs_buf_offset(bp, ioffset); + struct xfs_trans *tp = sc->tp; + struct xfs_mount *mp = sc->mp; + xfs_agino_t agino; + bool crc_ok = false; + bool magic_ok = false; + bool unlinked_ok = false; + + agino = be32_to_cpu(dip->di_next_unlinked); + + if (xfs_verify_agino_or_null(bp->b_pag, agino)) + unlinked_ok = true; + + if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + xfs_dinode_good_version(mp, dip->di_version)) + magic_ok = true; + + if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, + XFS_DINODE_CRC_OFF)) + crc_ok = true; + + if (magic_ok && unlinked_ok && crc_ok) + return; + + if (!magic_ok) { + dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + dip->di_version = 3; + } + if (!unlinked_ok) + dip->di_next_unlinked = cpu_to_be32(NULLAGINO); + xfs_dinode_calc_crc(mp, dip); + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); + xfs_trans_log_buf(tp, bp, ioffset, + ioffset + sizeof(struct xfs_dinode) - 1); +} + +/* Make sure this inode cluster buffer can pass the inode buffer verifier. */ +STATIC void +xrep_dinode_buf( + struct xfs_scrub *sc, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = sc->mp; + int i; + int ni; + + ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; + for (i = 0; i < ni; i++) + xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog); +} + +/* Reinitialize things that never change in an inode. */ +STATIC void +xrep_dinode_header( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + trace_xrep_dinode_header(sc, dip); + + dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + if (!xfs_dinode_good_version(sc->mp, dip->di_version)) + dip->di_version = 3; + dip->di_ino = cpu_to_be64(sc->sm->sm_ino); + uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid); + dip->di_gen = cpu_to_be32(sc->sm->sm_gen); +} + +/* Turn di_mode into /something/ recognizable. */ +STATIC void +xrep_dinode_mode( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + uint16_t mode = be16_to_cpu(dip->di_mode); + + trace_xrep_dinode_mode(sc, dip); + + if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN) + return; + + /* bad mode, so we set it to a file that only root can read */ + mode = S_IFREG; + dip->di_mode = cpu_to_be16(mode); + dip->di_uid = 0; + dip->di_gid = 0; + ri->zap_acls = true; +} + +/* Fix any conflicting flags that the verifiers complain about. */ +STATIC void +xrep_dinode_flags( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + bool isrt) +{ + struct xfs_mount *mp = sc->mp; + uint64_t flags2 = be64_to_cpu(dip->di_flags2); + uint16_t flags = be16_to_cpu(dip->di_flags); + uint16_t mode = be16_to_cpu(dip->di_mode); + + trace_xrep_dinode_flags(sc, dip); + + if (isrt) + flags |= XFS_DIFLAG_REALTIME; + else + flags &= ~XFS_DIFLAG_REALTIME; + + /* + * For regular files on a reflink filesystem, set the REFLINK flag to + * protect shared extents. A later stage will actually check those + * extents and clear the flag if possible. 
+ */ + if (xfs_has_reflink(mp) && S_ISREG(mode)) + flags2 |= XFS_DIFLAG2_REFLINK; + else + flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE); + if (flags & XFS_DIFLAG_REALTIME) + flags2 &= ~XFS_DIFLAG2_REFLINK; + if (!xfs_has_bigtime(mp)) + flags2 &= ~XFS_DIFLAG2_BIGTIME; + if (!xfs_has_large_extent_counts(mp)) + flags2 &= ~XFS_DIFLAG2_NREXT64; + if (flags2 & XFS_DIFLAG2_NREXT64) + dip->di_nrext64_pad = 0; + else if (dip->di_version >= 3) + dip->di_v3_pad = 0; + dip->di_flags = cpu_to_be16(flags); + dip->di_flags2 = cpu_to_be64(flags2); +} + +/* + * Blow out symlink; now it points nowhere. We don't have to worry about + * incore state because this inode is failing the verifiers. + */ +STATIC void +xrep_dinode_zap_symlink( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + char *p; + + trace_xrep_dinode_zap_symlink(sc, dip); + + dip->di_format = XFS_DINODE_FMT_LOCAL; + dip->di_size = cpu_to_be64(1); + p = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + *p = '?'; + ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; +} + +/* + * Blow out dir, make the parent point to the root. In the future repair will + * reconstruct this directory for us. Note that there's no in-core directory + * inode because the sf verifier tripped, so we don't have to worry about the + * dentry cache. + */ +STATIC void +xrep_dinode_zap_dir( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_dir2_sf_hdr *sfp; + int i8count; + + trace_xrep_dinode_zap_dir(sc, dip); + + dip->di_format = XFS_DINODE_FMT_LOCAL; + i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM; + sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + sfp->count = 0; + sfp->i8count = i8count; + xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino); + dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count)); + ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED; +} + +/* Make sure we don't have a garbage file size. */ +STATIC void +xrep_dinode_size( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + uint64_t size = be64_to_cpu(dip->di_size); + uint16_t mode = be16_to_cpu(dip->di_mode); + + trace_xrep_dinode_size(sc, dip); + + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + /* di_size can't be nonzero for special files */ + dip->di_size = 0; + break; + case S_IFREG: + /* Regular files can't be larger than 2^63-1 bytes. */ + dip->di_size = cpu_to_be64(size & ~(1ULL << 63)); + break; + case S_IFLNK: + /* + * Truncate ridiculously oversized symlinks. If the size is + * zero, reset it to point to the current directory. Both of + * these conditions trigger dinode verifier errors, so there + * is no in-core state to reset. + */ + if (size > XFS_SYMLINK_MAXLEN) + dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN); + else if (size == 0) + xrep_dinode_zap_symlink(ri, dip); + break; + case S_IFDIR: + /* + * Directories can't have a size larger than 32G. If the size + * is zero, reset it to an empty directory. Both of these + * conditions trigger dinode verifier errors, so there is no + * in-core state to reset. + */ + if (size > XFS_DIR2_SPACE_SIZE) + dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE); + else if (size == 0) + xrep_dinode_zap_dir(ri, dip); + break; + } +} + +/* Fix extent size hints. 
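The size fixes above clamp di_size according to the file type: special files get zero, regular files are kept non-negative, symlinks and directories are capped. A small user-space restatement of that decision table follows; the two cap constants are stand-ins for the real XFS limits, not their actual definitions.

#include <stdio.h>
#include <stdint.h>
#include <sys/stat.h>

#define FAKE_SYMLINK_MAXLEN	1024ULL		/* stand-in for XFS_SYMLINK_MAXLEN */
#define FAKE_DIR_SPACE_SIZE	(32ULL << 30)	/* stand-in for the 32G directory cap */

static uint64_t clamp_size(unsigned short mode, uint64_t size)
{
	switch (mode & S_IFMT) {
	case S_IFIFO: case S_IFCHR: case S_IFBLK: case S_IFSOCK:
		return 0;			/* special files have no size */
	case S_IFREG:
		return size & ~(1ULL << 63);	/* keep the size non-negative */
	case S_IFLNK:
		return size > FAKE_SYMLINK_MAXLEN ? FAKE_SYMLINK_MAXLEN : size;
	case S_IFDIR:
		return size > FAKE_DIR_SPACE_SIZE ? FAKE_DIR_SPACE_SIZE : size;
	default:
		return size;
	}
}

int main(void)
{
	/* A 40G directory gets clamped down to the 32G cap. */
	printf("%llu\n", (unsigned long long)clamp_size(S_IFDIR, 40ULL << 30));
	return 0;
}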
*/ +STATIC void +xrep_dinode_extsize_hints( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + struct xfs_mount *mp = sc->mp; + uint64_t flags2 = be64_to_cpu(dip->di_flags2); + uint16_t flags = be16_to_cpu(dip->di_flags); + uint16_t mode = be16_to_cpu(dip->di_mode); + + xfs_failaddr_t fa; + + trace_xrep_dinode_extsize_hints(sc, dip); + + fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize), + mode, flags); + if (fa) { + dip->di_extsize = 0; + dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + } + + if (dip->di_version < 3) + return; + + fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), + mode, flags, flags2); + if (fa) { + dip->di_cowextsize = 0; + dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE); + } +} + +/* Count extents and blocks for an inode given an rmap. */ +STATIC int +xrep_dinode_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_inode *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + /* We only care about this inode. */ + if (rec->rm_owner != ri->sc->sm->sm_ino) + return 0; + + if (rec->rm_flags & XFS_RMAP_ATTR_FORK) { + ri->attr_blocks += rec->rm_blockcount; + if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) + ri->attr_extents++; + + return 0; + } + + ri->data_blocks += rec->rm_blockcount; + if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) + ri->data_extents++; + + return 0; +} + +/* Count extents and blocks for an inode from all AG rmap data. */ +STATIC int +xrep_dinode_count_ag_rmaps( + struct xrep_inode *ri, + struct xfs_perag *pag) +{ + struct xfs_btree_cur *cur; + struct xfs_buf *agf; + int error; + + error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf); + if (error) + return error; + + cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag); + error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri); + xfs_btree_del_cursor(cur, error); + xfs_trans_brelse(ri->sc->tp, agf); + return error; +} + +/* Count extents and blocks for a given inode from all rmap data. */ +STATIC int +xrep_dinode_count_rmaps( + struct xrep_inode *ri) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + int error; + + if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp)) + return -EOPNOTSUPP; + + for_each_perag(ri->sc->mp, agno, pag) { + error = xrep_dinode_count_ag_rmaps(ri, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + /* Can't have extents on both the rt and the data device. */ + if (ri->data_extents && ri->rt_extents) + return -EFSCORRUPTED; + + trace_xrep_dinode_count_rmaps(ri->sc, + ri->data_blocks, ri->rt_blocks, ri->attr_blocks, + ri->data_extents, ri->rt_extents, ri->attr_extents); + return 0; +} + +/* Return true if this extents-format ifork looks like garbage. 
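The helper that follows decides whether an extents-format fork is plausible at all; its cheapest test is whether the claimed extent count even fits inside the fork area. A toy version of that arithmetic, with invented sizes, is sketched here.

#include <stdio.h>

int main(void)
{
	unsigned int dfork_size = 176;		/* bytes available in the fork area */
	unsigned int rec_size = 16;		/* ondisk extent record size */
	unsigned long long nextents = 20;	/* claimed extent count */

	/* More records than the fork can hold means the fork is garbage. */
	if (nextents > dfork_size / rec_size)
		printf("extents fork is garbage, zap it\n");
	else
		printf("extent count at least fits in the fork\n");
	return 0;
}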
*/ +STATIC bool +xrep_dinode_bad_extents_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size, + int whichfork) +{ + struct xfs_bmbt_irec new; + struct xfs_bmbt_rec *dp; + xfs_extnum_t nex; + bool isrt; + unsigned int i; + + nex = xfs_dfork_nextents(dip, whichfork); + if (nex > dfork_size / sizeof(struct xfs_bmbt_rec)) + return true; + + dp = XFS_DFORK_PTR(dip, whichfork); + + isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME); + for (i = 0; i < nex; i++, dp++) { + xfs_failaddr_t fa; + + xfs_bmbt_disk_get_all(dp, &new); + fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork, + &new); + if (fa) + return true; + } + + return false; +} + +/* Return true if this btree-format ifork looks like garbage. */ +STATIC bool +xrep_dinode_bad_bmbt_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size, + int whichfork) +{ + struct xfs_bmdr_block *dfp; + xfs_extnum_t nex; + unsigned int i; + unsigned int dmxr; + unsigned int nrecs; + unsigned int level; + + nex = xfs_dfork_nextents(dip, whichfork); + if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec)) + return true; + + if (dfork_size < sizeof(struct xfs_bmdr_block)) + return true; + + dfp = XFS_DFORK_PTR(dip, whichfork); + nrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size) + return true; + if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork)) + return true; + + dmxr = xfs_bmdr_maxrecs(dfork_size, 0); + for (i = 1; i <= nrecs; i++) { + struct xfs_bmbt_key *fkp; + xfs_bmbt_ptr_t *fpp; + xfs_fileoff_t fileoff; + xfs_fsblock_t fsbno; + + fkp = XFS_BMDR_KEY_ADDR(dfp, i); + fileoff = be64_to_cpu(fkp->br_startoff); + if (!xfs_verify_fileoff(sc->mp, fileoff)) + return true; + + fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr); + fsbno = be64_to_cpu(*fpp); + if (!xfs_verify_fsbno(sc->mp, fsbno)) + return true; + } + + return false; +} + +/* + * Check the data fork for things that will fail the ifork verifiers or the + * ifork formatters. + */ +STATIC bool +xrep_dinode_check_dfork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + uint16_t mode) +{ + void *dfork_ptr; + int64_t data_size; + unsigned int fmt; + unsigned int dfork_size; + + /* + * Verifier functions take signed int64_t, so check for bogus negative + * values first. + */ + data_size = be64_to_cpu(dip->di_size); + if (data_size < 0) + return true; + + fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK); + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + if (fmt != XFS_DINODE_FMT_DEV) + return true; + break; + case S_IFREG: + if (fmt == XFS_DINODE_FMT_LOCAL) + return true; + fallthrough; + case S_IFLNK: + case S_IFDIR: + switch (fmt) { + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + break; + default: + return true; + } + break; + default: + return true; + } + + dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK); + dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + + switch (fmt) { + case XFS_DINODE_FMT_DEV: + break; + case XFS_DINODE_FMT_LOCAL: + /* dir/symlink structure cannot be larger than the fork */ + if (data_size > dfork_size) + return true; + /* directory structure must pass verification. */ + if (S_ISDIR(mode) && + xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL) + return true; + /* symlink structure must pass verification. 
*/ + if (S_ISLNK(mode) && + xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL) + return true; + break; + case XFS_DINODE_FMT_EXTENTS: + if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size, + XFS_DATA_FORK)) + return true; + break; + case XFS_DINODE_FMT_BTREE: + if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size, + XFS_DATA_FORK)) + return true; + break; + default: + return true; + } + + return false; +} + +static void +xrep_dinode_set_data_nextents( + struct xfs_dinode *dip, + xfs_extnum_t nextents) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + dip->di_big_nextents = cpu_to_be64(nextents); + else + dip->di_nextents = cpu_to_be32(nextents); +} + +static void +xrep_dinode_set_attr_nextents( + struct xfs_dinode *dip, + xfs_extnum_t nextents) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + dip->di_big_anextents = cpu_to_be32(nextents); + else + dip->di_anextents = cpu_to_be16(nextents); +} + +/* Reset the data fork to something sane. */ +STATIC void +xrep_dinode_zap_dfork( + struct xrep_inode *ri, + struct xfs_dinode *dip, + uint16_t mode) +{ + struct xfs_scrub *sc = ri->sc; + + trace_xrep_dinode_zap_dfork(sc, dip); + + ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED; + + xrep_dinode_set_data_nextents(dip, 0); + ri->data_blocks = 0; + ri->rt_blocks = 0; + + /* Special files always get reset to DEV */ + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + dip->di_format = XFS_DINODE_FMT_DEV; + dip->di_size = 0; + return; + } + + /* + * If we have data extents, reset to an empty map and hope the user + * will run the bmapbtd checker next. + */ + if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) { + dip->di_format = XFS_DINODE_FMT_EXTENTS; + return; + } + + /* Otherwise, reset the local format to the minimum. */ + switch (mode & S_IFMT) { + case S_IFLNK: + xrep_dinode_zap_symlink(ri, dip); + break; + case S_IFDIR: + xrep_dinode_zap_dir(ri, dip); + break; + } +} + +/* + * Check the attr fork for things that will fail the ifork verifiers or the + * ifork formatters. + */ +STATIC bool +xrep_dinode_check_afork( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + struct xfs_attr_sf_hdr *afork_ptr; + size_t attr_size; + unsigned int afork_size; + + if (XFS_DFORK_BOFF(dip) == 0) + return dip->di_aformat != XFS_DINODE_FMT_EXTENTS || + xfs_dfork_attr_extents(dip) != 0; + + afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); + afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); + + switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) { + case XFS_DINODE_FMT_LOCAL: + /* Fork has to be large enough to extract the xattr size. */ + if (afork_size < sizeof(struct xfs_attr_sf_hdr)) + return true; + + /* xattr structure cannot be larger than the fork */ + attr_size = be16_to_cpu(afork_ptr->totsize); + if (attr_size > afork_size) + return true; + + /* xattr structure must pass verification. */ + return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL; + case XFS_DINODE_FMT_EXTENTS: + if (xrep_dinode_bad_extents_fork(sc, dip, afork_size, + XFS_ATTR_FORK)) + return true; + break; + case XFS_DINODE_FMT_BTREE: + if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size, + XFS_ATTR_FORK)) + return true; + break; + default: + return true; + } + + return false; +} + +/* + * Reset the attr fork to empty. Since the attr fork could have contained + * ACLs, make the file readable only by root. 
+ */ +STATIC void +xrep_dinode_zap_afork( + struct xrep_inode *ri, + struct xfs_dinode *dip, + uint16_t mode) +{ + struct xfs_scrub *sc = ri->sc; + + trace_xrep_dinode_zap_afork(sc, dip); + + ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED; + + dip->di_aformat = XFS_DINODE_FMT_EXTENTS; + xrep_dinode_set_attr_nextents(dip, 0); + ri->attr_blocks = 0; + + /* + * If the data fork is in btree format, removing the attr fork entirely + * might cause verifier failures if the next level down in the bmbt + * could now fit in the data fork area. + */ + if (dip->di_format != XFS_DINODE_FMT_BTREE) + dip->di_forkoff = 0; + dip->di_mode = cpu_to_be16(mode & ~0777); + dip->di_uid = 0; + dip->di_gid = 0; +} + +/* Make sure the fork offset is a sensible value. */ +STATIC void +xrep_dinode_ensure_forkoff( + struct xrep_inode *ri, + struct xfs_dinode *dip, + uint16_t mode) +{ + struct xfs_bmdr_block *bmdr; + struct xfs_scrub *sc = ri->sc; + xfs_extnum_t attr_extents, data_extents; + size_t bmdr_minsz = XFS_BMDR_SPACE_CALC(1); + unsigned int lit_sz = XFS_LITINO(sc->mp); + unsigned int afork_min, dfork_min; + + trace_xrep_dinode_ensure_forkoff(sc, dip); + + /* + * Before calling this function, xrep_dinode_core ensured that both + * forks actually fit inside their respective literal areas. If this + * was not the case, the fork was reset to FMT_EXTENTS with zero + * records. If the rmapbt scan found attr or data fork blocks, this + * will be noted in the dinode_stats, and we must leave enough room + * for the bmap repair code to reconstruct the mapping structure. + * + * First, compute the minimum space required for the attr fork. + */ + switch (dip->di_aformat) { + case XFS_DINODE_FMT_LOCAL: + /* + * If we still have a shortform xattr structure at all, that + * means the attr fork area was exactly large enough to fit + * the sf structure. + */ + afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); + break; + case XFS_DINODE_FMT_EXTENTS: + attr_extents = xfs_dfork_attr_extents(dip); + if (attr_extents) { + /* + * We must maintain sufficient space to hold the entire + * extent map array in the data fork. Note that we + * previously zapped the fork if it had no chance of + * fitting in the inode. + */ + afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents; + } else if (ri->attr_extents > 0) { + /* + * The attr fork thinks it has zero extents, but we + * found some xattr extents. We need to leave enough + * empty space here so that the incore attr fork will + * get created (and hence trigger the attr fork bmap + * repairer). + */ + afork_min = bmdr_minsz; + } else { + /* No extents on disk or found in rmapbt. */ + afork_min = 0; + } + break; + case XFS_DINODE_FMT_BTREE: + /* Must have space for btree header and key/pointers. */ + bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); + afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); + break; + default: + /* We should never see any other formats. */ + afork_min = 0; + break; + } + + /* Compute the minimum space required for the data fork. */ + switch (dip->di_format) { + case XFS_DINODE_FMT_DEV: + dfork_min = sizeof(__be32); + break; + case XFS_DINODE_FMT_UUID: + dfork_min = sizeof(uuid_t); + break; + case XFS_DINODE_FMT_LOCAL: + /* + * If we still have a shortform data fork at all, that means + * the data fork area was large enough to fit whatever was in + * there. 
+ */ + dfork_min = be64_to_cpu(dip->di_size); + break; + case XFS_DINODE_FMT_EXTENTS: + data_extents = xfs_dfork_data_extents(dip); + if (data_extents) { + /* + * We must maintain sufficient space to hold the entire + * extent map array in the data fork. Note that we + * previously zapped the fork if it had no chance of + * fitting in the inode. + */ + dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents; + } else if (ri->data_extents > 0 || ri->rt_extents > 0) { + /* + * The data fork thinks it has zero extents, but we + * found some data extents. We need to leave enough + * empty space here so that the data fork bmap repair + * will recover the mappings. + */ + dfork_min = bmdr_minsz; + } else { + /* No extents on disk or found in rmapbt. */ + dfork_min = 0; + } + break; + case XFS_DINODE_FMT_BTREE: + /* Must have space for btree header and key/pointers. */ + bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); + break; + default: + dfork_min = 0; + break; + } + + /* + * Round all values up to the nearest 8 bytes, because that is the + * precision of di_forkoff. + */ + afork_min = roundup(afork_min, 8); + dfork_min = roundup(dfork_min, 8); + bmdr_minsz = roundup(bmdr_minsz, 8); + + ASSERT(dfork_min <= lit_sz); + ASSERT(afork_min <= lit_sz); + + /* + * If the data fork was zapped and we don't have enough space for the + * recovery fork, move the attr fork up. + */ + if (dip->di_format == XFS_DINODE_FMT_EXTENTS && + xfs_dfork_data_extents(dip) == 0 && + (ri->data_extents > 0 || ri->rt_extents > 0) && + bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) { + if (bmdr_minsz + afork_min > lit_sz) { + /* + * The attr for and the stub fork we need to recover + * the data fork won't both fit. Zap the attr fork. + */ + xrep_dinode_zap_afork(ri, dip, mode); + afork_min = bmdr_minsz; + } else { + void *before, *after; + + /* Otherwise, just slide the attr fork up. */ + before = XFS_DFORK_APTR(dip); + dip->di_forkoff = bmdr_minsz >> 3; + after = XFS_DFORK_APTR(dip); + memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp)); + } + } + + /* + * If the attr fork was zapped and we don't have enough space for the + * recovery fork, move the attr fork down. + */ + if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS && + xfs_dfork_attr_extents(dip) == 0 && + ri->attr_extents > 0 && + bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) { + if (dip->di_format == XFS_DINODE_FMT_BTREE) { + /* + * If the data fork is in btree format then we can't + * adjust forkoff because that runs the risk of + * violating the extents/btree format transition rules. + */ + } else if (bmdr_minsz + dfork_min > lit_sz) { + /* + * If we can't move the attr fork, too bad, we lose the + * attr fork and leak its blocks. + */ + xrep_dinode_zap_afork(ri, dip, mode); + } else { + /* + * Otherwise, just slide the attr fork down. The attr + * fork is empty, so we don't have any old contents to + * move here. + */ + dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3; + } + } +} + +/* + * Zap the data/attr forks if we spot anything that isn't going to pass the + * ifork verifiers or the ifork formatters, because we need to get the inode + * into good enough shape that the higher level repair functions can run. 
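di_forkoff is expressed in 8-byte units, which is why the minimum fork sizes computed above are rounded up to a multiple of 8 and the assignments shift by 3. A tiny stand-alone sketch of that arithmetic follows; the literal-area and fork sizes are invented, and the placement policy is simplified relative to the function above.

#include <stdio.h>

/* Round up to the 8-byte granularity used by di_forkoff. */
static unsigned int roundup8(unsigned int v)
{
	return (v + 7) & ~7U;
}

int main(void)
{
	unsigned int lit_sz = 336;			/* invented literal-area size */
	unsigned int dfork_min = roundup8(3 * 16);	/* three 16-byte extent records */
	unsigned int afork_min = roundup8(44);		/* invented shortform xattr bytes */
	unsigned int forkoff;

	if (dfork_min + afork_min > lit_sz) {
		printf("forks cannot both fit; one must be zapped\n");
		return 1;
	}

	/* di_forkoff counts 8-byte units from the start of the literal area. */
	forkoff = (lit_sz - afork_min) >> 3;
	printf("data fork gets %u bytes, attr fork %u, di_forkoff=%u\n",
			forkoff << 3, lit_sz - (forkoff << 3), forkoff);
	return 0;
}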
+ */ +STATIC void +xrep_dinode_zap_forks( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + xfs_extnum_t data_extents; + xfs_extnum_t attr_extents; + xfs_filblks_t nblocks; + uint16_t mode; + bool zap_datafork = false; + bool zap_attrfork = ri->zap_acls; + + trace_xrep_dinode_zap_forks(sc, dip); + + mode = be16_to_cpu(dip->di_mode); + + data_extents = xfs_dfork_data_extents(dip); + attr_extents = xfs_dfork_attr_extents(dip); + nblocks = be64_to_cpu(dip->di_nblocks); + + /* Inode counters don't make sense? */ + if (data_extents > nblocks) + zap_datafork = true; + if (attr_extents > nblocks) + zap_attrfork = true; + if (data_extents + attr_extents > nblocks) + zap_datafork = zap_attrfork = true; + + if (!zap_datafork) + zap_datafork = xrep_dinode_check_dfork(sc, dip, mode); + if (!zap_attrfork) + zap_attrfork = xrep_dinode_check_afork(sc, dip); + + /* Zap whatever's bad. */ + if (zap_attrfork) + xrep_dinode_zap_afork(ri, dip, mode); + if (zap_datafork) + xrep_dinode_zap_dfork(ri, dip, mode); + xrep_dinode_ensure_forkoff(ri, dip, mode); + + /* + * Zero di_nblocks if we don't have any extents at all to satisfy the + * buffer verifier. + */ + data_extents = xfs_dfork_data_extents(dip); + attr_extents = xfs_dfork_attr_extents(dip); + if (data_extents + attr_extents == 0) + dip->di_nblocks = 0; +} + +/* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */ +STATIC int +xrep_dinode_core( + struct xrep_inode *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_buf *bp; + struct xfs_dinode *dip; + xfs_ino_t ino = sc->sm->sm_ino; + int error; + int iget_error; + + /* Figure out what this inode had mapped in both forks. */ + error = xrep_dinode_count_rmaps(ri); + if (error) + return error; + + /* Read the inode cluster buffer. */ + error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, + ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, + NULL); + if (error) + return error; + + /* Make sure we can pass the inode buffer verifier. */ + xrep_dinode_buf(sc, bp); + bp->b_ops = &xfs_inode_buf_ops; + + /* Fix everything the verifier will complain about. */ + dip = xfs_buf_offset(bp, ri->imap.im_boffset); + xrep_dinode_header(sc, dip); + xrep_dinode_mode(ri, dip); + xrep_dinode_flags(sc, dip, ri->rt_extents > 0); + xrep_dinode_size(ri, dip); + xrep_dinode_extsize_hints(sc, dip); + xrep_dinode_zap_forks(ri, dip); + + /* Write out the inode. */ + trace_xrep_dinode_fixed(sc, dip); + xfs_dinode_calc_crc(sc->mp, dip); + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF); + xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset, + ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1); + + /* + * In theory, we've fixed the ondisk inode record enough that we should + * be able to load the inode into the cache. Try to iget that inode + * now while we hold the AGI and the inode cluster buffer and take the + * IOLOCK so that we can continue with repairs without anyone else + * accessing the inode. If iget fails, we still need to commit the + * changes. + */ + iget_error = xchk_iget(sc, ino, &sc->ip); + if (!iget_error) + xchk_ilock(sc, XFS_IOLOCK_EXCL); + + /* + * Commit the inode cluster buffer updates and drop the AGI buffer that + * we've been holding since scrub setup. From here on out, repairs + * deal only with the cached inode. 
+ */ + error = xrep_trans_commit(sc); + if (error) + return error; + + if (iget_error) + return iget_error; + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + + error = xrep_ino_dqattach(sc); + if (error) + return error; + + xchk_ilock(sc, XFS_ILOCK_EXCL); + if (ri->ino_sick_mask) + xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask); + return 0; +} + +/* Fix everything xfs_dinode_verify cares about. */ +STATIC int +xrep_dinode_problems( + struct xrep_inode *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + error = xrep_dinode_core(ri); + if (error) + return error; + + /* We had to fix a totally busted inode, schedule quotacheck. */ + if (XFS_IS_UQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_USER); + if (XFS_IS_GQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); + if (XFS_IS_PQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); + + return 0; +} + +/* + * Fix problems that the verifiers don't care about. In general these are + * errors that don't cause problems elsewhere in the kernel that we can easily + * detect, so we don't check them all that rigorously. + */ + +/* Make sure block and extent counts are ok. */ +STATIC int +xrep_inode_blockcounts( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp; + xfs_filblks_t count; + xfs_filblks_t acount; + xfs_extnum_t nextents; + int error; + + trace_xrep_inode_blockcounts(sc); + + /* Set data fork counters from the data fork mappings. */ + error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, + &nextents, &count); + if (error) + return error; + if (xfs_is_reflink_inode(sc->ip)) { + /* + * data fork blockcount can exceed physical storage if a user + * reflinks the same block over and over again. + */ + ; + } else if (XFS_IS_REALTIME_INODE(sc->ip)) { + if (count >= sc->mp->m_sb.sb_rblocks) + return -EFSCORRUPTED; + } else { + if (count >= sc->mp->m_sb.sb_dblocks) + return -EFSCORRUPTED; + } + error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents); + if (error) + return error; + sc->ip->i_df.if_nextents = nextents; + + /* Set attr fork counters from the attr fork mappings. */ + ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); + if (ifp) { + error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, + &nextents, &acount); + if (error) + return error; + if (count >= sc->mp->m_sb.sb_dblocks) + return -EFSCORRUPTED; + error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK, + nextents); + if (error) + return error; + ifp->if_nextents = nextents; + } else { + acount = 0; + } + + sc->ip->i_nblocks = count + acount; + return 0; +} + +/* Check for invalid uid/gid/prid. 
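Once the inode is back in the cache, the block-count repair above recomputes the per-fork block counts from the mappings and refuses counts that exceed the size of the backing device. A toy restatement of that sanity check, with all numbers invented:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sb_dblocks = 1048576;	/* data device size in blocks */
	uint64_t dblocks = 5000;	/* blocks mapped by the data fork */
	uint64_t ablocks = 12;		/* blocks mapped by the attr fork */

	/*
	 * A non-reflink file cannot map more blocks than the device holds;
	 * anything larger means the counters or the mappings are corrupt.
	 */
	if (dblocks >= sb_dblocks || ablocks >= sb_dblocks) {
		printf("corrupt: fork block count exceeds device size\n");
		return 1;
	}

	printf("repaired block count = %llu\n",
			(unsigned long long)(dblocks + ablocks));
	return 0;
}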
*/ +STATIC void +xrep_inode_ids( + struct xfs_scrub *sc) +{ + bool dirty = false; + + trace_xrep_inode_ids(sc); + + if (!uid_valid(VFS_I(sc->ip)->i_uid)) { + i_uid_write(VFS_I(sc->ip), 0); + dirty = true; + if (XFS_IS_UQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_USER); + } + + if (!gid_valid(VFS_I(sc->ip)->i_gid)) { + i_gid_write(VFS_I(sc->ip), 0); + dirty = true; + if (XFS_IS_GQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); + } + + if (sc->ip->i_projid == -1U) { + sc->ip->i_projid = 0; + dirty = true; + if (XFS_IS_PQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); + } + + /* strip setuid/setgid if we touched any of the ids */ + if (dirty) + VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID); +} + +static inline void +xrep_clamp_timestamp( + struct xfs_inode *ip, + struct timespec64 *ts) +{ + ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC); + *ts = timestamp_truncate(*ts, VFS_I(ip)); +} + +/* Nanosecond counters can't have more than 1 billion. */ +STATIC void +xrep_inode_timestamps( + struct xfs_inode *ip) +{ + struct timespec64 tstamp; + struct inode *inode = VFS_I(ip); + + tstamp = inode_get_atime(inode); + xrep_clamp_timestamp(ip, &tstamp); + inode_set_atime_to_ts(inode, tstamp); + + tstamp = inode_get_mtime(inode); + xrep_clamp_timestamp(ip, &tstamp); + inode_set_mtime_to_ts(inode, tstamp); + + tstamp = inode_get_ctime(inode); + xrep_clamp_timestamp(ip, &tstamp); + inode_set_ctime_to_ts(inode, tstamp); + + xrep_clamp_timestamp(ip, &ip->i_crtime); +} + +/* Fix inode flags that don't make sense together. */ +STATIC void +xrep_inode_flags( + struct xfs_scrub *sc) +{ + uint16_t mode; + + trace_xrep_inode_flags(sc); + + mode = VFS_I(sc->ip)->i_mode; + + /* Clear junk flags */ + if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY) + sc->ip->i_diflags &= ~XFS_DIFLAG_ANY; + + /* NEWRTBM only applies to realtime bitmaps */ + if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino) + sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM; + else + sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM; + + /* These only make sense for directories. */ + if (!S_ISDIR(mode)) + sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT | + XFS_DIFLAG_EXTSZINHERIT | + XFS_DIFLAG_PROJINHERIT | + XFS_DIFLAG_NOSYMLINKS); + + /* These only make sense for files. */ + if (!S_ISREG(mode)) + sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME | + XFS_DIFLAG_EXTSIZE); + + /* These only make sense for non-rt files. */ + if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) + sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM; + + /* Immutable and append only? Drop the append. */ + if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) && + (sc->ip->i_diflags & XFS_DIFLAG_APPEND)) + sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND; + + /* Clear junk flags. */ + if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY; + + /* No reflink flag unless we support it and it's a file. */ + if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode)) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + + /* DAX only applies to files and dirs. */ + if (!(S_ISREG(mode) || S_ISDIR(mode))) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX; + + /* No reflink files on the realtime device. */ + if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; +} + +/* + * Fix size problems with block/node format directories. If we fail to find + * the extent list, just bail out and let the bmapbtd repair functions clean + * up that mess. 
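The timestamp repair above clamps each timestamp's nanosecond field before truncating it to the filesystem's granularity. A minimal user-space analogue is sketched below; it clamps to NSEC_PER_SEC - 1, the largest representable nanosecond value.

#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC	1000000000L

/* Clamp a timestamp's nanosecond field into the valid range. */
static void clamp_nsec(struct timespec *ts)
{
	if (ts->tv_nsec < 0)
		ts->tv_nsec = 0;
	else if (ts->tv_nsec > NSEC_PER_SEC - 1)
		ts->tv_nsec = NSEC_PER_SEC - 1;
}

int main(void)
{
	struct timespec ts = { .tv_sec = 1700000000, .tv_nsec = -5 };

	clamp_nsec(&ts);
	printf("%ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}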
+ */ +STATIC void +xrep_inode_blockdir_size( + struct xfs_scrub *sc) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + xfs_fileoff_t off; + int error; + + trace_xrep_inode_blockdir_size(sc); + + error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK); + if (error) + return; + + /* Find the last block before 32G; this is the dir size. */ + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE); + if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) { + /* zero-extents directory? */ + return; + } + + off = got.br_startoff + got.br_blockcount; + sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE, + XFS_FSB_TO_B(sc->mp, off)); +} + +/* Fix size problems with short format directories. */ +STATIC void +xrep_inode_sfdir_size( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp; + + trace_xrep_inode_sfdir_size(sc); + + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + sc->ip->i_disk_size = ifp->if_bytes; +} + +/* + * Fix any irregularities in a directory inode's size now that we can iterate + * extent maps and access other regular inode data. + */ +STATIC void +xrep_inode_dir_size( + struct xfs_scrub *sc) +{ + trace_xrep_inode_dir_size(sc); + + switch (sc->ip->i_df.if_format) { + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + xrep_inode_blockdir_size(sc); + break; + case XFS_DINODE_FMT_LOCAL: + xrep_inode_sfdir_size(sc); + break; + } +} + +/* Fix extent size hint problems. */ +STATIC void +xrep_inode_extsize( + struct xfs_scrub *sc) +{ + /* Fix misaligned extent size hints on a directory. */ + if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) { + sc->ip->i_extsize = 0; + sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT; + } +} + +/* Fix any irregularities in an inode that the verifiers don't catch. */ +STATIC int +xrep_inode_problems( + struct xfs_scrub *sc) +{ + int error; + + error = xrep_inode_blockcounts(sc); + if (error) + return error; + xrep_inode_timestamps(sc->ip); + xrep_inode_flags(sc); + xrep_inode_ids(sc); + /* + * We can now do a better job fixing the size of a directory now that + * we can scan the data fork extents than we could in xrep_dinode_size. + */ + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) + xrep_inode_dir_size(sc); + xrep_inode_extsize(sc); + + trace_xrep_inode_fixed(sc); + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return xrep_roll_trans(sc); +} + +/* Repair an inode's fields. */ +int +xrep_inode( + struct xfs_scrub *sc) +{ + int error = 0; + + /* + * No inode? That means we failed the _iget verifiers. Repair all + * the things that the inode verifiers care about, then retry _iget. + */ + if (!sc->ip) { + struct xrep_inode *ri = sc->buf; + + ASSERT(ri != NULL); + + error = xrep_dinode_problems(ri); + if (error) + return error; + + /* By this point we had better have a working incore inode. */ + if (!sc->ip) + return -EFSCORRUPTED; + } + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* If we found corruption of any kind, try to fix it. */ + if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) { + error = xrep_inode_problems(sc); + if (error) + return error; + } + + /* See if we can clear the reflink flag. 
*/ + if (xfs_is_reflink_inode(sc->ip)) { + error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp); + if (error) + return error; + } + + return xrep_defer_finish(sc); +} diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c new file mode 100644 index 000000000000..bb6d980b4fcd --- /dev/null +++ b/fs/xfs/scrub/newbt.c @@ -0,0 +1,559 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "xfs_ag.h" +#include "xfs_defer.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/newbt.h" + +/* + * Estimate proper slack values for a btree that's being reloaded. + * + * Under most circumstances, we'll take whatever default loading value the + * btree bulk loading code calculates for us. However, there are some + * exceptions to this rule: + * + * (0) If someone turned one of the debug knobs. + * (1) If this is a per-AG btree and the AG has less than 10% space free. + * (2) If this is an inode btree and the FS has less than 10% space free. + + * In either case, format the new btree blocks almost completely full to + * minimize space usage. + */ +static void +xrep_newbt_estimate_slack( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_btree_bload *bload = &xnr->bload; + uint64_t free; + uint64_t sz; + + /* + * The xfs_globals values are set to -1 (i.e. take the bload defaults) + * unless someone has set them otherwise, so we just pull the values + * here. + */ + bload->leaf_slack = xfs_globals.bload_leaf_slack; + bload->node_slack = xfs_globals.bload_node_slack; + + if (sc->ops->type == ST_PERAG) { + free = sc->sa.pag->pagf_freeblks; + sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno); + } else { + free = percpu_counter_sum(&sc->mp->m_fdblocks); + sz = sc->mp->m_sb.sb_dblocks; + } + + /* No further changes if there's more than 10% free space left. */ + if (free >= div_u64(sz, 10)) + return; + + /* + * We're low on space; load the btrees as tightly as possible. Leave + * a couple of open slots in each btree block so that we don't end up + * splitting the btrees like crazy after a mount. + */ + if (bload->leaf_slack < 0) + bload->leaf_slack = 2; + if (bload->node_slack < 0) + bload->node_slack = 2; +} + +/* Initialize accounting resources for staging a new AG btree. */ +void +xrep_newbt_init_ag( + struct xrep_newbt *xnr, + struct xfs_scrub *sc, + const struct xfs_owner_info *oinfo, + xfs_fsblock_t alloc_hint, + enum xfs_ag_resv_type resv) +{ + memset(xnr, 0, sizeof(struct xrep_newbt)); + xnr->sc = sc; + xnr->oinfo = *oinfo; /* structure copy */ + xnr->alloc_hint = alloc_hint; + xnr->resv = resv; + INIT_LIST_HEAD(&xnr->resv_list); + xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */ + xrep_newbt_estimate_slack(xnr); +} + +/* Initialize accounting resources for staging a new inode fork btree. 
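The slack heuristic above only overrides the bulk-loader defaults once free space drops below 10% of the AG or filesystem, at which point it leaves just two open slots per block. A toy restatement of that decision, with invented sizes:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sz = 1000000;		/* AG or filesystem size in blocks */
	uint64_t free = 60000;		/* free blocks remaining */
	int leaf_slack = -1;		/* -1 means "use the bload default" */
	int node_slack = -1;

	/* Pack blocks almost full when space is tight. */
	if (free < sz / 10) {
		if (leaf_slack < 0)
			leaf_slack = 2;
		if (node_slack < 0)
			node_slack = 2;
	}

	printf("leaf_slack=%d node_slack=%d\n", leaf_slack, node_slack);
	return 0;
}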
*/ +int +xrep_newbt_init_inode( + struct xrep_newbt *xnr, + struct xfs_scrub *sc, + int whichfork, + const struct xfs_owner_info *oinfo) +{ + struct xfs_ifork *ifp; + + ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS); + if (!ifp) + return -ENOMEM; + + xrep_newbt_init_ag(xnr, sc, oinfo, + XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino), + XFS_AG_RESV_NONE); + xnr->ifake.if_fork = ifp; + xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork); + return 0; +} + +/* + * Initialize accounting resources for staging a new btree. Callers are + * expected to add their own reservations (and clean them up) manually. + */ +void +xrep_newbt_init_bare( + struct xrep_newbt *xnr, + struct xfs_scrub *sc) +{ + xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK, + XFS_AG_RESV_NONE); +} + +/* + * Designate specific blocks to be used to build our new btree. @pag must be + * a passive reference. + */ +STATIC int +xrep_newbt_add_blocks( + struct xrep_newbt *xnr, + struct xfs_perag *pag, + const struct xfs_alloc_arg *args) +{ + struct xfs_mount *mp = xnr->sc->mp; + struct xrep_newbt_resv *resv; + int error; + + resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS); + if (!resv) + return -ENOMEM; + + INIT_LIST_HEAD(&resv->list); + resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + resv->len = args->len; + resv->used = 0; + resv->pag = xfs_perag_hold(pag); + + if (args->tp) { + ASSERT(xnr->oinfo.oi_offset == 0); + + error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap); + if (error) + goto out_pag; + } + + list_add_tail(&resv->list, &xnr->resv_list); + return 0; +out_pag: + xfs_perag_put(resv->pag); + kfree(resv); + return error; +} + +/* + * Add an extent to the new btree reservation pool. Callers are required to + * reap this reservation manually if the repair is cancelled. @pag must be a + * passive reference. + */ +int +xrep_newbt_add_extent( + struct xrep_newbt *xnr, + struct xfs_perag *pag, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_mount *mp = xnr->sc->mp; + struct xfs_alloc_arg args = { + .tp = NULL, /* no autoreap */ + .oinfo = xnr->oinfo, + .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno), + .len = len, + .resv = xnr->resv, + }; + + return xrep_newbt_add_blocks(xnr, pag, &args); +} + +/* Don't let our allocation hint take us beyond this AG */ +static inline void +xrep_newbt_validate_ag_alloc_hint( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint); + + if (agno == sc->sa.pag->pag_agno && + xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) + return; + + xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_AGFL_BLOCK(sc->mp) + 1); +} + +/* Allocate disk space for a new per-AG btree. 
*/ +STATIC int +xrep_newbt_alloc_ag_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_mount *mp = sc->mp; + int error = 0; + + ASSERT(sc->sa.pag != NULL); + + while (nr_blocks > 0) { + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = mp, + .oinfo = xnr->oinfo, + .minlen = 1, + .maxlen = nr_blocks, + .prod = 1, + .resv = xnr->resv, + }; + xfs_agnumber_t agno; + + xrep_newbt_validate_ag_alloc_hint(xnr); + + error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + agno = XFS_FSB_TO_AGNO(mp, args.fsbno); + + trace_xrep_newbt_alloc_ag_blocks(mp, agno, + XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, + xnr->oinfo.oi_owner); + + if (agno != sc->sa.pag->pag_agno) { + ASSERT(agno == sc->sa.pag->pag_agno); + return -EFSCORRUPTED; + } + + error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args); + if (error) + return error; + + nr_blocks -= args.len; + xnr->alloc_hint = args.fsbno + args.len; + + error = xrep_defer_finish(sc); + if (error) + return error; + } + + return 0; +} + +/* Don't let our allocation hint take us beyond EOFS */ +static inline void +xrep_newbt_validate_file_alloc_hint( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + + if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) + return; + + xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1); +} + +/* Allocate disk space for our new file-based btree. */ +STATIC int +xrep_newbt_alloc_file_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_mount *mp = sc->mp; + int error = 0; + + while (nr_blocks > 0) { + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = mp, + .oinfo = xnr->oinfo, + .minlen = 1, + .maxlen = nr_blocks, + .prod = 1, + .resv = xnr->resv, + }; + struct xfs_perag *pag; + xfs_agnumber_t agno; + + xrep_newbt_validate_file_alloc_hint(xnr); + + error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + agno = XFS_FSB_TO_AGNO(mp, args.fsbno); + + trace_xrep_newbt_alloc_file_blocks(mp, agno, + XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, + xnr->oinfo.oi_owner); + + pag = xfs_perag_get(mp, agno); + if (!pag) { + ASSERT(0); + return -EFSCORRUPTED; + } + + error = xrep_newbt_add_blocks(xnr, pag, &args); + xfs_perag_put(pag); + if (error) + return error; + + nr_blocks -= args.len; + xnr->alloc_hint = args.fsbno + args.len; + + error = xrep_defer_finish(sc); + if (error) + return error; + } + + return 0; +} + +/* Allocate disk space for our new btree. */ +int +xrep_newbt_alloc_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + if (xnr->sc->ip) + return xrep_newbt_alloc_file_blocks(xnr, nr_blocks); + return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks); +} + +/* + * Free the unused part of a space extent that was reserved for a new ondisk + * structure. Returns the number of EFIs logged or a negative errno. + */ +STATIC int +xrep_newbt_free_extent( + struct xrep_newbt *xnr, + struct xrep_newbt_resv *resv, + bool btree_committed) +{ + struct xfs_scrub *sc = xnr->sc; + xfs_agblock_t free_agbno = resv->agbno; + xfs_extlen_t free_aglen = resv->len; + xfs_fsblock_t fsbno; + int error; + + if (!btree_committed || resv->used == 0) { + /* + * If we're not committing a new btree or we didn't use the + * space reservation, let the existing EFI free the entire + * space extent. 
+ */ + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, + free_agbno, free_aglen, xnr->oinfo.oi_owner); + xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); + return 1; + } + + /* + * We used space and committed the btree. Cancel the autoreap, remove + * the written blocks from the reservation, and possibly log a new EFI + * to free any unused reservation space. + */ + xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap); + free_agbno += resv->used; + free_aglen -= resv->used; + + if (free_aglen == 0) + return 0; + + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, + free_aglen, xnr->oinfo.oi_owner); + + ASSERT(xnr->resv != XFS_AG_RESV_AGFL); + ASSERT(xnr->resv != XFS_AG_RESV_IGNORE); + + /* + * Use EFIs to free the reservations. This reduces the chance + * that we leak blocks if the system goes down. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno); + error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo, + xnr->resv, true); + if (error) + return error; + + return 1; +} + +/* Free all the accounting info and disk space we reserved for a new btree. */ +STATIC int +xrep_newbt_free( + struct xrep_newbt *xnr, + bool btree_committed) +{ + struct xfs_scrub *sc = xnr->sc; + struct xrep_newbt_resv *resv, *n; + unsigned int freed = 0; + int error = 0; + + /* + * If the filesystem already went down, we can't free the blocks. Skip + * ahead to freeing the incore metadata because we can't fix anything. + */ + if (xfs_is_shutdown(sc->mp)) + goto junkit; + + list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { + int ret; + + ret = xrep_newbt_free_extent(xnr, resv, btree_committed); + list_del(&resv->list); + xfs_perag_put(resv->pag); + kfree(resv); + if (ret < 0) { + error = ret; + goto junkit; + } + + freed += ret; + if (freed >= XREP_MAX_ITRUNCATE_EFIS) { + error = xrep_defer_finish(sc); + if (error) + goto junkit; + freed = 0; + } + } + + if (freed) + error = xrep_defer_finish(sc); + +junkit: + /* + * If we still have reservations attached to @newbt, cleanup must have + * failed and the filesystem is about to go down. Clean up the incore + * reservations and try to commit to freeing the space we used. + */ + list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { + xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); + list_del(&resv->list); + xfs_perag_put(resv->pag); + kfree(resv); + } + + if (sc->ip) { + kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork); + xnr->ifake.if_fork = NULL; + } + + return error; +} + +/* + * Free all the accounting info and unused disk space allocations after + * committing a new btree. + */ +int +xrep_newbt_commit( + struct xrep_newbt *xnr) +{ + return xrep_newbt_free(xnr, true); +} + +/* + * Free all the accounting info and all of the disk space we reserved for a new + * btree that we're not going to commit. We want to try to roll things back + * cleanly for things like ENOSPC midway through allocation. + */ +void +xrep_newbt_cancel( + struct xrep_newbt *xnr) +{ + xrep_newbt_free(xnr, false); +} + +/* Feed one of the reserved btree blocks to the bulk loader. */ +int +xrep_newbt_claim_block( + struct xfs_btree_cur *cur, + struct xrep_newbt *xnr, + union xfs_btree_ptr *ptr) +{ + struct xrep_newbt_resv *resv; + struct xfs_mount *mp = cur->bc_mp; + xfs_agblock_t agbno; + + /* + * The first item in the list should always have a free block unless + * we're completely out. 
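The claim helper that follows peels single blocks off the front of the reservation list in order. A simplified user-space model of that bookkeeping is sketched here; the kernel rotates exhausted reservations to the tail of the list, whereas this sketch simply rescans from the start.

#include <stdio.h>

/* Toy reservation: a contiguous run of blocks, partially consumed. */
struct resv {
	unsigned int agbno;	/* first block of the reservation */
	unsigned int len;	/* blocks reserved */
	unsigned int used;	/* blocks already handed out */
};

/* Hand out the next unused block from the reservation list. */
static int claim_block(struct resv *r, unsigned int nr, unsigned int *agbno)
{
	for (unsigned int i = 0; i < nr; i++) {
		if (r[i].used < r[i].len) {
			*agbno = r[i].agbno + r[i].used++;
			return 0;
		}
	}
	return -1;	/* out of reserved space */
}

int main(void)
{
	struct resv resvs[] = { { 100, 2, 0 }, { 500, 3, 0 } };
	unsigned int agbno;

	while (claim_block(resvs, 2, &agbno) == 0)
		printf("claimed block %u\n", agbno);
	return 0;
}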
+ */ + resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list); + if (resv->used == resv->len) + return -ENOSPC; + + /* + * Peel off a block from the start of the reservation. We allocate + * blocks in order to place blocks on disk in increasing record or key + * order. The block reservations tend to end up on the list in + * decreasing order, which hopefully results in leaf blocks ending up + * together. + */ + agbno = resv->agbno + resv->used; + resv->used++; + + /* If we used all the blocks in this reservation, move it to the end. */ + if (resv->used == resv->len) + list_move_tail(&resv->list, &xnr->resv_list); + + trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1, + xnr->oinfo.oi_owner); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno, + agbno)); + else + ptr->s = cpu_to_be32(agbno); + + /* Relog all the EFIs. */ + return xrep_defer_finish(xnr->sc); +} + +/* How many reserved blocks are unused? */ +unsigned int +xrep_newbt_unused_blocks( + struct xrep_newbt *xnr) +{ + struct xrep_newbt_resv *resv; + unsigned int unused = 0; + + list_for_each_entry(resv, &xnr->resv_list, list) + unused += resv->len - resv->used; + return unused; +} diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h new file mode 100644 index 000000000000..89f8e3970b1f --- /dev/null +++ b/fs/xfs/scrub/newbt.h @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_NEWBT_H__ +#define __XFS_SCRUB_NEWBT_H__ + +struct xrep_newbt_resv { + /* Link to list of extents that we've reserved. */ + struct list_head list; + + struct xfs_perag *pag; + + /* Auto-freeing this reservation if we don't commit. */ + struct xfs_alloc_autoreap autoreap; + + /* AG block of the extent we reserved. */ + xfs_agblock_t agbno; + + /* Length of the reservation. */ + xfs_extlen_t len; + + /* How much of this reservation has been used. */ + xfs_extlen_t used; +}; + +struct xrep_newbt { + struct xfs_scrub *sc; + + /* List of extents that we've reserved. */ + struct list_head resv_list; + + /* Fake root for new btree. 
*/ + union { + struct xbtree_afakeroot afake; + struct xbtree_ifakeroot ifake; + }; + + /* rmap owner of these blocks */ + struct xfs_owner_info oinfo; + + /* btree geometry for the bulk loader */ + struct xfs_btree_bload bload; + + /* Allocation hint */ + xfs_fsblock_t alloc_hint; + + /* per-ag reservation type */ + enum xfs_ag_resv_type resv; +}; + +void xrep_newbt_init_bare(struct xrep_newbt *xnr, struct xfs_scrub *sc); +void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc, + const struct xfs_owner_info *oinfo, xfs_fsblock_t alloc_hint, + enum xfs_ag_resv_type resv); +int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc, + int whichfork, const struct xfs_owner_info *oinfo); +int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks); +int xrep_newbt_add_extent(struct xrep_newbt *xnr, struct xfs_perag *pag, + xfs_agblock_t agbno, xfs_extlen_t len); +void xrep_newbt_cancel(struct xrep_newbt *xnr); +int xrep_newbt_commit(struct xrep_newbt *xnr); +int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xnr, + union xfs_btree_ptr *ptr); +unsigned int xrep_newbt_unused_blocks(struct xrep_newbt *xnr); + +#endif /* __XFS_SCRUB_NEWBT_H__ */ diff --git a/fs/xfs/scrub/off_bitmap.h b/fs/xfs/scrub/off_bitmap.h new file mode 100644 index 000000000000..0d3f9e6c1aad --- /dev/null +++ b/fs/xfs/scrub/off_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_OFF_BITMAP_H__ +#define __XFS_SCRUB_OFF_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_fileoff_t */ + +struct xoff_bitmap { + struct xbitmap64 offbitmap; +}; + +static inline void xoff_bitmap_init(struct xoff_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->offbitmap); +} + +static inline void xoff_bitmap_destroy(struct xoff_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->offbitmap); +} + +static inline int xoff_bitmap_set(struct xoff_bitmap *bitmap, + xfs_fileoff_t off, xfs_filblks_t len) +{ + return xbitmap64_set(&bitmap->offbitmap, off, len); +} + +static inline int xoff_bitmap_walk(struct xoff_bitmap *bitmap, + xbitmap64_walk_fn fn, void *priv) +{ + return xbitmap64_walk(&bitmap->offbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_OFF_BITMAP_H__ */ diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index e6155d86f791..7db873672146 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -156,6 +156,16 @@ xchk_parent_validate( goto out_rele; } + /* + * We cannot yet validate this parent pointer if the directory looks as + * though it has been zapped by the inode record repair code. + */ + if (xchk_dir_looks_zapped(dp)) { + error = -EBUSY; + xchk_set_incomplete(sc); + goto out_unlock; + } + /* Look for a directory entry in the parent pointing to the child. */ error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc); if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) @@ -217,6 +227,13 @@ xchk_parent( */ error = xchk_parent_validate(sc, parent_ino); } while (error == -EAGAIN); + if (error == -EBUSY) { + /* + * We could not scan a directory, so we marked the check + * incomplete. No further error return is necessary. 
+ */ + return 0; + } return error; } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 5671c8153433..183d531875ea 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -6,6 +6,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_bit.h" #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" @@ -17,9 +18,10 @@ #include "xfs_bmap.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/quota.h" /* Convert a scrub type code to a DQ flag, or return 0 if error. */ -static inline xfs_dqtype_t +xfs_dqtype_t xchk_quota_to_dqtype( struct xfs_scrub *sc) { @@ -75,14 +77,70 @@ struct xchk_quota_info { xfs_dqid_t last_id; }; +/* There's a written block backing this dquot, right? */ +STATIC int +xchk_quota_item_bmap( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + xfs_fileoff_t offset) +{ + struct xfs_bmbt_irec irec; + struct xfs_mount *mp = sc->mp; + int nmaps = 1; + int error; + + if (!xfs_verify_fileoff(mp, offset)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + if (dq->q_fileoffset != offset) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0); + if (error) + return error; + + if (nmaps != 1) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + if (!xfs_verify_fsbno(mp, irec.br_startblock)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + if (XFS_FSB_TO_DADDR(mp, irec.br_startblock) != dq->q_blkno) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + if (!xfs_bmap_is_written_extent(&irec)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + return 0; +} + +/* Complain if a quota timer is incorrectly set. */ +static inline void +xchk_quota_item_timer( + struct xfs_scrub *sc, + xfs_fileoff_t offset, + const struct xfs_dquot_res *res) +{ + if ((res->softlimit && res->count > res->softlimit) || + (res->hardlimit && res->count > res->hardlimit)) { + if (!res->timer) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + } else { + if (res->timer) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + } +} + /* Scrub the fields in an individual quota item. */ STATIC int xchk_quota_item( - struct xfs_dquot *dq, - xfs_dqtype_t dqtype, - void *priv) + struct xchk_quota_info *sqi, + struct xfs_dquot *dq) { - struct xchk_quota_info *sqi = priv; struct xfs_scrub *sc = sqi->sc; struct xfs_mount *mp = sc->mp; struct xfs_quotainfo *qi = mp->m_quotainfo; @@ -94,6 +152,17 @@ xchk_quota_item( return error; /* + * We want to validate the bmap record for the storage backing this + * dquot, so we need to lock the dquot and the quota file. For quota + * operations, the locking order is first the ILOCK and then the dquot. + * However, dqiterate gave us a locked dquot, so drop the dquot lock to + * get the ILOCK. + */ + xfs_dqunlock(dq); + xchk_ilock(sc, XFS_ILOCK_SHARED); + xfs_dqlock(dq); + + /* * Except for the root dquot, the actual dquot we got must either have * the same or higher id as we saw before. */ @@ -103,6 +172,11 @@ xchk_quota_item( sqi->last_id = dq->q_id; + error = xchk_quota_item_bmap(sc, dq, offset); + xchk_iunlock(sc, XFS_ILOCK_SHARED); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error)) + return error; + /* * Warn if the hard limits are larger than the fs. 
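The locking shuffle added above is easy to get backwards, so it is worth isolating: the quota-file ILOCK ranks ahead of the dquot lock, but the iterator returns the dquot already locked, hence the drop-and-retake sequence. Condensed from the hunk above (an excerpt for illustration, not a standalone program):

	/* dqiterate handed us a locked dquot, but the ILOCK must come first */
	xfs_dqunlock(dq);
	xchk_ilock(sc, XFS_ILOCK_SHARED);
	xfs_dqlock(dq);

	error = xchk_quota_item_bmap(sc, dq, offset);
	xchk_iunlock(sc, XFS_ILOCK_SHARED);
	if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error))
		return error;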
* Administrators can do this, though in production this seems @@ -166,6 +240,10 @@ xchk_quota_item( dq->q_rtb.count > dq->q_rtb.hardlimit) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + xchk_quota_item_timer(sc, offset, &dq->q_blk); + xchk_quota_item_timer(sc, offset, &dq->q_ino); + xchk_quota_item_timer(sc, offset, &dq->q_rtb); + out: if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return -ECANCELED; @@ -191,7 +269,7 @@ xchk_quota_data_fork( return error; /* Check for data fork problems that apply only to quota files. */ - max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk; + max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk; ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error)) @@ -218,9 +296,11 @@ int xchk_quota( struct xfs_scrub *sc) { - struct xchk_quota_info sqi; + struct xchk_dqiter cursor = { }; + struct xchk_quota_info sqi = { .sc = sc }; struct xfs_mount *mp = sc->mp; struct xfs_quotainfo *qi = mp->m_quotainfo; + struct xfs_dquot *dq; xfs_dqtype_t dqtype; int error = 0; @@ -239,10 +319,15 @@ xchk_quota( * functions. */ xchk_iunlock(sc, sc->ilock_flags); - sqi.sc = sc; - sqi.last_id = 0; - error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi); - xchk_ilock(sc, XFS_ILOCK_EXCL); + + /* Now look for things that the quota verifiers won't complain about. */ + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xchk_quota_item(&sqi, dq); + xfs_qm_dqput(dq); + if (error) + break; + } if (error == -ECANCELED) error = 0; if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, diff --git a/fs/xfs/scrub/quota.h b/fs/xfs/scrub/quota.h new file mode 100644 index 000000000000..6c7134ce2385 --- /dev/null +++ b/fs/xfs/scrub/quota.h @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_QUOTA_H__ +#define __XFS_SCRUB_QUOTA_H__ + +xfs_dqtype_t xchk_quota_to_dqtype(struct xfs_scrub *sc); + +/* dquot iteration code */ + +struct xchk_dqiter { + struct xfs_scrub *sc; + + /* Quota file that we're walking. */ + struct xfs_inode *quota_ip; + + /* Cached data fork mapping for the dquot. */ + struct xfs_bmbt_irec bmap; + + /* The next dquot to scan. */ + uint64_t id; + + /* Quota type (user/group/project). */ + xfs_dqtype_t dqtype; + + /* Data fork sequence number to detect stale mappings. */ + unsigned int if_seq; +}; + +void xchk_dqiter_init(struct xchk_dqiter *cursor, struct xfs_scrub *sc, + xfs_dqtype_t dqtype); +int xchk_dquot_iter(struct xchk_dqiter *cursor, struct xfs_dquot **dqpp); + +#endif /* __XFS_SCRUB_QUOTA_H__ */ diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c new file mode 100644 index 000000000000..0bab4c30cb85 --- /dev/null +++ b/fs/xfs/scrub/quota_repair.c @@ -0,0 +1,575 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_dquot.h" +#include "xfs_dquot_item.h" +#include "xfs_reflink.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/quota.h" +#include "scrub/trace.h" +#include "scrub/repair.h" + +/* + * Quota Repair + * ============ + * + * Quota repairs are fairly simplistic; we fix everything that the dquot + * verifiers complain about, cap any counters or limits that make no sense, + * and schedule a quotacheck if we had to fix anything. We also repair any + * data fork extent records that don't apply to metadata files. + */ + +struct xrep_quota_info { + struct xfs_scrub *sc; + bool need_quotacheck; +}; + +/* + * Allocate a new block into a sparse hole in the quota file backing this + * dquot, initialize the block, and commit the whole mess. + */ +STATIC int +xrep_quota_item_fill_bmap_hole( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + struct xfs_bmbt_irec *irec) +{ + struct xfs_buf *bp; + struct xfs_mount *mp = sc->mp; + int nmaps = 1; + int error; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Map a block into the file. */ + error = xfs_trans_reserve_more(sc->tp, XFS_QM_DQALLOC_SPACE_RES(mp), + 0); + if (error) + return error; + + error = xfs_bmapi_write(sc->tp, sc->ip, dq->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, + irec, &nmaps); + if (error) + return error; + if (nmaps != 1) + return -ENOSPC; + + dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec->br_startblock); + + trace_xrep_dquot_item_fill_bmap_hole(sc->mp, dq->q_type, dq->q_id); + + /* Initialize the new block. */ + error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp, dq->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp); + if (error) + return error; + bp->b_ops = &xfs_dquot_buf_ops; + + xfs_qm_init_dquot_blk(sc->tp, dq->q_id, dq->q_type, bp); + xfs_buf_set_ref(bp, XFS_DQUOT_REF); + + /* + * Finish the mapping transactions and roll one more time to + * disconnect sc->ip from sc->tp. + */ + error = xrep_defer_finish(sc); + if (error) + return error; + return xfs_trans_roll(&sc->tp); +} + +/* Make sure there's a written block backing this dquot */ +STATIC int +xrep_quota_item_bmap( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + bool *dirty) +{ + struct xfs_bmbt_irec irec; + struct xfs_mount *mp = sc->mp; + struct xfs_quotainfo *qi = mp->m_quotainfo; + xfs_fileoff_t offset = dq->q_id / qi->qi_dqperchunk; + int nmaps = 1; + int error; + + /* The computed file offset should always be valid. */ + if (!xfs_verify_fileoff(mp, offset)) { + ASSERT(xfs_verify_fileoff(mp, offset)); + return -EFSCORRUPTED; + } + dq->q_fileoffset = offset; + + error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0); + if (error) + return error; + + if (nmaps < 1 || !xfs_bmap_is_real_extent(&irec)) { + /* Hole/delalloc extent; allocate a real block. */ + error = xrep_quota_item_fill_bmap_hole(sc, dq, &irec); + if (error) + return error; + } else if (irec.br_state != XFS_EXT_NORM) { + /* Unwritten extent, which we already took care of? 
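A quick worked example of the offset arithmetic used just above: each block of the quota file holds qi_dqperchunk dquots, so dquot id N lives at data-fork offset N / qi_dqperchunk. The self-contained model below assumes 30 dquots per block purely for illustration; the real value is derived at mount time from the block size and dquot record size.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t dqperchunk = 30;		/* illustrative, not a kernel constant */
	uint64_t id = 12345;

	uint64_t fileoff = id / dqperchunk;		/* quota file block holding this id */
	uint64_t first_id = fileoff * dqperchunk;	/* first dquot id in that block */

	printf("dquot %llu -> quota file block %llu (ids %llu..%llu)\n",
	       (unsigned long long)id, (unsigned long long)fileoff,
	       (unsigned long long)first_id,
	       (unsigned long long)(first_id + dqperchunk - 1));
	return 0;
}

With these numbers the program reports block 411 holding ids 12330..12359, which is exactly the mapping xrep_quota_item_bmap must be able to read, or fill in, before the dquot can be repaired.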
*/ + ASSERT(irec.br_state == XFS_EXT_NORM); + return -EFSCORRUPTED; + } else if (dq->q_blkno != XFS_FSB_TO_DADDR(mp, irec.br_startblock)) { + /* + * If the cached daddr is incorrect, repair probably punched a + * hole out of the quota file and filled it back in with a new + * block. Update the block mapping in the dquot. + */ + dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec.br_startblock); + } + + *dirty = true; + return 0; +} + +/* Reset quota timers if incorrectly set. */ +static inline void +xrep_quota_item_timer( + struct xfs_scrub *sc, + const struct xfs_dquot_res *res, + bool *dirty) +{ + if ((res->softlimit && res->count > res->softlimit) || + (res->hardlimit && res->count > res->hardlimit)) { + if (!res->timer) + *dirty = true; + } else { + if (res->timer) + *dirty = true; + } +} + +/* Scrub the fields in an individual quota item. */ +STATIC int +xrep_quota_item( + struct xrep_quota_info *rqi, + struct xfs_dquot *dq) +{ + struct xfs_scrub *sc = rqi->sc; + struct xfs_mount *mp = sc->mp; + xfs_ino_t fs_icount; + bool dirty = false; + int error = 0; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * We might need to fix holes in the bmap record for the storage + * backing this dquot, so we need to lock the dquot and the quota file. + * dqiterate gave us a locked dquot, so drop the dquot lock to get the + * ILOCK_EXCL. + */ + xfs_dqunlock(dq); + xchk_ilock(sc, XFS_ILOCK_EXCL); + xfs_dqlock(dq); + + error = xrep_quota_item_bmap(sc, dq, &dirty); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + if (error) + return error; + + /* Check the limits. */ + if (dq->q_blk.softlimit > dq->q_blk.hardlimit) { + dq->q_blk.softlimit = dq->q_blk.hardlimit; + dirty = true; + } + + if (dq->q_ino.softlimit > dq->q_ino.hardlimit) { + dq->q_ino.softlimit = dq->q_ino.hardlimit; + dirty = true; + } + + if (dq->q_rtb.softlimit > dq->q_rtb.hardlimit) { + dq->q_rtb.softlimit = dq->q_rtb.hardlimit; + dirty = true; + } + + /* + * Check that usage doesn't exceed physical limits. However, on + * a reflink filesystem we're allowed to exceed physical space + * if there are no quota limits. We don't know what the real number + * is, but we can make quotacheck find out for us. + */ + if (!xfs_has_reflink(mp) && dq->q_blk.count > mp->m_sb.sb_dblocks) { + dq->q_blk.reserved -= dq->q_blk.count; + dq->q_blk.reserved += mp->m_sb.sb_dblocks; + dq->q_blk.count = mp->m_sb.sb_dblocks; + rqi->need_quotacheck = true; + dirty = true; + } + fs_icount = percpu_counter_sum(&mp->m_icount); + if (dq->q_ino.count > fs_icount) { + dq->q_ino.reserved -= dq->q_ino.count; + dq->q_ino.reserved += fs_icount; + dq->q_ino.count = fs_icount; + rqi->need_quotacheck = true; + dirty = true; + } + if (dq->q_rtb.count > mp->m_sb.sb_rblocks) { + dq->q_rtb.reserved -= dq->q_rtb.count; + dq->q_rtb.reserved += mp->m_sb.sb_rblocks; + dq->q_rtb.count = mp->m_sb.sb_rblocks; + rqi->need_quotacheck = true; + dirty = true; + } + + xrep_quota_item_timer(sc, &dq->q_blk, &dirty); + xrep_quota_item_timer(sc, &dq->q_ino, &dirty); + xrep_quota_item_timer(sc, &dq->q_rtb, &dirty); + + if (!dirty) + return 0; + + trace_xrep_dquot_item(sc->mp, dq->q_type, dq->q_id); + + dq->q_flags |= XFS_DQFLAG_DIRTY; + xfs_trans_dqjoin(sc->tp, dq); + if (dq->q_id) { + xfs_qm_adjust_dqlimits(dq); + xfs_qm_adjust_dqtimers(dq); + } + xfs_trans_log_dquot(sc->tp, dq); + error = xfs_trans_roll(&sc->tp); + xfs_dqlock(dq); + return error; +} + +/* Fix a quota timer so that we can pass the verifier. 
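Both the scrub-side check earlier in this series and the repair-side reset above enforce the same invariant: a grace-period timer should be running exactly when usage exceeds a soft or hard limit. A self-contained model of that predicate (struct res and timer_is_wrong() are illustrative stand-ins, not kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct res {			/* stand-in for struct xfs_dquot_res */
	uint64_t count;
	uint64_t softlimit;
	uint64_t hardlimit;
	uint32_t timer;		/* 0 means no grace period is running */
};

/* True if the timer state contradicts the usage vs. the limits. */
static bool timer_is_wrong(const struct res *r)
{
	bool over = (r->softlimit && r->count > r->softlimit) ||
		    (r->hardlimit && r->count > r->hardlimit);

	return over ? (r->timer == 0) : (r->timer != 0);
}

int main(void)
{
	struct res ok  = { .count = 5,  .softlimit = 10, .timer = 0 };
	struct res bad = { .count = 15, .softlimit = 10, .timer = 0 };

	printf("%d %d\n", timer_is_wrong(&ok), timer_is_wrong(&bad));	/* 0 1 */
	return 0;
}

Scrub flags the inconsistent case as corruption; repair only marks the dquot dirty so that xfs_qm_adjust_dqtimers() can start or clear the timer when the item is logged.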
*/ +STATIC void +xrep_quota_fix_timer( + struct xfs_mount *mp, + const struct xfs_disk_dquot *ddq, + __be64 softlimit, + __be64 countnow, + __be32 *timer, + time64_t timelimit) +{ + uint64_t soft = be64_to_cpu(softlimit); + uint64_t count = be64_to_cpu(countnow); + time64_t new_timer; + uint32_t t; + + if (!soft || count <= soft || *timer != 0) + return; + + new_timer = xfs_dquot_set_timeout(mp, + ktime_get_real_seconds() + timelimit); + if (ddq->d_type & XFS_DQTYPE_BIGTIME) + t = xfs_dq_unix_to_bigtime(new_timer); + else + t = new_timer; + + *timer = cpu_to_be32(t); +} + +/* Fix anything the verifiers complain about. */ +STATIC int +xrep_quota_block( + struct xfs_scrub *sc, + xfs_daddr_t daddr, + xfs_dqtype_t dqtype, + xfs_dqid_t id) +{ + struct xfs_dqblk *dqblk; + struct xfs_disk_dquot *ddq; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_def_quota *defq = xfs_get_defquota(qi, dqtype); + struct xfs_buf *bp = NULL; + enum xfs_blft buftype = 0; + int i; + int error; + + error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, daddr, + qi->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); + switch (error) { + case -EFSBADCRC: + case -EFSCORRUPTED: + /* Failed verifier, retry read with no ops. */ + error = xfs_trans_read_buf(sc->mp, sc->tp, + sc->mp->m_ddev_targp, daddr, qi->qi_dqchunklen, + 0, &bp, NULL); + if (error) + return error; + break; + case 0: + dqblk = bp->b_addr; + ddq = &dqblk[0].dd_diskdq; + + /* + * If there's nothing that would impede a dqiterate, we're + * done. + */ + if ((ddq->d_type & XFS_DQTYPE_REC_MASK) != dqtype || + id == be32_to_cpu(ddq->d_id)) { + xfs_trans_brelse(sc->tp, bp); + return 0; + } + break; + default: + return error; + } + + /* Something's wrong with the block, fix the whole thing. */ + dqblk = bp->b_addr; + bp->b_ops = &xfs_dquot_buf_ops; + for (i = 0; i < qi->qi_dqperchunk; i++, dqblk++) { + ddq = &dqblk->dd_diskdq; + + trace_xrep_disk_dquot(sc->mp, dqtype, id + i); + + ddq->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + ddq->d_version = XFS_DQUOT_VERSION; + ddq->d_type = dqtype; + ddq->d_id = cpu_to_be32(id + i); + + if (xfs_has_bigtime(sc->mp) && ddq->d_id) + ddq->d_type |= XFS_DQTYPE_BIGTIME; + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_blk_softlimit, + ddq->d_bcount, &ddq->d_btimer, + defq->blk.time); + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_ino_softlimit, + ddq->d_icount, &ddq->d_itimer, + defq->ino.time); + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_rtb_softlimit, + ddq->d_rtbcount, &ddq->d_rtbtimer, + defq->rtb.time); + + /* We only support v5 filesystems so always set these. */ + uuid_copy(&dqblk->dd_uuid, &sc->mp->m_sb.sb_meta_uuid); + xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + dqblk->dd_lsn = 0; + } + switch (dqtype) { + case XFS_DQTYPE_USER: + buftype = XFS_BLFT_UDQUOT_BUF; + break; + case XFS_DQTYPE_GROUP: + buftype = XFS_BLFT_GDQUOT_BUF; + break; + case XFS_DQTYPE_PROJ: + buftype = XFS_BLFT_PDQUOT_BUF; + break; + } + xfs_trans_buf_set_type(sc->tp, bp, buftype); + xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); + return xrep_roll_trans(sc); +} + +/* + * Repair a quota file's data fork. The function returns with the inode + * joined. 
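One detail in xrep_quota_block above deserves emphasis: the block is first read with the dquot verifier attached, and only if the verifier rejects it is the buffer re-read with no ops so that every dquot in it can be rebuilt; a block that passes the verifier and matches the expected type and id is released untouched. Condensed from the hunk above (an excerpt for illustration, not standalone code):

	error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, daddr,
			qi->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops);
	switch (error) {
	case -EFSBADCRC:
	case -EFSCORRUPTED:
		/* verifier rejected the block; re-read raw and rewrite it all */
		error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
				daddr, qi->qi_dqchunklen, 0, &bp, NULL);
		break;
	case 0:
		/* verifier was happy; bail out early if the type and id line up */
		break;
	default:
		return error;
	}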
+ */ +STATIC int +xrep_quota_data_fork( + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + struct xfs_bmbt_irec irec = { 0 }; + struct xfs_iext_cursor icur; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_ifork *ifp; + xfs_fileoff_t max_dqid_off; + xfs_fileoff_t off; + xfs_fsblock_t fsbno; + bool truncate = false; + bool joined = false; + int error = 0; + + error = xrep_metadata_inode_forks(sc); + if (error) + goto out; + + /* Check for data fork problems that apply only to quota files. */ + max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk; + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + for_each_xfs_iext(ifp, &icur, &irec) { + if (isnullstartblock(irec.br_startblock)) { + error = -EFSCORRUPTED; + goto out; + } + + if (irec.br_startoff > max_dqid_off || + irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) { + truncate = true; + break; + } + + /* Convert unwritten extents to real ones. */ + if (irec.br_state == XFS_EXT_UNWRITTEN) { + struct xfs_bmbt_irec nrec; + int nmap = 1; + + if (!joined) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + } + + error = xfs_bmapi_write(sc->tp, sc->ip, + irec.br_startoff, irec.br_blockcount, + XFS_BMAPI_CONVERT, 0, &nrec, &nmap); + if (error) + goto out; + if (nmap != 1) { + error = -ENOSPC; + goto out; + } + ASSERT(nrec.br_startoff == irec.br_startoff); + ASSERT(nrec.br_blockcount == irec.br_blockcount); + + error = xfs_defer_finish(&sc->tp); + if (error) + goto out; + } + } + + if (!joined) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + } + + if (truncate) { + /* Erase everything after the block containing the max dquot */ + error = xfs_bunmapi_range(&sc->tp, sc->ip, 0, + max_dqid_off * sc->mp->m_sb.sb_blocksize, + XFS_MAX_FILEOFF); + if (error) + goto out; + + /* Remove all CoW reservations. */ + error = xfs_reflink_cancel_cow_blocks(sc->ip, &sc->tp, 0, + XFS_MAX_FILEOFF, true); + if (error) + goto out; + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + + /* + * Always re-log the inode so that our permanent transaction + * can keep on rolling it forward in the log. + */ + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + } + + /* Now go fix anything that fails the verifiers. */ + for_each_xfs_iext(ifp, &icur, &irec) { + for (fsbno = irec.br_startblock, off = irec.br_startoff; + fsbno < irec.br_startblock + irec.br_blockcount; + fsbno += XFS_DQUOT_CLUSTER_SIZE_FSB, + off += XFS_DQUOT_CLUSTER_SIZE_FSB) { + error = xrep_quota_block(sc, + XFS_FSB_TO_DADDR(sc->mp, fsbno), + dqtype, off * qi->qi_dqperchunk); + if (error) + goto out; + } + } + +out: + return error; +} + +/* + * Go fix anything in the quota items that we could have been mad about. Now + * that we've checked the quota inode data fork we have to drop ILOCK_EXCL to + * use the regular dquot functions. + */ +STATIC int +xrep_quota_problems( + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + struct xchk_dqiter cursor = { }; + struct xrep_quota_info rqi = { .sc = sc }; + struct xfs_dquot *dq; + int error; + + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xrep_quota_item(&rqi, dq); + xfs_qm_dqput(dq); + if (error) + break; + } + if (error) + return error; + + /* Make a quotacheck happen. */ + if (rqi.need_quotacheck) + xrep_force_quotacheck(sc, dqtype); + return 0; +} + +/* Repair all of a quota type's items. 
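Note that the repair loop above and the checking loop in scrub/quota.c drive the same cursor API. The caller-side pattern, condensed from those hunks (process_one_dquot() is a hypothetical placeholder for xchk_quota_item() or xrep_quota_item()):

	struct xchk_dqiter	cursor = { };
	struct xfs_dquot	*dq;
	int			error;

	xchk_dqiter_init(&cursor, sc, dqtype);
	while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
		error = process_one_dquot(sc, dq);	/* hypothetical callback */
		xfs_qm_dqput(dq);
		if (error)
			break;
	}
	/* 1 yields a referenced, locked dquot; 0 ends the walk; <0 is an error */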
*/ +int +xrep_quota( + struct xfs_scrub *sc) +{ + xfs_dqtype_t dqtype; + int error; + + dqtype = xchk_quota_to_dqtype(sc); + + /* + * Re-take the ILOCK so that we can fix any problems that we found + * with the data fork mappings, or with the dquot bufs themselves. + */ + if (!(sc->ilock_flags & XFS_ILOCK_EXCL)) + xchk_ilock(sc, XFS_ILOCK_EXCL); + error = xrep_quota_data_fork(sc, dqtype); + if (error) + return error; + + /* + * Finish deferred items and roll the transaction to unjoin the quota + * inode from transaction so that we can unlock the quota inode; we + * play only with dquots from now on. + */ + error = xrep_defer_finish(sc); + if (error) + return error; + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + xchk_iunlock(sc, sc->ilock_flags); + + /* Fix anything the dquot verifiers don't complain about. */ + error = xrep_quota_problems(sc, dqtype); + if (error) + return error; + + return xrep_trans_commit(sc); +} diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c index e51c1544be63..16462332c897 100644 --- a/fs/xfs/scrub/readdir.c +++ b/fs/xfs/scrub/readdir.c @@ -36,16 +36,14 @@ xchk_dir_walk_sf( struct xfs_mount *mp = dp->i_mount; struct xfs_da_geometry *geo = mp->m_dir_geo; struct xfs_dir2_sf_entry *sfep; - struct xfs_dir2_sf_hdr *sfp; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; xfs_ino_t ino; xfs_dir2_dataptr_t dapos; unsigned int i; int error; ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - - sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); /* dot entry */ dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 86a62420e02c..f99eca799809 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -20,6 +20,7 @@ #include "xfs_ialloc_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_refcount.h" #include "xfs_refcount_btree.h" #include "xfs_extent_busy.h" #include "xfs_ag.h" @@ -31,11 +32,14 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_attr_remote.h" +#include "xfs_defer.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/fsb_bitmap.h" #include "scrub/reap.h" /* @@ -73,10 +77,10 @@ * with only the same rmap owner but the block is not owned by something with * the same rmap owner, the block will be freed. * - * The caller is responsible for locking the AG headers for the entire rebuild - * operation so that nothing else can sneak in and change the AG state while - * we're not looking. We must also invalidate any buffers associated with - * @bitmap. + * The caller is responsible for locking the AG headers/inode for the entire + * rebuild operation so that nothing else can sneak in and change the incore + * state while we're not looking. We must also invalidate any buffers + * associated with @bitmap. */ /* Information about reaping extents after a repair. 
*/ @@ -247,7 +251,7 @@ xreap_agextent_binval( max_fsbs = min_t(xfs_agblock_t, agbno_next - bno, xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX)); - for (fsbcount = 1; fsbcount < max_fsbs; fsbcount++) { + for (fsbcount = 1; fsbcount <= max_fsbs; fsbcount++) { struct xfs_buf *bp = NULL; xfs_daddr_t daddr; int error; @@ -377,6 +381,17 @@ xreap_agextent_iter( trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp); rs->force_roll = true; + + if (rs->oinfo == &XFS_RMAP_OINFO_COW) { + /* + * If we're unmapping CoW staging extents, remove the + * records from the refcountbt, which will remove the + * rmap record as well. + */ + xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + return 0; + } + return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, *aglenp, rs->oinfo); } @@ -395,6 +410,26 @@ xreap_agextent_iter( return 0; } + /* + * If we're getting rid of CoW staging extents, use deferred work items + * to remove the refcountbt records (which removes the rmap records) + * and free the extent. We're not worried about the system going down + * here because log recovery walks the refcount btree to clean out the + * CoW staging extents. + */ + if (rs->oinfo == &XFS_RMAP_OINFO_COW) { + ASSERT(rs->resv == XFS_AG_RESV_NONE); + + xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL, + rs->resv, true); + if (error) + return error; + + rs->force_roll = true; + return 0; + } + /* Put blocks back on the AGFL one at a time. */ if (rs->resv == XFS_AG_RESV_AGFL) { ASSERT(*aglenp == 1); @@ -409,13 +444,17 @@ xreap_agextent_iter( /* * Use deferred frees to get rid of the old btree blocks to try to * minimize the window in which we could crash and lose the old blocks. + * Add a defer ops barrier every other extent to avoid stressing the + * system with large EFIs. */ - error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, + error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, rs->resv, true); if (error) return error; rs->deferred++; + if (rs->deferred % 2 == 0) + xfs_defer_add_barrier(sc->tp); return 0; } @@ -425,13 +464,12 @@ xreap_agextent_iter( */ STATIC int xreap_agmeta_extent( - uint64_t fsbno, - uint64_t len, + uint32_t agbno, + uint32_t len, void *priv) { struct xreap_state *rs = priv; struct xfs_scrub *sc = rs->sc; - xfs_agblock_t agbno = fsbno; xfs_agblock_t agbno_next = agbno + len; int error = 0; @@ -496,3 +534,115 @@ xrep_reap_agblocks( return 0; } + +/* + * Break a file metadata extent into sub-extents by fate (crosslinked, not + * crosslinked), and dispose of each sub-extent separately. The extent must + * not cross an AG boundary. + */ +STATIC int +xreap_fsmeta_extent( + uint64_t fsbno, + uint64_t len, + void *priv) +{ + struct xreap_state *rs = priv; + struct xfs_scrub *sc = rs->sc; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); + xfs_agblock_t agbno_next = agbno + len; + int error = 0; + + ASSERT(len <= XFS_MAX_BMBT_EXTLEN); + ASSERT(sc->ip != NULL); + ASSERT(!sc->sa.pag); + + /* + * We're reaping blocks after repairing file metadata, which means that + * we have to init the xchk_ag structure ourselves. 
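The per-AG loop in xreap_fsmeta_extent that follows has to roll the transaction periodically without losing the AGF buffer it just read; the hold/roll/join sequence it uses is worth highlighting on its own. Condensed from the hunk below (an excerpt for illustration, not standalone code):

	/* keep the AGF buffer locked and attached across the transaction roll */
	xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
	error = xfs_trans_roll_inode(&sc->tp, sc->ip);
	xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
	if (error)
		goto out_agf;

Without the bhold/bjoin pair the roll would release the AGF buffer, and the loop would have to re-read and re-attach it on every iteration, which is exactly what the comment in the hunk says it is avoiding.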
+ */ + sc->sa.pag = xfs_perag_get(sc->mp, agno); + if (!sc->sa.pag) + return -EFSCORRUPTED; + + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp); + if (error) + goto out_pag; + + while (agbno < agbno_next) { + xfs_extlen_t aglen; + bool crosslinked; + + error = xreap_agextent_select(rs, agbno, agbno_next, + &crosslinked, &aglen); + if (error) + goto out_agf; + + error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked); + if (error) + goto out_agf; + + if (xreap_want_defer_finish(rs)) { + /* + * Holds the AGF buffer across the deferred chain + * processing. + */ + error = xrep_defer_finish(sc); + if (error) + goto out_agf; + xreap_defer_finish_reset(rs); + } else if (xreap_want_roll(rs)) { + /* + * Hold the AGF buffer across the transaction roll so + * that we don't have to reattach it to the scrub + * context. + */ + xfs_trans_bhold(sc->tp, sc->sa.agf_bp); + error = xfs_trans_roll_inode(&sc->tp, sc->ip); + xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); + if (error) + goto out_agf; + xreap_reset(rs); + } + + agbno += aglen; + } + +out_agf: + xfs_trans_brelse(sc->tp, sc->sa.agf_bp); + sc->sa.agf_bp = NULL; +out_pag: + xfs_perag_put(sc->sa.pag); + sc->sa.pag = NULL; + return error; +} + +/* + * Dispose of every block of every fs metadata extent in the bitmap. + * Do not use this to dispose of the mappings in an ondisk inode fork. + */ +int +xrep_reap_fsblocks( + struct xfs_scrub *sc, + struct xfsb_bitmap *bitmap, + const struct xfs_owner_info *oinfo) +{ + struct xreap_state rs = { + .sc = sc, + .oinfo = oinfo, + .resv = XFS_AG_RESV_NONE, + }; + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(sc->ip != NULL); + + error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); + if (error) + return error; + + if (xreap_dirty(&rs)) + return xrep_defer_finish(sc); + + return 0; +} diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h index fe24626af164..0b69f16dd98f 100644 --- a/fs/xfs/scrub/reap.h +++ b/fs/xfs/scrub/reap.h @@ -6,7 +6,12 @@ #ifndef __XFS_SCRUB_REAP_H__ #define __XFS_SCRUB_REAP_H__ +struct xagb_bitmap; +struct xfsb_bitmap; + int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); +int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap, + const struct xfs_owner_info *oinfo); #endif /* __XFS_SCRUB_REAP_H__ */ diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 304ea1e1bfb0..bf22f245bbfa 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -441,7 +441,7 @@ xchk_refcountbt_rec( struct xchk_refcbt_records *rrc = bs->private; xfs_refcount_btrec_to_irec(rec, &irec); - if (xfs_refcount_check_irec(bs->cur, &irec) != NULL) { + if (xfs_refcount_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c new file mode 100644 index 000000000000..f38fccc42a20 --- /dev/null +++ b/fs/xfs/scrub/refcount_repair.c @@ -0,0 +1,794 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_error.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Rebuilding the Reference Count Btree + * ==================================== + * + * This algorithm is "borrowed" from xfs_repair. Imagine the rmap + * entries as rectangles representing extents of physical blocks, and + * that the rectangles can be laid down to allow them to overlap each + * other; then we know that we must emit a refcnt btree entry wherever + * the amount of overlap changes, i.e. the emission stimulus is + * level-triggered: + * + * - --- + * -- ----- ---- --- ------ + * -- ---- ----------- ---- --------- + * -------------------------------- ----------- + * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^ + * 2 1 23 21 3 43 234 2123 1 01 2 3 0 + * + * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner). + * + * Note that in the actual refcnt btree we don't store the refcount < 2 + * cases because the bnobt tells us which blocks are free; single-use + * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt + * supports storing multiple entries covering a given block we could + * theoretically dispense with the refcntbt and simply count rmaps, but + * that's inefficient in the (hot) write path, so we'll take the cost of + * the extra tree to save time. Also there's no guarantee that rmap + * will be enabled. + * + * Given an array of rmaps sorted by physical block number, a starting + * physical block (sp), a bag to hold rmaps that cover sp, and the next + * physical block where the level changes (np), we can reconstruct the + * refcount btree as follows: + * + * While there are still unprocessed rmaps in the array, + * - Set sp to the physical block (pblk) of the next unprocessed rmap. + * - Add to the bag all rmaps in the array where startblock == sp. + * - Set np to the physical block where the bag size will change. This + * is the minimum of (the pblk of the next unprocessed rmap) and + * (startblock + len of each rmap in the bag). + * - Record the bag size as old_bag_size. + * + * - While the bag isn't empty, + * - Remove from the bag all rmaps where startblock + len == np. + * - Add to the bag all rmaps in the array where startblock == np. + * - If the bag size isn't old_bag_size, store the refcount entry + * (sp, np - sp, bag_size) in the refcnt btree. + * - If the bag is empty, break out of the inner loop. + * - Set old_bag_size to the bag size + * - Set sp = np. + * - Set np to the physical block where the bag size will change. + * This is the minimum of (the pblk of the next unprocessed rmap) + * and (startblock + len of each rmap in the bag). 
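The level-triggered sweep described above can be demonstrated with a small, self-contained program. The version below replaces the rmap bag with a sorted list of start/end edges, which yields the same records: walk the edges in block order, track how many extents currently overlap, and emit a refcount record whenever that level changes, skipping levels below 2 as the real btree does. This is an illustrative model only, not the xfarray-based implementation in the patch.

#include <stdio.h>
#include <stdlib.h>

/* A reverse mapping, reduced to the two fields the refcount sweep needs. */
struct rmap {
	unsigned int startblock;
	unsigned int blockcount;
};

/* +1 where an rmap starts, -1 just past where it ends. */
struct edge {
	unsigned int bno;
	int delta;
};

static int edge_cmp(const void *a, const void *b)
{
	const struct edge *ea = a, *eb = b;

	return ea->bno < eb->bno ? -1 : ea->bno > eb->bno ? 1 : 0;
}

int main(void)
{
	struct rmap rmaps[] = {
		{ 10, 20 }, { 15, 10 }, { 18, 4 }, { 40, 8 }, { 44, 4 },
	};
	struct edge edges[2 * sizeof(rmaps) / sizeof(rmaps[0])];
	unsigned int nr = sizeof(rmaps) / sizeof(rmaps[0]);
	unsigned int i, nedges = 0, sp = 0;
	int level = 0, old_level = 0;

	for (i = 0; i < nr; i++) {
		edges[nedges++] = (struct edge){ rmaps[i].startblock, +1 };
		edges[nedges++] = (struct edge){
			rmaps[i].startblock + rmaps[i].blockcount, -1 };
	}
	qsort(edges, nedges, sizeof(edges[0]), edge_cmp);

	i = 0;
	while (i < nedges) {
		unsigned int bno = edges[i].bno;

		/* apply every edge at this block before looking at the level */
		while (i < nedges && edges[i].bno == bno)
			level += edges[i++].delta;

		if (level != old_level) {
			if (old_level >= 2)
				printf("refcount: start %u len %u count %d\n",
				       sp, bno - sp, old_level);
			sp = bno;
			old_level = level;
		}
	}
	return 0;
}

With the sample extents above this prints records for blocks 15-17 (count 2), 18-21 (count 3), 22-24 (count 2) and 44-47 (count 2); the single-owner stretches never make it into the output, matching the note that refcounts below 2 are left to the bnobt.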
+ * + * Like all the other repairers, we make a list of all the refcount + * records we need, then reinitialize the refcount btree root and + * insert all the records. + */ + +/* The only parts of the rmap that we care about for computing refcounts. */ +struct xrep_refc_rmap { + xfs_agblock_t startblock; + xfs_extlen_t blockcount; +} __packed; + +struct xrep_refc { + /* refcount extents */ + struct xfarray *refcount_records; + + /* new refcountbt information */ + struct xrep_newbt new_btree; + + /* old refcountbt blocks */ + struct xagb_bitmap old_refcountbt_blocks; + + struct xfs_scrub *sc; + + /* get_records()'s position in the refcount record array. */ + xfarray_idx_t array_cur; + + /* # of refcountbt blocks */ + xfs_extlen_t btblocks; +}; + +/* Check for any obvious conflicts with this shared/CoW staging extent. */ +STATIC int +xrep_refc_check_ext( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *rec) +{ + enum xbtree_recpacking outcome; + int error; + + if (xfs_refcount_check_irec(sc->sa.pag, rec) != NULL) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rc_startblock, + rec->rc_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->rc_startblock, rec->rc_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Record a reference count extent. */ +STATIC int +xrep_refc_stash( + struct xrep_refc *rr, + enum xfs_refc_domain domain, + xfs_agblock_t agbno, + xfs_extlen_t len, + uint64_t refcount) +{ + struct xfs_refcount_irec irec = { + .rc_startblock = agbno, + .rc_blockcount = len, + .rc_domain = domain, + }; + struct xfs_scrub *sc = rr->sc; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount); + + error = xrep_refc_check_ext(rr->sc, &irec); + if (error) + return error; + + trace_xrep_refc_found(sc->sa.pag, &irec); + + return xfarray_append(rr->refcount_records, &irec); +} + +/* Record a CoW staging extent. */ +STATIC int +xrep_refc_stash_cow( + struct xrep_refc *rr, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + return xrep_refc_stash(rr, XFS_REFC_DOMAIN_COW, agbno, len, 1); +} + +/* Decide if an rmap could describe a shared extent. */ +static inline bool +xrep_refc_rmap_shareable( + struct xfs_mount *mp, + const struct xfs_rmap_irec *rmap) +{ + /* AG metadata are never sharable */ + if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + return false; + + /* Metadata in files are never shareable */ + if (xfs_internal_inum(mp, rmap->rm_owner)) + return false; + + /* Metadata and unwritten file blocks are not shareable. */ + if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK | + XFS_RMAP_UNWRITTEN)) + return false; + + return true; +} + +/* + * Walk along the reverse mapping records until we find one that could describe + * a shared extent. + */ +STATIC int +xrep_refc_walk_rmaps( + struct xrep_refc *rr, + struct xrep_refc_rmap *rrm, + bool *have_rec) +{ + struct xfs_rmap_irec rmap; + struct xfs_btree_cur *cur = rr->sc->sa.rmap_cur; + struct xfs_mount *mp = cur->bc_mp; + int have_gt; + int error = 0; + + *have_rec = false; + + /* + * Loop through the remaining rmaps. Remember CoW staging + * extents and the refcountbt blocks from the old tree for later + * disposal. 
We can only share written data fork extents, so + * keep looping until we find an rmap for one. + */ + do { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfs_btree_increment(cur, 0, &have_gt); + if (error) + return error; + if (!have_gt) + return 0; + + error = xfs_rmap_get_rec(cur, &rmap, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(mp, !have_gt)) + return -EFSCORRUPTED; + + if (rmap.rm_owner == XFS_RMAP_OWN_COW) { + error = xrep_refc_stash_cow(rr, rmap.rm_startblock, + rmap.rm_blockcount); + if (error) + return error; + } else if (rmap.rm_owner == XFS_RMAP_OWN_REFC) { + /* refcountbt block, dump it when we're done. */ + rr->btblocks += rmap.rm_blockcount; + error = xagb_bitmap_set(&rr->old_refcountbt_blocks, + rmap.rm_startblock, rmap.rm_blockcount); + if (error) + return error; + } + } while (!xrep_refc_rmap_shareable(mp, &rmap)); + + rrm->startblock = rmap.rm_startblock; + rrm->blockcount = rmap.rm_blockcount; + *have_rec = true; + return 0; +} + +static inline uint32_t +xrep_refc_encode_startblock( + const struct xfs_refcount_irec *irec) +{ + uint32_t start; + + start = irec->rc_startblock & ~XFS_REFC_COWFLAG; + if (irec->rc_domain == XFS_REFC_DOMAIN_COW) + start |= XFS_REFC_COWFLAG; + + return start; +} + +/* Sort in the same order as the ondisk records. */ +static int +xrep_refc_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_refcount_irec *ap = a; + const struct xfs_refcount_irec *bp = b; + uint32_t sa, sb; + + sa = xrep_refc_encode_startblock(ap); + sb = xrep_refc_encode_startblock(bp); + + if (sa > sb) + return 1; + if (sa < sb) + return -1; + return 0; +} + +/* + * Sort the refcount extents by startblock or else the btree records will be in + * the wrong order. Make sure the records do not overlap in physical space. + */ +STATIC int +xrep_refc_sort_records( + struct xrep_refc *rr) +{ + struct xfs_refcount_irec irec; + xfarray_idx_t cur; + enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED; + xfs_agblock_t next_agbno = 0; + int error; + + error = xfarray_sort(rr->refcount_records, xrep_refc_extent_cmp, + XFARRAY_SORT_KILLABLE); + if (error) + return error; + + foreach_xfarray_idx(rr->refcount_records, cur) { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfarray_load(rr->refcount_records, cur, &irec); + if (error) + return error; + + if (dom == XFS_REFC_DOMAIN_SHARED && + irec.rc_domain == XFS_REFC_DOMAIN_COW) { + dom = irec.rc_domain; + next_agbno = 0; + } + + if (dom != irec.rc_domain) + return -EFSCORRUPTED; + if (irec.rc_startblock < next_agbno) + return -EFSCORRUPTED; + + next_agbno = irec.rc_startblock + irec.rc_blockcount; + } + + return error; +} + +#define RRM_NEXT(r) ((r).startblock + (r).blockcount) +/* + * Find the next block where the refcount changes, given the next rmap we + * looked at and the ones we're already tracking. 
+ */ +static inline int +xrep_refc_next_edge( + struct xfarray *rmap_bag, + struct xrep_refc_rmap *next_rrm, + bool next_valid, + xfs_agblock_t *nbnop) +{ + struct xrep_refc_rmap rrm; + xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT; + xfs_agblock_t nbno = NULLAGBLOCK; + int error; + + if (next_valid) + nbno = next_rrm->startblock; + + while ((error = xfarray_iter(rmap_bag, &array_cur, &rrm)) == 1) + nbno = min_t(xfs_agblock_t, nbno, RRM_NEXT(rrm)); + + if (error) + return error; + + /* + * We should have found /something/ because either next_rrm is the next + * interesting rmap to look at after emitting this refcount extent, or + * there are other rmaps in rmap_bag contributing to the current + * sharing count. But if something is seriously wrong, bail out. + */ + if (nbno == NULLAGBLOCK) + return -EFSCORRUPTED; + + *nbnop = nbno; + return 0; +} + +/* + * Walk forward through the rmap btree to collect all rmaps starting at + * @bno in @rmap_bag. These represent the file(s) that share ownership of + * the current block. Upon return, the rmap cursor points to the last record + * satisfying the startblock constraint. + */ +static int +xrep_refc_push_rmaps_at( + struct xrep_refc *rr, + struct xfarray *rmap_bag, + xfs_agblock_t bno, + struct xrep_refc_rmap *rrm, + bool *have, + uint64_t *stack_sz) +{ + struct xfs_scrub *sc = rr->sc; + int have_gt; + int error; + + while (*have && rrm->startblock == bno) { + error = xfarray_store_anywhere(rmap_bag, rrm); + if (error) + return error; + (*stack_sz)++; + error = xrep_refc_walk_rmaps(rr, rrm, have); + if (error) + return error; + } + + error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(sc->mp, !have_gt)) + return -EFSCORRUPTED; + + return 0; +} + +/* Iterate all the rmap records to generate reference count data. */ +STATIC int +xrep_refc_find_refcounts( + struct xrep_refc *rr) +{ + struct xrep_refc_rmap rrm; + struct xfs_scrub *sc = rr->sc; + struct xfarray *rmap_bag; + char *descr; + uint64_t old_stack_sz; + uint64_t stack_sz = 0; + xfs_agblock_t sbno; + xfs_agblock_t cbno; + xfs_agblock_t nbno; + bool have; + int error; + + xrep_ag_btcur_init(sc, &sc->sa); + + /* + * Set up a sparse array to store all the rmap records that we're + * tracking to generate a reference count record. If this exceeds + * MAXREFCOUNT, we clamp rc_refcount. + */ + descr = xchk_xfile_ag_descr(sc, "rmap record bag"); + error = xfarray_create(descr, 0, sizeof(struct xrep_refc_rmap), + &rmap_bag); + kfree(descr); + if (error) + goto out_cur; + + /* Start the rmapbt cursor to the left of all records. */ + error = xfs_btree_goto_left_edge(sc->sa.rmap_cur); + if (error) + goto out_bag; + + /* Process reverse mappings into refcount data. */ + while (xfs_btree_has_more_records(sc->sa.rmap_cur)) { + /* Push all rmaps with pblk == sbno onto the stack */ + error = xrep_refc_walk_rmaps(rr, &rrm, &have); + if (error) + goto out_bag; + if (!have) + break; + sbno = cbno = rrm.startblock; + error = xrep_refc_push_rmaps_at(rr, rmap_bag, sbno, + &rrm, &have, &stack_sz); + if (error) + goto out_bag; + + /* Set nbno to the bno of the next refcount change */ + error = xrep_refc_next_edge(rmap_bag, &rrm, have, &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + old_stack_sz = stack_sz; + + /* While stack isn't empty... 
*/ + while (stack_sz) { + xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT; + + /* Pop all rmaps that end at nbno */ + while ((error = xfarray_iter(rmap_bag, &array_cur, + &rrm)) == 1) { + if (RRM_NEXT(rrm) != nbno) + continue; + error = xfarray_unset(rmap_bag, array_cur - 1); + if (error) + goto out_bag; + stack_sz--; + } + if (error) + goto out_bag; + + /* Push array items that start at nbno */ + error = xrep_refc_walk_rmaps(rr, &rrm, &have); + if (error) + goto out_bag; + if (have) { + error = xrep_refc_push_rmaps_at(rr, rmap_bag, + nbno, &rrm, &have, &stack_sz); + if (error) + goto out_bag; + } + + /* Emit refcount if necessary */ + ASSERT(nbno > cbno); + if (stack_sz != old_stack_sz) { + if (old_stack_sz > 1) { + error = xrep_refc_stash(rr, + XFS_REFC_DOMAIN_SHARED, + cbno, nbno - cbno, + old_stack_sz); + if (error) + goto out_bag; + } + cbno = nbno; + } + + /* Stack empty, go find the next rmap */ + if (stack_sz == 0) + break; + old_stack_sz = stack_sz; + sbno = nbno; + + /* Set nbno to the bno of the next refcount change */ + error = xrep_refc_next_edge(rmap_bag, &rrm, have, + &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + } + } + + ASSERT(stack_sz == 0); +out_bag: + xfarray_destroy(rmap_bag); +out_cur: + xchk_ag_btcur_free(&sc->sa); + return error; +} +#undef RRM_NEXT + +/* Retrieve refcountbt data for bulk load. */ +STATIC int +xrep_refc_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + struct xrep_refc *rr = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load(rr->refcount_records, rr->array_cur++, + irec); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_refc_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_refc *rr = priv; + + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); +} + +/* Update the AGF counters. */ +STATIC int +xrep_refc_reset_counters( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + + /* + * After we commit the new btree to disk, it is possible that the + * process to reap the old btree blocks will race with the AIL trying + * to checkpoint the old btree blocks into the filesystem. If the new + * tree is shorter than the old one, the refcountbt write verifier will + * fail and the AIL will shut down the filesystem. + * + * To avoid this, save the old incore btree height values as the alt + * height values before re-initializing the perag info from the updated + * AGF to capture all the new values. + */ + pag->pagf_repair_refcount_level = pag->pagf_refcount_level; + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagf(sc); +} + +/* + * Use the collected refcount information to stage a new refcount btree. If + * this is successful we'll return with the new btree root information logged + * to the repair transaction but not yet committed. 
+ */ +STATIC int +xrep_refc_build_new_tree( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_btree_cur *refc_cur; + struct xfs_perag *pag = sc->sa.pag; + xfs_fsblock_t fsbno; + int error; + + error = xrep_refc_sort_records(rr); + if (error) + return error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the AG header. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, xfs_refc_block(sc->mp)); + xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, fsbno, + XFS_AG_RESV_METADATA); + rr->new_btree.bload.get_records = xrep_refc_get_records; + rr->new_btree.bload.claim_block = xrep_refc_claim_block; + + /* Compute how many blocks we'll need. */ + refc_cur = xfs_refcountbt_stage_cursor(sc->mp, &rr->new_btree.afake, + pag); + error = xfs_btree_bload_compute_geometry(refc_cur, + &rr->new_btree.bload, + xfarray_length(rr->refcount_records)); + if (error) + goto err_cur; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rr->new_btree, + rr->new_btree.bload.nr_blocks); + if (error) + goto err_cur; + + /* + * Due to btree slack factors, it's possible for a new btree to be one + * level taller than the old btree. Update the incore btree height so + * that we don't trip the verifiers when writing the new btree blocks + * to disk. + */ + pag->pagf_repair_refcount_level = rr->new_btree.bload.btree_height; + + /* Add all observed refcount records. */ + rr->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr); + if (error) + goto err_level; + + /* + * Install the new btree in the AG header. After this point the old + * btree is no longer accessible and the new tree is live. + */ + xfs_refcountbt_commit_staged_btree(refc_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(refc_cur, 0); + + /* Reset the AGF counters now that we've changed the btree shape. */ + error = xrep_refc_reset_counters(rr); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rr->new_btree); + if (error) + return error; + + return xrep_roll_ag_trans(sc); + +err_level: + pag->pagf_repair_refcount_level = 0; +err_cur: + xfs_btree_del_cursor(refc_cur, error); +err_newbt: + xrep_newbt_cancel(&rr->new_btree); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_refc_remove_old_tree( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + int error; + + /* Free the old refcountbt blocks if they're not in use. */ + error = xrep_reap_agblocks(sc, &rr->old_refcountbt_blocks, + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); + if (error) + return error; + + /* + * Now that we've zapped all the old refcountbt blocks we can turn off + * the alternate height mechanism and reset the per-AG space + * reservations. + */ + pag->pagf_repair_refcount_level = 0; + sc->flags |= XREP_RESET_PERAG_RESV; + return 0; +} + +/* Rebuild the refcount btree. 
*/ +int +xrep_refcountbt( + struct xfs_scrub *sc) +{ + struct xrep_refc *rr; + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + rr = kzalloc(sizeof(struct xrep_refc), XCHK_GFP_FLAGS); + if (!rr) + return -ENOMEM; + rr->sc = sc; + + /* Set up enough storage to handle one refcount record per block. */ + descr = xchk_xfile_ag_descr(sc, "reference count records"); + error = xfarray_create(descr, mp->m_sb.sb_agblocks, + sizeof(struct xfs_refcount_irec), + &rr->refcount_records); + kfree(descr); + if (error) + goto out_rr; + + /* Collect all reference counts. */ + xagb_bitmap_init(&rr->old_refcountbt_blocks); + error = xrep_refc_find_refcounts(rr); + if (error) + goto out_bitmap; + + /* Rebuild the refcount information. */ + error = xrep_refc_build_new_tree(rr); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_refc_remove_old_tree(rr); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&rr->old_refcountbt_blocks); + xfarray_destroy(rr->refcount_records); +out_rr: + kfree(rr); + return error; +} diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 1b8b5439f2d7..745d5b8f405a 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -27,6 +27,9 @@ #include "xfs_quota.h" #include "xfs_qm.h" #include "xfs_defer.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_reflink.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -176,6 +179,16 @@ xrep_roll_ag_trans( return 0; } +/* Roll the scrub transaction, holding the primary metadata locked. */ +int +xrep_roll_trans( + struct xfs_scrub *sc) +{ + if (!sc->ip) + return xrep_roll_ag_trans(sc); + return xfs_trans_roll_inode(&sc->tp, sc->ip); +} + /* Finish all deferred work attached to the repair transaction. */ int xrep_defer_finish( @@ -673,6 +686,7 @@ xrep_find_ag_btree_roots( return error; } +#ifdef CONFIG_XFS_QUOTA /* Force a quotacheck the next time we mount. */ void xrep_force_quotacheck( @@ -699,10 +713,10 @@ xrep_force_quotacheck( * * This function ensures that the appropriate dquots are attached to an inode. * We cannot allow the dquot code to allocate an on-disk dquot block here - * because we're already in transaction context with the inode locked. The - * on-disk dquot should already exist anyway. If the quota code signals - * corruption or missing quota information, schedule quotacheck, which will - * repair corruptions in the quota metadata. + * because we're already in transaction context. The on-disk dquot should + * already exist anyway. If the quota code signals corruption or missing quota + * information, schedule quotacheck, which will repair corruptions in the quota + * metadata. */ int xrep_ino_dqattach( @@ -710,7 +724,10 @@ xrep_ino_dqattach( { int error; - error = xfs_qm_dqattach_locked(sc->ip, false); + ASSERT(sc->tp != NULL); + ASSERT(sc->ip != NULL); + + error = xfs_qm_dqattach(sc->ip); switch (error) { case -EFSBADCRC: case -EFSCORRUPTED: @@ -734,3 +751,367 @@ xrep_ino_dqattach( return error; } +#endif /* CONFIG_XFS_QUOTA */ + +/* + * Ensure that the inode being repaired is ready to handle a certain number of + * extents, or return EFSCORRUPTED. Caller must hold the ILOCK of the inode + * being repaired and have joined it to the scrub transaction. 
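The decision implemented just below is compact but easy to misread, so here it is as a plain model: if the fork already has room, nothing happens; if the inode already uses 64-bit extent counters, or the filesystem cannot enable them, the extent count is simply impossible and the fork is corrupt; otherwise the inode is upgraded to large extent counters and logged. The limits below are placeholders for what xfs_iext_max_nextents() computes per fork; the sketch is illustrative only.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum fix { FITS, UPGRADE_TO_NREXT64, CORRUPT };

static enum fix ensure_extent_count(uint64_t nextents, bool inode_has_nrext64,
				    bool fs_has_large_extent_counts,
				    uint64_t max_small, uint64_t max_large)
{
	uint64_t max = inode_has_nrext64 ? max_large : max_small;

	if (nextents <= max)
		return FITS;
	if (inode_has_nrext64)
		return CORRUPT;		/* already at the larger limit */
	if (!fs_has_large_extent_counts)
		return CORRUPT;		/* cannot grow the on-disk counter */
	if (nextents > max_large)
		return CORRUPT;		/* too big even for 64-bit counters */
	return UPGRADE_TO_NREXT64;	/* set XFS_DIFLAG2_NREXT64 and log the inode */
}

int main(void)
{
	/* placeholder limits, roughly a 32-bit vs 48-bit style split */
	printf("%d\n", ensure_extent_count(1ULL << 35, false, true,
					   (1ULL << 31) - 1, (1ULL << 47) - 1));
	return 0;
}

The interesting middle case is the one shown in main(): the count does not fit the small counter but does fit the large one, so repair widens the counter instead of declaring the fork corrupt.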
+ */ +int +xrep_ino_ensure_extent_count( + struct xfs_scrub *sc, + int whichfork, + xfs_extnum_t nextents) +{ + xfs_extnum_t max_extents; + bool inode_has_nrext64; + + inode_has_nrext64 = xfs_inode_has_large_extent_counts(sc->ip); + max_extents = xfs_iext_max_nextents(inode_has_nrext64, whichfork); + if (nextents <= max_extents) + return 0; + if (inode_has_nrext64) + return -EFSCORRUPTED; + if (!xfs_has_large_extent_counts(sc->mp)) + return -EFSCORRUPTED; + + max_extents = xfs_iext_max_nextents(true, whichfork); + if (nextents > max_extents) + return -EFSCORRUPTED; + + sc->ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return 0; +} + +/* + * Initialize all the btree cursors for an AG repair except for the btree that + * we're rebuilding. + */ +void +xrep_ag_btcur_init( + struct xfs_scrub *sc, + struct xchk_ag *sa) +{ + struct xfs_mount *mp = sc->mp; + + /* Set up a bnobt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT && + sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) { + sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag, XFS_BTNUM_BNO); + sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag, XFS_BTNUM_CNT); + } + + /* Set up a inobt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT && + sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) { + sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, + sa->agi_bp, XFS_BTNUM_INO); + if (xfs_has_finobt(mp)) + sa->fino_cur = xfs_inobt_init_cursor(sc->sa.pag, + sc->tp, sa->agi_bp, XFS_BTNUM_FINO); + } + + /* Set up a rmapbt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_RMAPBT && + xfs_has_rmapbt(mp)) + sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag); + + /* Set up a refcountbt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_REFCNTBT && + xfs_has_reflink(mp)) + sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, + sa->agf_bp, sc->sa.pag); +} + +/* + * Reinitialize the in-core AG state after a repair by rereading the AGF + * buffer. We had better get the same AGF buffer as the one that's attached + * to the scrub context. + */ +int +xrep_reinit_pagf( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag = sc->sa.pag; + struct xfs_buf *bp; + int error; + + ASSERT(pag); + ASSERT(xfs_perag_initialised_agf(pag)); + + clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); + error = xfs_alloc_read_agf(pag, sc->tp, 0, &bp); + if (error) + return error; + + if (bp != sc->sa.agf_bp) { + ASSERT(bp == sc->sa.agf_bp); + return -EFSCORRUPTED; + } + + return 0; +} + +/* + * Reinitialize the in-core AG state after a repair by rereading the AGI + * buffer. We had better get the same AGI buffer as the one that's attached + * to the scrub context. + */ +int +xrep_reinit_pagi( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag = sc->sa.pag; + struct xfs_buf *bp; + int error; + + ASSERT(pag); + ASSERT(xfs_perag_initialised_agi(pag)); + + clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); + error = xfs_ialloc_read_agi(pag, sc->tp, &bp); + if (error) + return error; + + if (bp != sc->sa.agi_bp) { + ASSERT(bp == sc->sa.agi_bp); + return -EFSCORRUPTED; + } + + return 0; +} + +/* + * Given an active reference to a perag structure, load AG headers and cursors. + * This should only be called to scan an AG while repairing file-based metadata. 
+ */ +int +xrep_ag_init( + struct xfs_scrub *sc, + struct xfs_perag *pag, + struct xchk_ag *sa) +{ + int error; + + ASSERT(!sa->pag); + + error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp); + if (error) + return error; + + error = xfs_alloc_read_agf(pag, sc->tp, 0, &sa->agf_bp); + if (error) + return error; + + /* Grab our own passive reference from the caller's ref. */ + sa->pag = xfs_perag_hold(pag); + xrep_ag_btcur_init(sc, sa); + return 0; +} + +/* Reinitialize the per-AG block reservation for the AG we just fixed. */ +int +xrep_reset_perag_resv( + struct xfs_scrub *sc) +{ + int error; + + if (!(sc->flags & XREP_RESET_PERAG_RESV)) + return 0; + + ASSERT(sc->sa.pag != NULL); + ASSERT(sc->ops->type == ST_PERAG); + ASSERT(sc->tp); + + sc->flags &= ~XREP_RESET_PERAG_RESV; + error = xfs_ag_resv_free(sc->sa.pag); + if (error) + goto out; + error = xfs_ag_resv_init(sc->sa.pag, sc->tp); + if (error == -ENOSPC) { + xfs_err(sc->mp, +"Insufficient free space to reset per-AG reservation for AG %u after repair.", + sc->sa.pag->pag_agno); + error = 0; + } + +out: + return error; +} + +/* Decide if we are going to call the repair function for a scrub type. */ +bool +xrep_will_attempt( + struct xfs_scrub *sc) +{ + /* Userspace asked us to rebuild the structure regardless. */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) + return true; + + /* Let debug users force us into the repair routines. */ + if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + return true; + + /* Metadata is corrupt or failed cross-referencing. */ + if (xchk_needs_repair(sc->sm)) + return true; + + return false; +} + +/* Try to fix some part of a metadata inode by calling another scrubber. */ +STATIC int +xrep_metadata_inode_subtype( + struct xfs_scrub *sc, + unsigned int scrub_type) +{ + __u32 smtype = sc->sm->sm_type; + __u32 smflags = sc->sm->sm_flags; + unsigned int sick_mask = sc->sick_mask; + int error; + + /* + * Let's see if the inode needs repair. We're going to open-code calls + * to the scrub and repair functions so that we can hang on to the + * resources that we already acquired instead of using the standard + * setup/teardown routines. + */ + sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + sc->sm->sm_type = scrub_type; + + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xchk_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xchk_bmap_data(sc); + break; + case XFS_SCRUB_TYPE_BMBTA: + error = xchk_bmap_attr(sc); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + } + if (error) + goto out; + + if (!xrep_will_attempt(sc)) + goto out; + + /* + * Repair some part of the inode. This will potentially join the inode + * to the transaction. + */ + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xrep_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xrep_bmap(sc, XFS_DATA_FORK, false); + break; + case XFS_SCRUB_TYPE_BMBTA: + error = xrep_bmap(sc, XFS_ATTR_FORK, false); + break; + } + if (error) + goto out; + + /* + * Finish all deferred intent items and then roll the transaction so + * that the inode will not be joined to the transaction when we exit + * the function. + */ + error = xfs_defer_finish(&sc->tp); + if (error) + goto out; + error = xfs_trans_roll(&sc->tp); + if (error) + goto out; + + /* + * Clear the corruption flags and re-check the metadata that we just + * repaired. 
+ */ + sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xchk_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xchk_bmap_data(sc); + break; + case XFS_SCRUB_TYPE_BMBTA: + error = xchk_bmap_attr(sc); + break; + } + if (error) + goto out; + + /* If corruption persists, the repair has failed. */ + if (xchk_needs_repair(sc->sm)) { + error = -EFSCORRUPTED; + goto out; + } +out: + sc->sick_mask = sick_mask; + sc->sm->sm_type = smtype; + sc->sm->sm_flags = smflags; + return error; +} + +/* + * Repair the ondisk forks of a metadata inode. The caller must ensure that + * sc->ip points to the metadata inode and the ILOCK is held on that inode. + * The inode must not be joined to the transaction before the call, and will + * not be afterwards. + */ +int +xrep_metadata_inode_forks( + struct xfs_scrub *sc) +{ + bool dirty = false; + int error; + + /* Repair the inode record and the data fork. */ + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE); + if (error) + return error; + + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD); + if (error) + return error; + + /* Make sure the attr fork looks ok before we delete it. */ + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA); + if (error) + return error; + + /* Clear the reflink flag since metadata never shares. */ + if (xfs_is_reflink_inode(sc->ip)) { + dirty = true; + xfs_trans_ijoin(sc->tp, sc->ip, 0); + error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp); + if (error) + return error; + } + + /* + * If we modified the inode, roll the transaction but don't rejoin the + * inode to the new transaction because xrep_bmap_data can do that. + */ + if (dirty) { + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + dirty = false; + } + + return 0; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 60d2a9ae5f2e..17114327e6fa 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -28,15 +28,28 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) /* Repair helpers */ int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run); +bool xrep_will_attempt(struct xfs_scrub *sc); void xrep_failure(struct xfs_mount *mp); int xrep_roll_ag_trans(struct xfs_scrub *sc); +int xrep_roll_trans(struct xfs_scrub *sc); int xrep_defer_finish(struct xfs_scrub *sc); bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, enum xfs_ag_resv_type type); xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc); +static inline int +xrep_trans_commit( + struct xfs_scrub *sc) +{ + int error = xfs_trans_commit(sc->tp); + + sc->tp = NULL; + return error; +} + struct xbitmap; struct xagb_bitmap; +struct xfsb_bitmap; int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); @@ -57,8 +70,35 @@ struct xrep_find_ag_btree { int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp, struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp); + +#ifdef CONFIG_XFS_QUOTA void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type); int xrep_ino_dqattach(struct xfs_scrub *sc); +#else +# define xrep_force_quotacheck(sc, type) ((void)0) +# define xrep_ino_dqattach(sc) (0) +#endif /* CONFIG_XFS_QUOTA */ + +int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork, + xfs_extnum_t nextents); +int xrep_reset_perag_resv(struct xfs_scrub *sc); +int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten); +int xrep_metadata_inode_forks(struct xfs_scrub *sc); + +/* Repair setup functions */ 
+int xrep_setup_ag_allocbt(struct xfs_scrub *sc); + +struct xfs_imap; +int xrep_setup_inode(struct xfs_scrub *sc, const struct xfs_imap *imap); + +void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); +int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag, + struct xchk_ag *sa); + +/* Metadata revalidators */ + +int xrep_revalidate_allocbt(struct xfs_scrub *sc); +int xrep_revalidate_iallocbt(struct xfs_scrub *sc); /* Metadata repairers */ @@ -67,9 +107,34 @@ int xrep_superblock(struct xfs_scrub *sc); int xrep_agf(struct xfs_scrub *sc); int xrep_agfl(struct xfs_scrub *sc); int xrep_agi(struct xfs_scrub *sc); +int xrep_allocbt(struct xfs_scrub *sc); +int xrep_iallocbt(struct xfs_scrub *sc); +int xrep_refcountbt(struct xfs_scrub *sc); +int xrep_inode(struct xfs_scrub *sc); +int xrep_bmap_data(struct xfs_scrub *sc); +int xrep_bmap_attr(struct xfs_scrub *sc); +int xrep_bmap_cow(struct xfs_scrub *sc); + +#ifdef CONFIG_XFS_RT +int xrep_rtbitmap(struct xfs_scrub *sc); +#else +# define xrep_rtbitmap xrep_notsupported +#endif /* CONFIG_XFS_RT */ + +#ifdef CONFIG_XFS_QUOTA +int xrep_quota(struct xfs_scrub *sc); +#else +# define xrep_quota xrep_notsupported +#endif /* CONFIG_XFS_QUOTA */ + +int xrep_reinit_pagf(struct xfs_scrub *sc); +int xrep_reinit_pagi(struct xfs_scrub *sc); #else +#define xrep_ino_dqattach(sc) (0) +#define xrep_will_attempt(sc) (false) + static inline int xrep_attempt( struct xfs_scrub *sc, @@ -87,11 +152,45 @@ xrep_calc_ag_resblks( return 0; } +static inline int +xrep_reset_perag_resv( + struct xfs_scrub *sc) +{ + if (!(sc->flags & XREP_RESET_PERAG_RESV)) + return 0; + + ASSERT(0); + return -EOPNOTSUPP; +} + +/* repair setup functions for no-repair */ +static inline int +xrep_setup_nothing( + struct xfs_scrub *sc) +{ + return 0; +} +#define xrep_setup_ag_allocbt xrep_setup_nothing + +#define xrep_setup_inode(sc, imap) ((void)0) + +#define xrep_revalidate_allocbt (NULL) +#define xrep_revalidate_iallocbt (NULL) + #define xrep_probe xrep_notsupported #define xrep_superblock xrep_notsupported #define xrep_agf xrep_notsupported #define xrep_agfl xrep_notsupported #define xrep_agi xrep_notsupported +#define xrep_allocbt xrep_notsupported +#define xrep_iallocbt xrep_notsupported +#define xrep_refcountbt xrep_notsupported +#define xrep_inode xrep_notsupported +#define xrep_bmap_data xrep_notsupported +#define xrep_bmap_attr xrep_notsupported +#define xrep_bmap_cow xrep_notsupported +#define xrep_rtbitmap xrep_notsupported +#define xrep_quota xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index d29a26ecddd6..c99d1714f283 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -24,6 +24,7 @@ #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" /* * Set us up to scrub reverse mapping btrees. diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 41a1d89ae8e6..441ca9977652 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -14,17 +14,33 @@ #include "xfs_rtbitmap.h" #include "xfs_inode.h" #include "xfs_bmap.h" +#include "xfs_bit.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/rtbitmap.h" /* Set us up with the realtime metadata locked. 
*/ int xchk_setup_rtbitmap( struct xfs_scrub *sc) { + struct xfs_mount *mp = sc->mp; + struct xchk_rtbitmap *rtb; int error; - error = xchk_trans_alloc(sc, 0); + rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS); + if (!rtb) + return -ENOMEM; + sc->buf = rtb; + + if (xchk_could_repair(sc)) { + error = xrep_setup_rtbitmap(sc, rtb); + if (error) + return error; + } + + error = xchk_trans_alloc(sc, rtb->resblks); if (error) return error; @@ -32,7 +48,22 @@ xchk_setup_rtbitmap( if (error) return error; + error = xchk_ino_dqattach(sc); + if (error) + return error; + xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); + + /* + * Now that we've locked the rtbitmap, we can't race with growfsrt + * trying to expand the bitmap or change the size of the rt volume. + * Hence it is safe to compute and check the geometry values. + */ + if (mp->m_sb.sb_rblocks) { + rtb->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); + rtb->rextslog = xfs_compute_rextslog(rtb->rextents); + rtb->rbmblocks = xfs_rtbitmap_blockcount(mp, rtb->rextents); + } return 0; } @@ -63,21 +94,30 @@ STATIC int xchk_rtbitmap_check_extents( struct xfs_scrub *sc) { - struct xfs_mount *mp = sc->mp; struct xfs_bmbt_irec map; - xfs_rtblock_t off; - int nmap; + struct xfs_iext_cursor icur; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + xfs_fileoff_t off = 0; + xfs_fileoff_t endoff; int error = 0; - for (off = 0; off < mp->m_sb.sb_rbmblocks;) { + /* Mappings may not cross or lie beyond EOF. */ + endoff = XFS_B_TO_FSB(mp, ip->i_disk_size); + if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff); + return 0; + } + + while (off < endoff) { + int nmap = 1; + if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) break; /* Make sure we have a written extent. */ - nmap = 1; - error = xfs_bmapi_read(mp->m_rbmip, off, - mp->m_sb.sb_rbmblocks - off, &map, &nmap, + error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap, XFS_DATA_FORK); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) break; @@ -98,12 +138,48 @@ int xchk_rtbitmap( struct xfs_scrub *sc) { + struct xfs_mount *mp = sc->mp; + struct xchk_rtbitmap *rtb = sc->buf; int error; - /* Is the size of the rtbitmap correct? */ - if (sc->mp->m_rbmip->i_disk_size != - XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) { - xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino); + /* Is sb_rextents correct? */ + if (mp->m_sb.sb_rextents != rtb->rextents) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* Is sb_rextslog correct? */ + if (mp->m_sb.sb_rextslog != rtb->rextslog) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* + * Is sb_rbmblocks large enough to handle the current rt volume? In no + * case can we exceed 4bn bitmap blocks since the super field is a u32. + */ + if (rtb->rbmblocks > U32_MAX) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* The bitmap file length must be aligned to an fsblock. */ + if (mp->m_rbmip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* + * Is the bitmap file itself large enough to handle the rt volume? + * growfsrt expands the bitmap file before updating sb_rextents, so the + * file can be larger than sb_rbmblocks. 
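The geometry cross-checks above compare the superblock's realtime fields against values recomputed from sb_rblocks while the rtbitmap is locked. As a rough illustration of that arithmetic, with hypothetical numbers and simplified stand-ins for xfs_rtb_to_rtx(), xfs_compute_rextslog() and xfs_rtbitmap_blockcount():

	/*
	 * Hypothetical rt volume: 1,000,000 rt blocks, a realtime extent size
	 * of 4 blocks, and 4096-byte filesystem blocks.  Illustrative values
	 * only; none of these numbers come from the patch.
	 */
	uint64_t	rblocks = 1000000;
	uint64_t	rextents = rblocks / 4;		/* 250,000 rt extents */
	unsigned int	rextslog = fls64(rextents) - 1;	/* floor(log2) == 17 */
	uint64_t	rbmblocks = DIV_ROUND_UP(rextents, 4096 * 8);
						/* 1 bit per extent -> 8 bitmap blocks */

A mismatch between any of these derived values and sb_rextents, sb_rextslog or sb_rbmblocks marks the bitmap inode corrupt, as xchk_rtbitmap() does above.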
+ */ + if (mp->m_rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); return 0; } @@ -116,12 +192,11 @@ xchk_rtbitmap( if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; - error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc); + error = xfs_rtalloc_query_all(mp, sc->tp, xchk_rtbitmap_rec, sc); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; + return error; -out: - return error; + return 0; } /* xref check that the extent is not free in the rtbitmap */ diff --git a/fs/xfs/scrub/rtbitmap.h b/fs/xfs/scrub/rtbitmap.h new file mode 100644 index 000000000000..85304ff019e1 --- /dev/null +++ b/fs/xfs/scrub/rtbitmap.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RTBITMAP_H__ +#define __XFS_SCRUB_RTBITMAP_H__ + +struct xchk_rtbitmap { + uint64_t rextents; + uint64_t rbmblocks; + unsigned int rextslog; + unsigned int resblks; +}; + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_setup_rtbitmap(struct xfs_scrub *sc, struct xchk_rtbitmap *rtb); +#else +# define xrep_setup_rtbitmap(sc, rtb) (0) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_RTBITMAP_H__ */ diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c new file mode 100644 index 000000000000..46f5d5f605c9 --- /dev/null +++ b/fs/xfs/scrub/rtbitmap_repair.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/rtbitmap.h" + +/* Set up to repair the realtime bitmap file metadata. */ +int +xrep_setup_rtbitmap( + struct xfs_scrub *sc, + struct xchk_rtbitmap *rtb) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long blocks = 0; + + /* + * Reserve enough blocks to write out a completely new bmbt for a + * maximally fragmented bitmap file. We do not hold the rtbitmap + * ILOCK yet, so this is entirely speculative. + */ + blocks = xfs_bmbt_calc_size(mp, mp->m_sb.sb_rbmblocks); + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + + rtb->resblks += blocks; + return 0; +} + +/* + * Make sure that the given range of the data fork of the realtime file is + * mapped to written blocks. The caller must ensure that the inode is joined + * to the transaction. + */ +STATIC int +xrep_rtbitmap_data_mappings( + struct xfs_scrub *sc, + xfs_filblks_t len) +{ + struct xfs_bmbt_irec map; + xfs_fileoff_t off = 0; + int error; + + ASSERT(sc->ip != NULL); + + while (off < len) { + int nmaps = 1; + + /* + * If we have a real extent mapping this block then we're + * in ok shape. + */ + error = xfs_bmapi_read(sc->ip, off, len - off, &map, &nmaps, + XFS_DATA_FORK); + if (error) + return error; + if (nmaps == 0) { + ASSERT(nmaps != 0); + return -EFSCORRUPTED; + } + + /* + * Written extents are ok. Holes are not filled because we + * do not know the freespace information. 
+ */ + if (xfs_bmap_is_written_extent(&map) || + map.br_startblock == HOLESTARTBLOCK) { + off = map.br_startoff + map.br_blockcount; + continue; + } + + /* + * If we find a delalloc reservation then something is very + * very wrong. Bail out. + */ + if (map.br_startblock == DELAYSTARTBLOCK) + return -EFSCORRUPTED; + + /* Make sure we're really converting an unwritten extent. */ + if (map.br_state != XFS_EXT_UNWRITTEN) { + ASSERT(map.br_state == XFS_EXT_UNWRITTEN); + return -EFSCORRUPTED; + } + + /* Make sure this block has a real zeroed extent mapped. */ + nmaps = 1; + error = xfs_bmapi_write(sc->tp, sc->ip, map.br_startoff, + map.br_blockcount, + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, + 0, &map, &nmaps); + if (error) + return error; + if (nmaps != 1) + return -EFSCORRUPTED; + + /* Commit new extent and all deferred work. */ + error = xrep_defer_finish(sc); + if (error) + return error; + + off = map.br_startoff + map.br_blockcount; + } + + return 0; +} + +/* Fix broken rt volume geometry. */ +STATIC int +xrep_rtbitmap_geometry( + struct xfs_scrub *sc, + struct xchk_rtbitmap *rtb) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = sc->tp; + + /* Superblock fields */ + if (mp->m_sb.sb_rextents != rtb->rextents) + xfs_trans_mod_sb(sc->tp, XFS_TRANS_SB_REXTENTS, + rtb->rextents - mp->m_sb.sb_rextents); + + if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, + rtb->rbmblocks - mp->m_sb.sb_rbmblocks); + + if (mp->m_sb.sb_rextslog != rtb->rextslog) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, + rtb->rextslog - mp->m_sb.sb_rextslog); + + /* Fix broken isize */ + sc->ip->i_disk_size = roundup_64(sc->ip->i_disk_size, + mp->m_sb.sb_blocksize); + + if (sc->ip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) + sc->ip->i_disk_size = XFS_FSB_TO_B(mp, rtb->rbmblocks); + + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return xrep_roll_trans(sc); +} + +/* Repair the realtime bitmap file metadata. */ +int +xrep_rtbitmap( + struct xfs_scrub *sc) +{ + struct xchk_rtbitmap *rtb = sc->buf; + struct xfs_mount *mp = sc->mp; + unsigned long long blocks = 0; + int error; + + /* Impossibly large rtbitmap means we can't touch the filesystem. */ + if (rtb->rbmblocks > U32_MAX) + return 0; + + /* + * If the size of the rt bitmap file is larger than what we reserved, + * figure out if we need to adjust the block reservation in the + * transaction. + */ + blocks = xfs_bmbt_calc_size(mp, rtb->rbmblocks); + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + if (blocks > rtb->resblks) { + error = xfs_trans_reserve_more(sc->tp, blocks, 0); + if (error) + return error; + + rtb->resblks += blocks; + } + + /* Fix inode core and forks. */ + error = xrep_metadata_inode_forks(sc); + if (error) + return error; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Ensure no unwritten extents. */ + error = xrep_rtbitmap_data_mappings(sc, rtb->rbmblocks); + if (error) + return error; + + /* Fix inconsistent bitmap geometry */ + return xrep_rtbitmap_geometry(sc, rtb); +} diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index 8b15c47408d0..fabd0ed9dfa6 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -31,6 +31,18 @@ * (potentially large) amount of data in pageable memory. */ +struct xchk_rtsummary { + struct xfs_rtalloc_args args; + + uint64_t rextents; + uint64_t rbmblocks; + uint64_t rsumsize; + unsigned int rsumlevels; + + /* Memory buffer for the summary comparison. */ + union xfs_suminfo_raw words[]; +}; + /* Set us up to check the rtsummary file. 
*/ int xchk_setup_rtsummary( @@ -38,8 +50,15 @@ xchk_setup_rtsummary( { struct xfs_mount *mp = sc->mp; char *descr; + struct xchk_rtsummary *rts; int error; + rts = kvzalloc(struct_size(rts, words, mp->m_blockwsize), + XCHK_GFP_FLAGS); + if (!rts) + return -ENOMEM; + sc->buf = rts; + /* * Create an xfile to construct a new rtsummary file. The xfile allows * us to avoid pinning kernel memory for this purpose. @@ -54,15 +73,14 @@ xchk_setup_rtsummary( if (error) return error; - /* Allocate a memory buffer for the summary comparison. */ - sc->buf = kvmalloc(mp->m_sb.sb_blocksize, XCHK_GFP_FLAGS); - if (!sc->buf) - return -ENOMEM; - error = xchk_install_live_inode(sc, mp->m_rsumip); if (error) return error; + error = xchk_ino_dqattach(sc); + if (error) + return error; + /* * Locking order requires us to take the rtbitmap first. We must be * careful to unlock it ourselves when we are done with the rtbitmap @@ -71,13 +89,29 @@ xchk_setup_rtsummary( */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); + + /* + * Now that we've locked the rtbitmap and rtsummary, we can't race with + * growfsrt trying to expand the summary or change the size of the rt + * volume. Hence it is safe to compute and check the geometry values. + */ + if (mp->m_sb.sb_rblocks) { + xfs_filblks_t rsumblocks; + int rextslog; + + rts->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); + rextslog = xfs_compute_rextslog(rts->rextents); + rts->rsumlevels = rextslog + 1; + rts->rbmblocks = xfs_rtbitmap_blockcount(mp, rts->rextents); + rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels, + rts->rbmblocks); + rts->rsumsize = XFS_FSB_TO_B(mp, rsumblocks); + } return 0; } /* Helper functions to record suminfo words in an xfile. */ -typedef unsigned int xchk_rtsumoff_t; - static inline int xfsum_load( struct xfs_scrub *sc, @@ -143,7 +177,7 @@ xchk_rtsum_record_free( /* Compute the relevant location in the rtsum file. */ rbmoff = xfs_rtx_to_rbmblock(mp, rec->ar_startext); - lenlog = XFS_RTBLOCKLOG(rec->ar_extcount); + lenlog = xfs_highbit64(rec->ar_extcount); offs = xfs_rtsumoffs(mp, lenlog, rbmoff); rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext); @@ -188,19 +222,29 @@ STATIC int xchk_rtsum_compare( struct xfs_scrub *sc) { - struct xfs_rtalloc_args args = { - .mp = sc->mp, - .tp = sc->tp, - }; - struct xfs_mount *mp = sc->mp; struct xfs_bmbt_irec map; - xfs_fileoff_t off; - xchk_rtsumoff_t sumoff = 0; - int nmap; + struct xfs_iext_cursor icur; + + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + struct xchk_rtsummary *rts = sc->buf; + xfs_fileoff_t off = 0; + xfs_fileoff_t endoff; + xfs_rtsumoff_t sumoff = 0; + int error = 0; + + rts->args.mp = sc->mp; + rts->args.tp = sc->tp; + + /* Mappings may not cross or lie beyond EOF. */ + endoff = XFS_B_TO_FSB(mp, ip->i_disk_size); + if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff); + return 0; + } - for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) { - union xfs_suminfo_raw *ondisk_info; - int error = 0; + while (off < endoff) { + int nmap = 1; if (xchk_should_terminate(sc, &error)) return error; @@ -208,8 +252,7 @@ xchk_rtsum_compare( return 0; /* Make sure we have a written extent. 
*/ - nmap = 1; - error = xfs_bmapi_read(mp->m_rsumip, off, 1, &map, &nmap, + error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap, XFS_DATA_FORK); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) return error; @@ -219,24 +262,33 @@ xchk_rtsum_compare( return 0; } + off += map.br_blockcount; + } + + for (off = 0; off < endoff; off++) { + union xfs_suminfo_raw *ondisk_info; + /* Read a block's worth of ondisk rtsummary file. */ - error = xfs_rtsummary_read_buf(&args, off); + error = xfs_rtsummary_read_buf(&rts->args, off); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) return error; /* Read a block's worth of computed rtsummary file. */ - error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize); + error = xfsum_copyout(sc, sumoff, rts->words, mp->m_blockwsize); if (error) { - xfs_rtbuf_cache_relse(&args); + xfs_rtbuf_cache_relse(&rts->args); return error; } - ondisk_info = xfs_rsumblock_infoptr(&args, 0); - if (memcmp(ondisk_info, sc->buf, - mp->m_blockwsize << XFS_WORDLOG) != 0) + ondisk_info = xfs_rsumblock_infoptr(&rts->args, 0); + if (memcmp(ondisk_info, rts->words, + mp->m_blockwsize << XFS_WORDLOG) != 0) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + xfs_rtbuf_cache_relse(&rts->args); + return error; + } - xfs_rtbuf_cache_relse(&args); + xfs_rtbuf_cache_relse(&rts->args); sumoff += mp->m_blockwsize; } @@ -249,8 +301,43 @@ xchk_rtsummary( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; + struct xchk_rtsummary *rts = sc->buf; int error = 0; + /* Is sb_rextents correct? */ + if (mp->m_sb.sb_rextents != rts->rextents) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + goto out_rbm; + } + + /* Is m_rsumlevels correct? */ + if (mp->m_rsumlevels != rts->rsumlevels) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* Is m_rsumsize correct? */ + if (mp->m_rsumsize != rts->rsumsize) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* The summary file length must be aligned to an fsblock. */ + if (mp->m_rsumip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* + * Is the summary file itself large enough to handle the rt volume? + * growfsrt expands the summary file before updating sb_rextents, so + * the file can be larger than rsumsize. + */ + if (mp->m_rsumip->i_disk_size < rts->rsumsize) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + /* Invoke the fork scrubber. 
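The rtsummary checks follow the same pattern: the summary geometry is recomputed from the now-locked rt volume size and compared against m_rsumlevels and m_rsumsize. Continuing the hypothetical numbers from the rtbitmap sketch earlier, and assuming one 32-bit summary word per (level, bitmap block) pair:

	unsigned int	rsumlevels = rextslog + 1;	/* 18 levels */
	uint64_t	rsumwords = (uint64_t)rsumlevels * rbmblocks;
						/* 18 * 8 == 144 words */
	uint64_t	rsumsize = roundup_64(rsumwords * sizeof(uint32_t), 4096);
						/* rounds up to one 4096-byte block */

This is only a sketch of what xfs_rtsummary_blockcount() computes; the scrubber itself relies on the real helpers rather than open-coding the math.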
*/ error = xchk_metadata_inode_forks(sc); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 4849efcaa33a..caf324c2b991 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -14,8 +14,6 @@ #include "xfs_inode.h" #include "xfs_quota.h" #include "xfs_qm.h" -#include "xfs_errortag.h" -#include "xfs_error.h" #include "xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -238,27 +236,31 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { [XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */ .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, - .scrub = xchk_bnobt, - .repair = xrep_notsupported, + .scrub = xchk_allocbt, + .repair = xrep_allocbt, + .repair_eval = xrep_revalidate_allocbt, }, [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */ .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, - .scrub = xchk_cntbt, - .repair = xrep_notsupported, + .scrub = xchk_allocbt, + .repair = xrep_allocbt, + .repair_eval = xrep_revalidate_allocbt, }, [XFS_SCRUB_TYPE_INOBT] = { /* inobt */ .type = ST_PERAG, .setup = xchk_setup_ag_iallocbt, - .scrub = xchk_inobt, - .repair = xrep_notsupported, + .scrub = xchk_iallocbt, + .repair = xrep_iallocbt, + .repair_eval = xrep_revalidate_iallocbt, }, [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */ .type = ST_PERAG, .setup = xchk_setup_ag_iallocbt, - .scrub = xchk_finobt, + .scrub = xchk_iallocbt, .has = xfs_has_finobt, - .repair = xrep_notsupported, + .repair = xrep_iallocbt, + .repair_eval = xrep_revalidate_iallocbt, }, [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */ .type = ST_PERAG, @@ -272,31 +274,31 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .setup = xchk_setup_ag_refcountbt, .scrub = xchk_refcountbt, .has = xfs_has_reflink, - .repair = xrep_notsupported, + .repair = xrep_refcountbt, }, [XFS_SCRUB_TYPE_INODE] = { /* inode record */ .type = ST_INODE, .setup = xchk_setup_inode, .scrub = xchk_inode, - .repair = xrep_notsupported, + .repair = xrep_inode, }, [XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */ .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_data, - .repair = xrep_notsupported, + .repair = xrep_bmap_data, }, [XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */ .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_attr, - .repair = xrep_notsupported, + .repair = xrep_bmap_attr, }, [XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */ .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_cow, - .repair = xrep_notsupported, + .repair = xrep_bmap_cow, }, [XFS_SCRUB_TYPE_DIR] = { /* directory */ .type = ST_INODE, @@ -326,33 +328,31 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_FS, .setup = xchk_setup_rtbitmap, .scrub = xchk_rtbitmap, - .has = xfs_has_realtime, - .repair = xrep_notsupported, + .repair = xrep_rtbitmap, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_FS, .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, - .has = xfs_has_realtime, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */ .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */ .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs 
summary counters */ .type = ST_FS, @@ -531,7 +531,10 @@ retry_op: /* Scrub for errors. */ check_start = xchk_stats_now(); - error = sc->ops->scrub(sc); + if ((sc->flags & XREP_ALREADY_FIXED) && sc->ops->repair_eval != NULL) + error = sc->ops->repair_eval(sc); + else + error = sc->ops->scrub(sc); run.scrub_ns += xchk_stats_elapsed_ns(check_start); if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) goto try_harder; @@ -542,23 +545,12 @@ retry_op: xchk_update_health(sc); - if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && - !(sc->flags & XREP_ALREADY_FIXED)) { - bool needs_fix = xchk_needs_repair(sc->sm); - - /* Userspace asked us to rebuild the structure regardless. */ - if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) - needs_fix = true; - - /* Let debug users force us into the repair routines. */ - if (XFS_TEST_ERROR(needs_fix, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) - needs_fix = true; - + if (xchk_could_repair(sc)) { /* * If userspace asked for a repair but it wasn't necessary, * report that back to userspace. */ - if (!needs_fix) { + if (!xrep_will_attempt(sc)) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED; goto out_nofix; } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 1ef9c6b4842a..7fc50654c4fe 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -35,6 +35,14 @@ struct xchk_meta_ops { /* Repair or optimize the metadata. */ int (*repair)(struct xfs_scrub *); + /* + * Re-scrub the metadata we repaired, in case there's extra work that + * we need to do to check our repair work. If this is NULL, we'll use + * the ->scrub function pointer, assuming that the regular scrub is + * sufficient. + */ + int (*repair_eval)(struct xfs_scrub *sc); + /* Decide if we even have this piece of metadata. */ bool (*has)(struct xfs_mount *); @@ -113,6 +121,7 @@ struct xfs_scrub { #define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */ #define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ #define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ +#define XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */ #define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ /* @@ -129,10 +138,8 @@ int xchk_superblock(struct xfs_scrub *sc); int xchk_agf(struct xfs_scrub *sc); int xchk_agfl(struct xfs_scrub *sc); int xchk_agi(struct xfs_scrub *sc); -int xchk_bnobt(struct xfs_scrub *sc); -int xchk_cntbt(struct xfs_scrub *sc); -int xchk_inobt(struct xfs_scrub *sc); -int xchk_finobt(struct xfs_scrub *sc); +int xchk_allocbt(struct xfs_scrub *sc); +int xchk_iallocbt(struct xfs_scrub *sc); int xchk_rmapbt(struct xfs_scrub *sc); int xchk_refcountbt(struct xfs_scrub *sc); int xchk_inode(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 38708fb9a5d7..ddff86713df3 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -12,8 +12,10 @@ #include "xfs_log_format.h" #include "xfs_inode.h" #include "xfs_symlink.h" +#include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/health.h" /* Set us up to scrub a symbolic link. */ int @@ -41,29 +43,37 @@ xchk_symlink( if (!S_ISLNK(VFS_I(ip)->i_mode)) return -ENOENT; + + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_SYMLINK_ZAPPED)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); len = ip->i_disk_size; /* Plausible size? 
*/ if (len > XFS_SYMLINK_MAXLEN || len <= 0) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } /* Inline symlink? */ if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { if (len > xfs_inode_data_fork_size(ip) || - len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip))) + len > strnlen(ifp->if_data, xfs_inode_data_fork_size(ip))) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } /* Remote symlink; must read the contents. */ error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; + return error; if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); -out: - return error; + + /* If a remote symlink is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_SYMLINK_ZAPPED); + return 0; } diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 29afa4851235..d0e24ffaf754 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -14,9 +14,12 @@ #include "xfs_btree.h" #include "xfs_ag.h" #include "xfs_rtbitmap.h" +#include "xfs_quota.h" +#include "xfs_quota_defs.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" +#include "scrub/quota.h" /* Figure out which block the btree cursor was pointing to. */ static inline xfs_fsblock_t diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 4a8bc6f3c8f2..6bbb4e8639dc 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -19,6 +19,7 @@ struct xfile; struct xfarray; struct xfarray_sortinfo; +struct xchk_dqiter; /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -106,6 +107,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \ { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ { XCHK_NEED_DRAIN, "need_drain" }, \ + { XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \ { XREP_ALREADY_FIXED, "already_fixed" } DECLARE_EVENT_CLASS(xchk_class, @@ -347,6 +349,54 @@ DEFINE_EVENT(xchk_fblock_error_class, name, \ DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_error); DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_warning); +#ifdef CONFIG_XFS_QUOTA +DECLARE_EVENT_CLASS(xchk_dqiter_class, + TP_PROTO(struct xchk_dqiter *cursor, uint64_t id), + TP_ARGS(cursor, id), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_dqtype_t, dqtype) + __field(xfs_ino_t, ino) + __field(unsigned long long, cur_id) + __field(unsigned long long, id) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + ), + TP_fast_assign( + __entry->dev = cursor->sc->ip->i_mount->m_super->s_dev; + __entry->dqtype = cursor->dqtype; + __entry->ino = cursor->quota_ip->i_ino; + __entry->cur_id = cursor->id; + __entry->startoff = cursor->bmap.br_startoff; + __entry->startblock = cursor->bmap.br_startblock; + __entry->blockcount = cursor->bmap.br_blockcount; + __entry->state = cursor->bmap.br_state; + __entry->id = id; + ), + TP_printk("dev %d:%d dquot type %s ino 0x%llx cursor_id 0x%llx startoff 0x%llx startblock 0x%llx blockcount 0x%llx state %u id 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->dqtype, XFS_DQTYPE_STRINGS), + __entry->ino, + __entry->cur_id, + __entry->startoff, + __entry->startblock, + __entry->blockcount, + __entry->state, + __entry->id) +); + +#define DEFINE_SCRUB_DQITER_EVENT(name) \ +DEFINE_EVENT(xchk_dqiter_class, name, \ + TP_PROTO(struct xchk_dqiter *cursor, uint64_t id), \ 
+ TP_ARGS(cursor, id)) +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_revalidate_bmap); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_bmap); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_incore); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter); +#endif /* CONFIG_XFS_QUOTA */ + TRACE_EVENT(xchk_incomplete, TP_PROTO(struct xfs_scrub *sc, void *ret_ip), TP_ARGS(sc, ret_ip), @@ -1172,37 +1222,125 @@ DEFINE_EVENT(xrep_rmap_class, name, \ xfs_agblock_t agbno, xfs_extlen_t len, \ uint64_t owner, uint64_t offset, unsigned int flags), \ TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) -DEFINE_REPAIR_RMAP_EVENT(xrep_alloc_extent_fn); -DEFINE_REPAIR_RMAP_EVENT(xrep_ialloc_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap); DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn); -DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap); -TRACE_EVENT(xrep_refcount_extent_fn, +TRACE_EVENT(xrep_abt_found, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - struct xfs_refcount_irec *irec), - TP_ARGS(mp, agno, irec), + const struct xfs_alloc_rec_incore *rec), + TP_ARGS(mp, agno, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) - __field(xfs_nlink_t, refcount) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; - __entry->startblock = irec->rc_startblock; - __entry->blockcount = irec->rc_blockcount; - __entry->refcount = irec->rc_refcount; + __entry->startblock = rec->ar_startblock; + __entry->blockcount = rec->ar_blockcount; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startblock, + __entry->blockcount) +) + +TRACE_EVENT(xrep_ibt_found, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + const struct xfs_inobt_rec_incore *rec), + TP_ARGS(mp, agno, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(uint16_t, holemask) + __field(uint8_t, count) + __field(uint8_t, freecount) + __field(uint64_t, freemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = rec->ir_startino; + __entry->holemask = rec->ir_holemask; + __entry->count = rec->ir_count; + __entry->freecount = rec->ir_freecount; + __entry->freemask = rec->ir_free; + ), + TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x count 0x%x freecount 0x%x freemask 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startino, + __entry->holemask, + __entry->count, + __entry->freecount, + __entry->freemask) +) + +TRACE_EVENT(xrep_refc_found, + TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *rec), + TP_ARGS(pag, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->domain = rec->rc_domain; + __entry->startblock = rec->rc_startblock; + __entry->blockcount = rec->rc_blockcount; + __entry->refcount = rec->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + 
__print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount) ) +TRACE_EVENT(xrep_bmap_found, + TP_PROTO(struct xfs_inode *ip, int whichfork, + struct xfs_bmbt_irec *irec), + TP_ARGS(ip, whichfork, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(xfs_fileoff_t, lblk) + __field(xfs_filblks_t, len) + __field(xfs_fsblock_t, pblk) + __field(int, state) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->whichfork = whichfork; + __entry->lblk = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->pblk = irec->br_startblock; + __entry->state = irec->br_state; + ), + TP_printk("dev %d:%d ino 0x%llx whichfork %s fileoff 0x%llx fsbcount 0x%llx startblock 0x%llx state %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __entry->lblk, + __entry->len, + __entry->pblk, + __entry->state) +); + TRACE_EVENT(xrep_findroot_block, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, uint32_t magic, uint16_t level), @@ -1299,39 +1437,327 @@ TRACE_EVENT(xrep_reset_counters, MAJOR(__entry->dev), MINOR(__entry->dev)) ) -TRACE_EVENT(xrep_ialloc_insert, +DECLARE_EVENT_CLASS(xrep_newbt_extent_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t startino, uint16_t holemask, uint8_t count, - uint8_t freecount, uint64_t freemask), - TP_ARGS(mp, agno, startino, holemask, count, freecount, freemask), + xfs_agblock_t agbno, xfs_extlen_t len, + int64_t owner), + TP_ARGS(mp, agno, agbno, len, owner), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) - __field(xfs_agino_t, startino) - __field(uint16_t, holemask) - __field(uint8_t, count) - __field(uint8_t, freecount) - __field(uint64_t, freemask) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(int64_t, owner) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; - __entry->startino = startino; - __entry->holemask = holemask; - __entry->count = count; - __entry->freecount = freecount; - __entry->freemask = freemask; + __entry->agbno = agbno; + __entry->len = len; + __entry->owner = owner; ), - TP_printk("dev %d:%d agno 0x%x startino 0x%x holemask 0x%x count %u freecount %u freemask 0x%llx", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, - __entry->startino, - __entry->holemask, - __entry->count, - __entry->freecount, - __entry->freemask) + __entry->agbno, + __entry->len, + __entry->owner) +); +#define DEFINE_NEWBT_EXTENT_EVENT(name) \ +DEFINE_EVENT(xrep_newbt_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len, \ + int64_t owner), \ + TP_ARGS(mp, agno, agbno, len, owner)) +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_ag_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_claim_block); + +DECLARE_EVENT_CLASS(xrep_dinode_class, + TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip), + TP_ARGS(sc, dip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(uint16_t, mode) + __field(uint8_t, version) + __field(uint8_t, format) + __field(uint32_t, uid) + __field(uint32_t, gid) + __field(uint64_t, size) + __field(uint64_t, nblocks) + 
__field(uint32_t, extsize) + __field(uint32_t, nextents) + __field(uint16_t, anextents) + __field(uint8_t, forkoff) + __field(uint8_t, aformat) + __field(uint16_t, flags) + __field(uint32_t, gen) + __field(uint64_t, flags2) + __field(uint32_t, cowextsize) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->mode = be16_to_cpu(dip->di_mode); + __entry->version = dip->di_version; + __entry->format = dip->di_format; + __entry->uid = be32_to_cpu(dip->di_uid); + __entry->gid = be32_to_cpu(dip->di_gid); + __entry->size = be64_to_cpu(dip->di_size); + __entry->nblocks = be64_to_cpu(dip->di_nblocks); + __entry->extsize = be32_to_cpu(dip->di_extsize); + __entry->nextents = be32_to_cpu(dip->di_nextents); + __entry->anextents = be16_to_cpu(dip->di_anextents); + __entry->forkoff = dip->di_forkoff; + __entry->aformat = dip->di_aformat; + __entry->flags = be16_to_cpu(dip->di_flags); + __entry->gen = be32_to_cpu(dip->di_gen); + __entry->flags2 = be64_to_cpu(dip->di_flags2); + __entry->cowextsize = be32_to_cpu(dip->di_cowextsize); + ), + TP_printk("dev %d:%d ino 0x%llx mode 0x%x version %u format %u uid %u gid %u disize 0x%llx nblocks 0x%llx extsize %u nextents %u anextents %u forkoff 0x%x aformat %u flags 0x%x gen 0x%x flags2 0x%llx cowextsize %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->mode, + __entry->version, + __entry->format, + __entry->uid, + __entry->gid, + __entry->size, + __entry->nblocks, + __entry->extsize, + __entry->nextents, + __entry->anextents, + __entry->forkoff, + __entry->aformat, + __entry->flags, + __entry->gen, + __entry->flags2, + __entry->cowextsize) ) +#define DEFINE_REPAIR_DINODE_EVENT(name) \ +DEFINE_EVENT(xrep_dinode_class, name, \ + TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip), \ + TP_ARGS(sc, dip)) +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_header); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_mode); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_flags); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_size); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_extsize_hints); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_symlink); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dir); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_fixed); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_forks); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dfork); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_afork); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_ensure_forkoff); + +DECLARE_EVENT_CLASS(xrep_inode_class, + TP_PROTO(struct xfs_scrub *sc), + TP_ARGS(sc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_rfsblock_t, nblocks) + __field(uint16_t, flags) + __field(uint64_t, flags2) + __field(uint32_t, nextents) + __field(uint8_t, format) + __field(uint32_t, anextents) + __field(uint8_t, aformat) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->size = sc->ip->i_disk_size; + __entry->nblocks = sc->ip->i_nblocks; + __entry->flags = sc->ip->i_diflags; + __entry->flags2 = sc->ip->i_diflags2; + __entry->nextents = sc->ip->i_df.if_nextents; + __entry->format = sc->ip->i_df.if_format; + __entry->anextents = sc->ip->i_af.if_nextents; + __entry->aformat = sc->ip->i_af.if_format; + ), + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx nblocks 0x%llx flags 0x%x flags2 0x%llx nextents %u format %u anextents %u aformat %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->nblocks, + __entry->flags, + 
__entry->flags2, + __entry->nextents, + __entry->format, + __entry->anextents, + __entry->aformat) +) + +#define DEFINE_REPAIR_INODE_EVENT(name) \ +DEFINE_EVENT(xrep_inode_class, name, \ + TP_PROTO(struct xfs_scrub *sc), \ + TP_ARGS(sc)) +DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockcounts); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_ids); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_flags); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockdir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_sfdir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_dir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_fixed); + +TRACE_EVENT(xrep_dinode_count_rmaps, + TP_PROTO(struct xfs_scrub *sc, xfs_rfsblock_t data_blocks, + xfs_rfsblock_t rt_blocks, xfs_rfsblock_t attr_blocks, + xfs_extnum_t data_extents, xfs_extnum_t rt_extents, + xfs_aextnum_t attr_extents), + TP_ARGS(sc, data_blocks, rt_blocks, attr_blocks, data_extents, + rt_extents, attr_extents), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_rfsblock_t, data_blocks) + __field(xfs_rfsblock_t, rt_blocks) + __field(xfs_rfsblock_t, attr_blocks) + __field(xfs_extnum_t, data_extents) + __field(xfs_extnum_t, rt_extents) + __field(xfs_aextnum_t, attr_extents) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->data_blocks = data_blocks; + __entry->rt_blocks = rt_blocks; + __entry->attr_blocks = attr_blocks; + __entry->data_extents = data_extents; + __entry->rt_extents = rt_extents; + __entry->attr_extents = attr_extents; + ), + TP_printk("dev %d:%d ino 0x%llx dblocks 0x%llx rtblocks 0x%llx ablocks 0x%llx dextents %llu rtextents %llu aextents %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->data_blocks, + __entry->rt_blocks, + __entry->attr_blocks, + __entry->data_extents, + __entry->rt_extents, + __entry->attr_extents) +); + +TRACE_EVENT(xrep_cow_mark_file_range, + TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t startblock, + xfs_fileoff_t startoff, xfs_filblks_t blockcount), + TP_ARGS(ip, startblock, startoff, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, startblock) + __field(xfs_fileoff_t, startoff) + __field(xfs_filblks_t, blockcount) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->startoff = startoff; + __entry->startblock = startblock; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d ino 0x%llx fileoff 0x%llx startblock 0x%llx fsbcount 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->startoff, + __entry->startblock, + __entry->blockcount) +); + +TRACE_EVENT(xrep_cow_replace_mapping, + TP_PROTO(struct xfs_inode *ip, const struct xfs_bmbt_irec *irec, + xfs_fsblock_t new_startblock, xfs_extlen_t new_blockcount), + TP_ARGS(ip, irec, new_startblock, new_blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, startblock) + __field(xfs_fileoff_t, startoff) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(xfs_fsblock_t, new_startblock) + __field(xfs_extlen_t, new_blockcount) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->startoff = irec->br_startoff; + __entry->startblock = irec->br_startblock; + __entry->blockcount = irec->br_blockcount; + __entry->state = irec->br_state; + __entry->new_startblock = new_startblock; + __entry->new_blockcount = new_blockcount; + ), + 
TP_printk("dev %d:%d ino 0x%llx startoff 0x%llx startblock 0x%llx fsbcount 0x%llx state 0x%x new_startblock 0x%llx new_fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->startoff, + __entry->startblock, + __entry->blockcount, + __entry->state, + __entry->new_startblock, + __entry->new_blockcount) +); + +TRACE_EVENT(xrep_cow_free_staging, + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, + xfs_extlen_t blockcount), + TP_ARGS(pag, agbno, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, blockcount) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->agbno = agbno; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->blockcount) +); + +#ifdef CONFIG_XFS_QUOTA +DECLARE_EVENT_CLASS(xrep_dquot_class, + TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id), + TP_ARGS(mp, type, id), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint8_t, type) + __field(uint32_t, id) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->id = id; + __entry->type = type; + ), + TP_printk("dev %d:%d type %s id 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS), + __entry->id) +); + +#define DEFINE_XREP_DQUOT_EVENT(name) \ +DEFINE_EVENT(xrep_dquot_class, name, \ + TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id), \ + TP_ARGS(mp, type, id)) +DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item); +DEFINE_XREP_DQUOT_EVENT(xrep_disk_dquot); +DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item_fill_bmap_hole); +#endif /* CONFIG_XFS_QUOTA */ + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h index 4ecac01363d9..62b9c506fdd1 100644 --- a/fs/xfs/scrub/xfarray.h +++ b/fs/xfs/scrub/xfarray.h @@ -54,6 +54,28 @@ static inline int xfarray_append(struct xfarray *array, const void *ptr) uint64_t xfarray_length(struct xfarray *array); int xfarray_load_next(struct xfarray *array, xfarray_idx_t *idx, void *rec); +/* + * Iterate the non-null elements in a sparse xfarray. Callers should + * initialize *idx to XFARRAY_CURSOR_INIT before the first call; on return, it + * will be set to one more than the index of the record that was retrieved. + * Returns 1 if a record was retrieved, 0 if there weren't any more records, or + * a negative errno. + */ +static inline int +xfarray_iter( + struct xfarray *array, + xfarray_idx_t *idx, + void *rec) +{ + int ret = xfarray_load_next(array, idx, rec); + + if (ret == -ENODATA) + return 0; + if (ret == 0) + return 1; + return ret; +} + /* Declarations for xfile array sort functionality. 
*/ typedef cmp_func_t xfarray_cmp_fn; diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 36fe2abb16e6..9e02111bd890 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -33,8 +33,6 @@ struct kmem_cache *xfs_attrd_cache; static const struct xfs_item_ops xfs_attri_item_ops; static const struct xfs_item_ops xfs_attrd_item_ops; -static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp, - struct xfs_attri_log_item *attrip); static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip) { @@ -310,47 +308,6 @@ xfs_attrd_item_intent( return &ATTRD_ITEM(lip)->attrd_attrip->attri_item; } -/* - * Performs one step of an attribute update intent and marks the attrd item - * dirty.. An attr operation may be a set or a remove. Note that the - * transaction is marked dirty regardless of whether the operation succeeds or - * fails to support the ATTRI/ATTRD lifecycle rules. - */ -STATIC int -xfs_xattri_finish_update( - struct xfs_attr_intent *attr, - struct xfs_attrd_log_item *attrdp) -{ - struct xfs_da_args *args = attr->xattri_da_args; - int error; - - if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { - error = -EIO; - goto out; - } - - error = xfs_attr_set_iter(attr); - if (!error && attr->xattri_dela_state != XFS_DAS_DONE) - error = -EAGAIN; -out: - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the ATTRI and frees the ATTRD - * 2.) shuts down the filesystem - */ - args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - - /* - * attr intent/done items are null when logged attributes are disabled - */ - if (attrdp) - set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); - - return error; -} - /* Log an attr to the intent item. */ STATIC void xfs_attr_log_item( @@ -360,9 +317,6 @@ xfs_attr_log_item( { struct xfs_attri_log_format *attrp; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags); - /* * At this point the xfs_attr_intent has been constructed, and we've * created the log intent. Fill in the attri log item and log format @@ -419,7 +373,6 @@ xfs_attr_create_intent( } attrip = xfs_attri_init(mp, attr->xattri_nameval); - xfs_trans_add_item(tp, &attrip->attri_item); xfs_attr_log_item(tp, attrip, attr); return &attrip->attri_item; @@ -447,23 +400,33 @@ xfs_attr_finish_item( struct xfs_btree_cur **state) { struct xfs_attr_intent *attr; - struct xfs_attrd_log_item *done_item = NULL; + struct xfs_da_args *args; int error; attr = container_of(item, struct xfs_attr_intent, xattri_list); - if (done) - done_item = ATTRD_ITEM(done); + args = attr->xattri_da_args; - /* - * Always reset trans after EAGAIN cycle - * since the transaction is new - */ - attr->xattri_da_args->trans = tp; + /* Reset trans after EAGAIN cycle since the transaction is new */ + args->trans = tp; - error = xfs_xattri_finish_update(attr, done_item); - if (error != -EAGAIN) - xfs_attr_free_item(attr); + if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { + error = -EIO; + goto out; + } + /* If an attr removal is trivially complete, we're done. 
*/ + if (attr->xattri_op_flags == XFS_ATTRI_OP_FLAGS_REMOVE && + !xfs_inode_hasattr(args->dp)) { + error = 0; + goto out; + } + + error = xfs_attr_set_iter(attr); + if (!error && attr->xattri_dela_state != XFS_DAS_DONE) + return -EAGAIN; + +out: + xfs_attr_free_item(attr); return error; } @@ -532,41 +495,22 @@ xfs_attri_validate( return xfs_verify_ino(mp, attrp->alfi_ino); } -/* - * Process an attr intent item that was recovered from the log. We need to - * delete the attr that it describes. - */ -STATIC int -xfs_attri_item_recover( - struct xfs_log_item *lip, - struct list_head *capture_list) +static inline struct xfs_attr_intent * +xfs_attri_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_attri_log_format *attrp, + struct xfs_inode **ipp, + struct xfs_attri_log_nameval *nv) { - struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); struct xfs_attr_intent *attr; - struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_inode *ip; struct xfs_da_args *args; - struct xfs_trans *tp; - struct xfs_trans_res resv; - struct xfs_attri_log_format *attrp; - struct xfs_attri_log_nameval *nv = attrip->attri_nameval; - int error; - int total; int local; - struct xfs_attrd_log_item *done_item = NULL; - - /* - * First check the validity of the attr described by the ATTRI. If any - * are bad, then assume that all are bad and just toss the ATTRI. - */ - attrp = &attrip->attri_format; - if (!xfs_attri_validate(mp, attrp) || - !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) - return -EFSCORRUPTED; + int error; - error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); + error = xlog_recover_iget(mp, attrp->alfi_ino, ipp); if (error) - return error; + return ERR_PTR(error); attr = kmem_zalloc(sizeof(struct xfs_attr_intent) + sizeof(struct xfs_da_args), KM_NOFS); @@ -584,7 +528,7 @@ xfs_attri_item_recover( attr->xattri_nameval = xfs_attri_log_nameval_get(nv); ASSERT(attr->xattri_nameval); - args->dp = ip; + args->dp = *ipp; args->geo = mp->m_attr_geo; args->whichfork = XFS_ATTR_FORK; args->name = nv->name.i_addr; @@ -608,43 +552,65 @@ xfs_attri_item_recover( attr->xattri_dela_state = xfs_attr_init_add_state(args); break; case XFS_ATTRI_OP_FLAGS_REMOVE: - if (!xfs_inode_hasattr(args->dp)) - goto out; attr->xattri_dela_state = xfs_attr_init_remove_state(args); break; - default: - ASSERT(0); - error = -EFSCORRUPTED; - goto out; } + xfs_defer_add_item(dfp, &attr->xattri_list); + return attr; +} + +/* + * Process an attr intent item that was recovered from the log. We need to + * delete the attr that it describes. + */ +STATIC int +xfs_attr_recover_work( + struct xfs_defer_pending *dfp, + struct list_head *capture_list) +{ + struct xfs_log_item *lip = dfp->dfp_intent; + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + struct xfs_attr_intent *attr; + struct xfs_mount *mp = lip->li_log->l_mp; + struct xfs_inode *ip; + struct xfs_da_args *args; + struct xfs_trans *tp; + struct xfs_trans_res resv; + struct xfs_attri_log_format *attrp; + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; + int error; + int total; + + /* + * First check the validity of the attr described by the ATTRI. If any + * are bad, then assume that all are bad and just toss the ATTRI. 
+ */ + attrp = &attrip->attri_format; + if (!xfs_attri_validate(mp, attrp) || + !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) + return -EFSCORRUPTED; + + attr = xfs_attri_recover_work(mp, dfp, attrp, &ip, nv); + if (IS_ERR(attr)) + return PTR_ERR(attr); + args = attr->xattri_da_args; + xfs_init_attr_trans(args, &resv, &total); resv = xlog_recover_resv(&resv); error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp); if (error) - goto out; - + return error; args->trans = tp; - done_item = xfs_trans_get_attrd(tp, attrip); xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_xattri_finish_update(attr, done_item); - if (error == -EAGAIN) { - /* - * There's more work to do, so add the intent item to this - * transaction so that we can continue it later. - */ - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list); - error = xfs_defer_ops_capture_and_commit(tp, capture_list); - if (error) - goto out_unlock; - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_irele(ip); - return 0; - } + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &attrip->attri_format, + sizeof(attrip->attri_format)); if (error) { xfs_trans_cancel(tp); goto out_unlock; @@ -654,18 +620,16 @@ xfs_attri_item_recover( out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_irele(ip); -out: - xfs_attr_free_item(attr); return error; } /* Re-log an intent item to push the log tail forward. */ static struct xfs_log_item * -xfs_attri_item_relog( +xfs_attr_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_attrd_log_item *attrdp; struct xfs_attri_log_item *old_attrip; struct xfs_attri_log_item *new_attrip; struct xfs_attri_log_format *new_attrp; @@ -674,10 +638,6 @@ xfs_attri_item_relog( old_attrip = ATTRI_ITEM(intent); old_attrp = &old_attrip->attri_format; - tp->t_flags |= XFS_TRANS_DIRTY; - attrdp = xfs_trans_get_attrd(tp, old_attrip); - set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); - /* * Create a new log item that shares the same name/value buffer as the * old log item. @@ -691,12 +651,43 @@ xfs_attri_item_relog( new_attrp->alfi_name_len = old_attrp->alfi_name_len; new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter; - xfs_trans_add_item(tp, &new_attrip->attri_item); - set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags); - return &new_attrip->attri_item; } +/* Get an ATTRD so we can process all the attrs. 
*/ +static struct xfs_log_item * +xfs_attr_create_done( + struct xfs_trans *tp, + struct xfs_log_item *intent, + unsigned int count) +{ + struct xfs_attri_log_item *attrip; + struct xfs_attrd_log_item *attrdp; + + attrip = ATTRI_ITEM(intent); + + attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); + + xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD, + &xfs_attrd_item_ops); + attrdp->attrd_attrip = attrip; + attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; + + return &attrdp->attrd_item; +} + +const struct xfs_defer_op_type xfs_attr_defer_type = { + .name = "attr", + .max_items = 1, + .create_intent = xfs_attr_create_intent, + .abort_intent = xfs_attr_abort_intent, + .create_done = xfs_attr_create_done, + .finish_item = xfs_attr_finish_item, + .cancel_item = xfs_attr_cancel_item, + .recover_work = xfs_attr_recover_work, + .relog_intent = xfs_attr_relog_intent, +}; + STATIC int xlog_recover_attri_commit_pass2( struct xlog *log, @@ -767,63 +758,13 @@ xlog_recover_attri_commit_pass2( attrip = xfs_attri_init(mp, nv); memcpy(&attrip->attri_format, attri_formatp, len); - /* - * The ATTRI has two references. One for the ATTRD and one for ATTRI to - * ensure it makes it into the AIL. Insert the ATTRI into the AIL - * directly and drop the ATTRI reference. Note that - * xfs_trans_ail_update() drops the AIL lock. - */ - xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn); - xfs_attri_release(attrip); + xlog_recover_intent_item(log, &attrip->attri_item, lsn, + &xfs_attr_defer_type); xfs_attri_log_nameval_put(nv); return 0; } /* - * This routine is called to allocate an "attr free done" log item. - */ -static struct xfs_attrd_log_item * -xfs_trans_get_attrd(struct xfs_trans *tp, - struct xfs_attri_log_item *attrip) -{ - struct xfs_attrd_log_item *attrdp; - - ASSERT(tp != NULL); - - attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); - - xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD, - &xfs_attrd_item_ops); - attrdp->attrd_attrip = attrip; - attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; - - xfs_trans_add_item(tp, &attrdp->attrd_item); - return attrdp; -} - -/* Get an ATTRD so we can process all the attrs. */ -static struct xfs_log_item * -xfs_attr_create_done( - struct xfs_trans *tp, - struct xfs_log_item *intent, - unsigned int count) -{ - if (!intent) - return NULL; - - return &xfs_trans_get_attrd(tp, ATTRI_ITEM(intent))->attrd_item; -} - -const struct xfs_defer_op_type xfs_attr_defer_type = { - .max_items = 1, - .create_intent = xfs_attr_create_intent, - .abort_intent = xfs_attr_abort_intent, - .create_done = xfs_attr_create_done, - .finish_item = xfs_attr_finish_item, - .cancel_item = xfs_attr_cancel_item, -}; - -/* * This routine is called when an ATTRD format structure is found in a committed * transaction in the log. Its purpose is to cancel the corresponding ATTRI if * it was still in the log. 
To do this it searches the AIL for the ATTRI with @@ -857,9 +798,7 @@ static const struct xfs_item_ops xfs_attri_item_ops = { .iop_format = xfs_attri_item_format, .iop_unpin = xfs_attri_item_unpin, .iop_release = xfs_attri_item_release, - .iop_recover = xfs_attri_item_recover, .iop_match = xfs_attri_item_match, - .iop_relog = xfs_attri_item_relog, }; const struct xlog_recover_item_ops xlog_attri_item_ops = { diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 99bbbe1a0e44..e368ad671e26 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -56,14 +56,13 @@ xfs_attr_shortform_list( struct xfs_attrlist_cursor_kern *cursor = &context->cursor; struct xfs_inode *dp = context->dp; struct xfs_attr_sf_sort *sbuf, *sbp; - struct xfs_attr_shortform *sf; + struct xfs_attr_sf_hdr *sf = dp->i_af.if_data; struct xfs_attr_sf_entry *sfe; int sbsize, nsbuf, count, i; int error = 0; - sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; ASSERT(sf != NULL); - if (!sf->hdr.count) + if (!sf->count) return 0; trace_xfs_attr_list_sf(context); @@ -79,8 +78,8 @@ xfs_attr_shortform_list( */ if (context->bufsize == 0 || (XFS_ISRESET_CURSOR(cursor) && - (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) { - for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + (dp->i_af.if_bytes + sf->count * 16) < context->bufsize)) { + for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) { if (XFS_IS_CORRUPT(context->dp->i_mount, !xfs_attr_namecheck(sfe->nameval, sfe->namelen))) @@ -109,7 +108,7 @@ xfs_attr_shortform_list( /* * It didn't all fit, so we have to sort everything on hashval. */ - sbsize = sf->hdr.count * sizeof(*sbuf); + sbsize = sf->count * sizeof(*sbuf); sbp = sbuf = kmem_alloc(sbsize, KM_NOFS); /* @@ -117,7 +116,7 @@ xfs_attr_shortform_list( * the relevant info from only those that match into a buffer. */ nsbuf = 0; - for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) { if (unlikely( ((char *)sfe < (char *)sf) || ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) { diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e736a0844c89..52fb8a148b7d 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -221,51 +221,6 @@ static const struct xfs_item_ops xfs_bud_item_ops = { .iop_intent = xfs_bud_item_intent, }; -static struct xfs_bud_log_item * -xfs_trans_get_bud( - struct xfs_trans *tp, - struct xfs_bui_log_item *buip) -{ - struct xfs_bud_log_item *budp; - - budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, - &xfs_bud_item_ops); - budp->bud_buip = buip; - budp->bud_format.bud_bui_id = buip->bui_format.bui_id; - - xfs_trans_add_item(tp, &budp->bud_item); - return budp; -} - -/* - * Finish an bmap update and log it to the BUD. Note that the - * transaction is marked dirty regardless of whether the bmap update - * succeeds or fails to support the BUI/BUD lifecycle rules. - */ -static int -xfs_trans_log_finish_bmap_update( - struct xfs_trans *tp, - struct xfs_bud_log_item *budp, - struct xfs_bmap_intent *bi) -{ - int error; - - error = xfs_bmap_finish_one(tp, bi); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the BUI and frees the BUD - * 2.) 
shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); - - return error; -} - /* Sort bmap intents by inode. */ static int xfs_bmap_update_diff_items( @@ -314,9 +269,6 @@ xfs_bmap_update_log_item( uint next_extent; struct xfs_map_extent *map; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from @@ -346,7 +298,6 @@ xfs_bmap_update_create_intent( ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); - xfs_trans_add_item(tp, &buip->bui_item); if (sort) list_sort(mp, items, xfs_bmap_update_diff_items); list_for_each_entry(bi, items, bi_list) @@ -354,14 +305,23 @@ xfs_bmap_update_create_intent( return &buip->bui_item; } -/* Get an BUD so we can process all the deferred rmap updates. */ +/* Get an BUD so we can process all the deferred bmap updates. */ static struct xfs_log_item * xfs_bmap_update_create_done( struct xfs_trans *tp, struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item; + struct xfs_bui_log_item *buip = BUI_ITEM(intent); + struct xfs_bud_log_item *budp; + + budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, + &xfs_bud_item_ops); + budp->bud_buip = buip; + budp->bud_format.bud_bui_id = buip->bui_format.bui_id; + + return &budp->bud_item; } /* Take a passive ref to the AG containing the space we're mapping. */ @@ -392,7 +352,7 @@ xfs_bmap_update_put_group( xfs_perag_intent_put(bi->bi_pag); } -/* Process a deferred rmap update. */ +/* Process a deferred bmap update. */ STATIC int xfs_bmap_update_finish_item( struct xfs_trans *tp, @@ -405,7 +365,7 @@ xfs_bmap_update_finish_item( bi = container_of(item, struct xfs_bmap_intent, bi_list); - error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi); + error = xfs_bmap_finish_one(tp, bi); if (!error && bi->bi_bmap.br_blockcount > 0) { ASSERT(bi->bi_type == XFS_BMAP_UNMAP); return -EAGAIN; @@ -437,15 +397,6 @@ xfs_bmap_update_cancel_item( kmem_cache_free(xfs_bmap_intent_cache, bi); } -const struct xfs_defer_op_type xfs_bmap_update_defer_type = { - .max_items = XFS_BUI_MAX_FAST_EXTENTS, - .create_intent = xfs_bmap_update_create_intent, - .abort_intent = xfs_bmap_update_abort_intent, - .create_done = xfs_bmap_update_create_done, - .finish_item = xfs_bmap_update_finish_item, - .cancel_item = xfs_bmap_update_cancel_item, -}; - /* Is this recovered BUI ok? */ static inline bool xfs_bui_validate( @@ -480,23 +431,53 @@ xfs_bui_validate( return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } +static inline struct xfs_bmap_intent * +xfs_bui_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_inode **ipp, + struct xfs_map_extent *map) +{ + struct xfs_bmap_intent *bi; + int error; + + error = xlog_recover_iget(mp, map->me_owner, ipp); + if (error) + return ERR_PTR(error); + + bi = kmem_cache_zalloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); + bi->bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + bi->bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + bi->bi_bmap.br_startblock = map->me_startblock; + bi->bi_bmap.br_startoff = map->me_startoff; + bi->bi_bmap.br_blockcount = map->me_len; + bi->bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? 
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + bi->bi_owner = *ipp; + xfs_bmap_update_get_group(mp, bi); + + xfs_defer_add_item(dfp, &bi->bi_list); + return bi; +} + /* * Process a bmap update intent item that was recovered from the log. * We need to update some inode's bmbt. */ STATIC int -xfs_bui_item_recover( - struct xfs_log_item *lip, +xfs_bmap_recover_work( + struct xfs_defer_pending *dfp, struct list_head *capture_list) { - struct xfs_bmap_intent fake = { }; struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_map_extent *map; - struct xfs_bud_log_item *budp; + struct xfs_bmap_intent *work; int iext_delta; int error = 0; @@ -507,13 +488,9 @@ xfs_bui_item_recover( } map = &buip->bui_format.bui_extents[0]; - fake.bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - fake.bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; - - error = xlog_recover_iget(mp, map->me_owner, &ip); - if (error) - return error; + work = xfs_bui_recover_work(mp, dfp, &ip, map); + if (IS_ERR(work)) + return PTR_ERR(work); /* Allocate transaction and do the work. */ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); @@ -522,42 +499,27 @@ xfs_bui_item_recover( if (error) goto err_rele; - budp = xfs_trans_get_bud(tp, buip); xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - if (fake.bi_type == XFS_BMAP_MAP) + if (work->bi_type == XFS_BMAP_MAP) iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT; else iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; - error = xfs_iext_count_may_overflow(ip, fake.bi_whichfork, iext_delta); + error = xfs_iext_count_may_overflow(ip, work->bi_whichfork, iext_delta); if (error == -EFBIG) error = xfs_iext_count_upgrade(tp, ip, iext_delta); if (error) goto err_cancel; - fake.bi_owner = ip; - fake.bi_bmap.br_startblock = map->me_startblock; - fake.bi_bmap.br_startoff = map->me_startoff; - fake.bi_bmap.br_blockcount = map->me_len; - fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - - xfs_bmap_update_get_group(mp, &fake); - error = xfs_trans_log_finish_bmap_update(tp, budp, &fake); + error = xlog_recover_finish_intent(tp, dfp); if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map, - sizeof(*map)); - xfs_bmap_update_put_group(&fake); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &buip->bui_format, sizeof(buip->bui_format)); if (error) goto err_cancel; - if (fake.bi_bmap.br_blockcount > 0) { - ASSERT(fake.bi_type == XFS_BMAP_UNMAP); - xfs_bmap_unmap_extent(tp, ip, &fake.bi_bmap); - } - /* * Commit transaction, which frees the transaction and saves the inode * for later replay activities. @@ -579,21 +541,13 @@ err_rele: return error; } -STATIC bool -xfs_bui_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return BUI_ITEM(lip)->bui_format.bui_id == intent_id; -} - /* Relog an intent item to push the log tail forward. 
*/ static struct xfs_log_item * -xfs_bui_item_relog( +xfs_bmap_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_bud_log_item *budp; struct xfs_bui_log_item *buip; struct xfs_map_extent *map; unsigned int count; @@ -601,27 +555,40 @@ xfs_bui_item_relog( count = BUI_ITEM(intent)->bui_format.bui_nextents; map = BUI_ITEM(intent)->bui_format.bui_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - budp = xfs_trans_get_bud(tp, BUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); - buip = xfs_bui_init(tp->t_mountp); memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map)); atomic_set(&buip->bui_next_extent, count); - xfs_trans_add_item(tp, &buip->bui_item); - set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); + return &buip->bui_item; } +const struct xfs_defer_op_type xfs_bmap_update_defer_type = { + .name = "bmap", + .max_items = XFS_BUI_MAX_FAST_EXTENTS, + .create_intent = xfs_bmap_update_create_intent, + .abort_intent = xfs_bmap_update_abort_intent, + .create_done = xfs_bmap_update_create_done, + .finish_item = xfs_bmap_update_finish_item, + .cancel_item = xfs_bmap_update_cancel_item, + .recover_work = xfs_bmap_recover_work, + .relog_intent = xfs_bmap_relog_intent, +}; + +STATIC bool +xfs_bui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return BUI_ITEM(lip)->bui_format.bui_id == intent_id; +} + static const struct xfs_item_ops xfs_bui_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_bui_item_size, .iop_format = xfs_bui_item_format, .iop_unpin = xfs_bui_item_unpin, .iop_release = xfs_bui_item_release, - .iop_recover = xfs_bui_item_recover, .iop_match = xfs_bui_item_match, - .iop_relog = xfs_bui_item_relog, }; static inline void @@ -681,12 +648,9 @@ xlog_recover_bui_commit_pass2( buip = xfs_bui_init(mp); xfs_bui_copy_format(&buip->bui_format, bui_formatp); atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. 
- */ - xfs_trans_ail_insert(log->l_ailp, &buip->bui_item, lsn); - xfs_bui_release(buip); + + xlog_recover_intent_item(log, &buip->bui_item, lsn, + &xfs_bmap_update_defer_type); return 0; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 731260a5af6d..c2531c28905c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -69,147 +69,6 @@ xfs_zero_extent( GFP_NOFS, 0); } -#ifdef CONFIG_XFS_RT -int -xfs_bmap_rtalloc( - struct xfs_bmalloca *ap) -{ - struct xfs_mount *mp = ap->ip->i_mount; - xfs_fileoff_t orig_offset = ap->offset; - xfs_rtxnum_t rtx; - xfs_rtxlen_t prod = 0; /* product factor for allocators */ - xfs_extlen_t mod = 0; /* product factor for allocators */ - xfs_rtxlen_t ralen = 0; /* realtime allocation length */ - xfs_extlen_t align; /* minimum allocation alignment */ - xfs_extlen_t orig_length = ap->length; - xfs_extlen_t minlen = mp->m_sb.sb_rextsize; - xfs_rtxlen_t raminlen; - bool rtlocked = false; - bool ignore_locality = false; - int error; - - align = xfs_get_extsz_hint(ap->ip); -retry: - prod = xfs_extlen_to_rtxlen(mp, align); - error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, - align, 1, ap->eof, 0, - ap->conv, &ap->offset, &ap->length); - if (error) - return error; - ASSERT(ap->length); - ASSERT(xfs_extlen_to_rtxmod(mp, ap->length) == 0); - - /* - * If we shifted the file offset downward to satisfy an extent size - * hint, increase minlen by that amount so that the allocator won't - * give us an allocation that's too short to cover at least one of the - * blocks that the caller asked for. - */ - if (ap->offset != orig_offset) - minlen += orig_offset - ap->offset; - - /* - * If the offset & length are not perfectly aligned - * then kill prod, it will just get us in trouble. - */ - div_u64_rem(ap->offset, align, &mod); - if (mod || ap->length % align) - prod = 1; - /* - * Set ralen to be the actual requested length in rtextents. - * - * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that - * we rounded up to it, cut it back so it's valid again. - * Note that if it's a really large request (bigger than - * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't - * adjust the starting point to match it. - */ - ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN)); - - /* - * Lock out modifications to both the RT bitmap and summary inodes - */ - if (!rtlocked) { - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); - xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); - xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); - rtlocked = true; - } - - /* - * If it's an allocation to an empty file at offset 0, - * pick an extent that will space things out in the rt area. - */ - if (ap->eof && ap->offset == 0) { - error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); - if (error) - return error; - ap->blkno = xfs_rtx_to_rtb(mp, rtx); - } else { - ap->blkno = 0; - } - - xfs_bmap_adjacent(ap); - - /* - * Realtime allocation, done through xfs_rtallocate_extent. 
- */ - if (ignore_locality) - rtx = 0; - else - rtx = xfs_rtb_to_rtx(mp, ap->blkno); - raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen)); - error = xfs_rtallocate_extent(ap->tp, rtx, raminlen, ralen, &ralen, - ap->wasdel, prod, &rtx); - if (error) - return error; - - if (rtx != NULLRTEXTNO) { - ap->blkno = xfs_rtx_to_rtb(mp, rtx); - ap->length = xfs_rtxlen_to_extlen(mp, ralen); - ap->ip->i_nblocks += ap->length; - xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); - if (ap->wasdel) - ap->ip->i_delayed_blks -= ap->length; - /* - * Adjust the disk quota also. This was reserved - * earlier. - */ - xfs_trans_mod_dquot_byino(ap->tp, ap->ip, - ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : - XFS_TRANS_DQ_RTBCOUNT, ap->length); - return 0; - } - - if (align > mp->m_sb.sb_rextsize) { - /* - * We previously enlarged the request length to try to satisfy - * an extent size hint. The allocator didn't return anything, - * so reset the parameters to the original values and try again - * without alignment criteria. - */ - ap->offset = orig_offset; - ap->length = orig_length; - minlen = align = mp->m_sb.sb_rextsize; - goto retry; - } - - if (!ignore_locality && ap->blkno != 0) { - /* - * If we can't allocate near a specific rt extent, try again - * without locality criteria. - */ - ignore_locality = true; - goto retry; - } - - ap->blkno = NULLFSBLOCK; - ap->length = 0; - return 0; -} -#endif /* CONFIG_XFS_RT */ - /* * Extent tree block counting routines. */ diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 6888078f5c31..77ecbb753ef2 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -47,7 +47,7 @@ int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz, int rt, int eof, int delay, int convert, xfs_fileoff_t *offp, xfs_extlen_t *lenp); -void xfs_bmap_adjacent(struct xfs_bmalloca *ap); +bool xfs_bmap_adjacent(struct xfs_bmalloca *ap); int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *rec, int *is_empty); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 669332849680..8e5bd50d29fe 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2049,6 +2049,14 @@ error_free: return NULL; } +static inline void +xfs_buf_list_del( + struct xfs_buf *bp) +{ + list_del_init(&bp->b_list); + wake_up_var(&bp->b_list); +} + /* * Cancel a delayed write list. * @@ -2066,7 +2074,7 @@ xfs_buf_delwri_cancel( xfs_buf_lock(bp); bp->b_flags &= ~_XBF_DELWRI_Q; - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); xfs_buf_relse(bp); } } @@ -2120,6 +2128,34 @@ xfs_buf_delwri_queue( } /* + * Queue a buffer to this delwri list as part of a data integrity operation. + * If the buffer is on any other delwri list, we'll wait for that to clear + * so that the caller can submit the buffer for IO and wait for the result. + * Callers must ensure the buffer is not already on the list. + */ +void +xfs_buf_delwri_queue_here( + struct xfs_buf *bp, + struct list_head *buffer_list) +{ + /* + * We need this buffer to end up on the /caller's/ delwri list, not any + * old list. This can happen if the buffer is marked stale (which + * clears DELWRI_Q) after the AIL queues the buffer to its list but + * before the AIL has a chance to submit the list. 
+ */ + while (!list_empty(&bp->b_list)) { + xfs_buf_unlock(bp); + wait_var_event(&bp->b_list, list_empty(&bp->b_list)); + xfs_buf_lock(bp); + } + + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + + xfs_buf_delwri_queue(bp, buffer_list); +} + +/* * Compare function is more complex than it needs to be because * the return value is only 32 bits and we are doing comparisons * on 64 bit values @@ -2181,7 +2217,7 @@ xfs_buf_delwri_submit_buffers( * reference and remove it from the list here. */ if (!(bp->b_flags & _XBF_DELWRI_Q)) { - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); xfs_buf_relse(bp); continue; } @@ -2201,7 +2237,7 @@ xfs_buf_delwri_submit_buffers( list_move_tail(&bp->b_list, wait_list); } else { bp->b_flags |= XBF_ASYNC; - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); } __xfs_buf_submit(bp, false); } @@ -2255,7 +2291,7 @@ xfs_buf_delwri_submit( while (!list_empty(&wait_list)) { bp = list_first_entry(&wait_list, struct xfs_buf, b_list); - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); /* * Wait on the locked buffer, check for errors and unlock and diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c86e16419656..b470de08a46c 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -319,6 +319,7 @@ extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ extern void xfs_buf_delwri_cancel(struct list_head *); extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); +void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 9f3ceb461515..cc6dc56f455d 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -18,6 +18,7 @@ #include "xfs_bmap.h" #include "xfs_trans.h" #include "xfs_error.h" +#include "xfs_health.h" /* * Directory file type support functions @@ -51,7 +52,7 @@ xfs_dir2_sf_getdents( struct xfs_mount *mp = dp->i_mount; xfs_dir2_dataptr_t off; /* current entry's offset */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; xfs_dir2_dataptr_t dot_offset; xfs_dir2_dataptr_t dotdot_offset; xfs_ino_t ino; @@ -59,9 +60,7 @@ xfs_dir2_sf_getdents( ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); /* * If the block number in the offset is out of range, we're done. 
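/*
 * Illustrative sketch, not part of the patch: how a repair caller might use
 * xfs_buf_delwri_queue_here() (added above) to force a rebuilt buffer onto
 * its own delwri list and then write it synchronously. The caller is
 * hypothetical; xfs_buf_delwri_submit() is the existing submission helper.
 */
STATIC int
xrep_example_write_buf(
	struct xfs_buf		*bp)
{
	LIST_HEAD(buffer_list);

	xfs_buf_lock(bp);
	xfs_buf_delwri_queue_here(bp, &buffer_list);
	xfs_buf_unlock(bp);

	return xfs_buf_delwri_submit(&buffer_list);
}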
@@ -519,6 +518,8 @@ xfs_readdir( if (xfs_is_shutdown(dp->i_mount)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 61a45a86ffe8..b4f20d9c8f98 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -172,14 +172,14 @@ xfs_qm_adjust_dqtimers( /* * initialize a buffer full of dquots and log the whole thing */ -STATIC void +void xfs_qm_init_dquot_blk( struct xfs_trans *tp, - struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, struct xfs_buf *bp) { + struct xfs_mount *mp = tp->t_mountp; struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_dqblk *d; xfs_dqid_t curid; @@ -353,7 +353,7 @@ xfs_dquot_disk_alloc( * Make a chunk of dquots out of this buffer and log * the entire thing. */ - xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, qtype, bp); + xfs_qm_init_dquot_blk(tp, dqp->q_id, qtype, bp); xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* @@ -1362,34 +1362,3 @@ xfs_qm_exit(void) kmem_cache_destroy(xfs_dqtrx_cache); kmem_cache_destroy(xfs_dquot_cache); } - -/* - * Iterate every dquot of a particular type. The caller must ensure that the - * particular quota type is active. iter_fn can return negative error codes, - * or -ECANCELED to indicate that it wants to stop iterating. - */ -int -xfs_qm_dqiterate( - struct xfs_mount *mp, - xfs_dqtype_t type, - xfs_qm_dqiterate_fn iter_fn, - void *priv) -{ - struct xfs_dquot *dq; - xfs_dqid_t id = 0; - int error; - - do { - error = xfs_qm_dqget_next(mp, id, type, &dq); - if (error == -ENOENT) - return 0; - if (error) - return error; - - error = iter_fn(dq, type, priv); - id = dq->q_id + 1; - xfs_qm_dqput(dq); - } while (error == 0 && id != 0); - - return error; -} diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 80c8f851a2f3..956272d9b302 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -234,12 +234,10 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) return dqp; } -typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, - xfs_dqtype_t type, void *priv); -int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type, - xfs_qm_dqiterate_fn iter_fn, void *priv); - time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout); time64_t xfs_dquot_set_grace_period(time64_t grace); +void xfs_qm_init_dquot_blk(struct xfs_trans *tp, xfs_dqid_t id, xfs_dqtype_t + type, struct xfs_buf *bp); + #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 9ecfdcdc752f..2ccde32c9a9e 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -678,3 +678,16 @@ xfs_extent_busy_ag_cmp( diff = b1->bno - b2->bno; return diff; } + +/* Are there any busy extents in this AG? 
*/ +bool +xfs_extent_busy_list_empty( + struct xfs_perag *pag) +{ + bool res; + + spin_lock(&pag->pagb_lock); + res = RB_EMPTY_ROOT(&pag->pagb_tree); + spin_unlock(&pag->pagb_lock); + return res; +} diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index 0639aab336f3..470032de3139 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -85,4 +85,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) list_sort(NULL, list, xfs_extent_busy_ag_cmp); } +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); + #endif /* __XFS_EXTENT_BUSY_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 3fa8789820ad..1d1185fca6a5 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -304,39 +304,6 @@ static const struct xfs_item_ops xfs_efd_item_ops = { }; /* - * Allocate an "extent free done" log item that will hold nextents worth of - * extents. The caller must use all nextents extents, because we are not - * flexible about this at all. - */ -static struct xfs_efd_log_item * -xfs_trans_get_efd( - struct xfs_trans *tp, - struct xfs_efi_log_item *efip, - unsigned int nextents) -{ - struct xfs_efd_log_item *efdp; - - ASSERT(nextents > 0); - - if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { - efdp = kzalloc(xfs_efd_log_item_sizeof(nextents), - GFP_KERNEL | __GFP_NOFAIL); - } else { - efdp = kmem_cache_zalloc(xfs_efd_cache, - GFP_KERNEL | __GFP_NOFAIL); - } - - xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, - &xfs_efd_item_ops); - efdp->efd_efip = efip; - efdp->efd_format.efd_nextents = nextents; - efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; - - xfs_trans_add_item(tp, &efdp->efd_item); - return efdp; -} - -/* * Fill the EFD with all extents from the EFI when we need to roll the * transaction and continue with a new EFI. * @@ -364,69 +331,6 @@ xfs_efd_from_efi( efdp->efd_next_extent = efip->efi_format.efi_nextents; } -/* - * Free an extent and log it to the EFD. Note that the transaction is marked - * dirty regardless of whether the extent free succeeds or fails to support the - * EFI/EFD lifecycle rules. - */ -static int -xfs_trans_free_extent( - struct xfs_trans *tp, - struct xfs_efd_log_item *efdp, - struct xfs_extent_free_item *xefi) -{ - struct xfs_owner_info oinfo = { }; - struct xfs_mount *mp = tp->t_mountp; - struct xfs_extent *extp; - uint next_extent; - xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, - xefi->xefi_startblock); - int error; - - oinfo.oi_owner = xefi->xefi_owner; - if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) - oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; - if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) - oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; - - trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0, - agbno, xefi->xefi_blockcount); - - error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, - xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, - xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the EFI and frees the EFD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); - - /* - * If we need a new transaction to make progress, the caller will log a - * new EFI with the current contents. It will also log an EFD to cancel - * the existing EFI, and so we need to copy all the unprocessed extents - * in this EFI to the EFD so this works correctly. 
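/*
 * Illustrative sketch, not part of the patch: a scrub/repair-style check that
 * declines to proceed while freed extents in this AG are still busy. Only
 * xfs_extent_busy_list_empty() comes from the change above; the caller and
 * the -EDEADLOCK policy are hypothetical.
 */
STATIC int
xrep_example_require_no_busy_extents(
	struct xfs_perag	*pag)
{
	if (!xfs_extent_busy_list_empty(pag))
		return -EDEADLOCK;
	return 0;
}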
- */ - if (error == -EAGAIN) { - xfs_efd_from_efi(efdp); - return error; - } - - next_extent = efdp->efd_next_extent; - ASSERT(next_extent < efdp->efd_format.efd_nextents); - extp = &(efdp->efd_format.efd_extents[next_extent]); - extp->ext_start = xefi->xefi_startblock; - extp->ext_len = xefi->xefi_blockcount; - efdp->efd_next_extent++; - - return error; -} - /* Sort bmap items by AG. */ static int xfs_extent_free_diff_items( @@ -453,9 +357,6 @@ xfs_extent_free_log_item( uint next_extent; struct xfs_extent *extp; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from @@ -481,7 +382,6 @@ xfs_extent_free_create_intent( ASSERT(count > 0); - xfs_trans_add_item(tp, &efip->efi_item); if (sort) list_sort(mp, items, xfs_extent_free_diff_items); list_for_each_entry(xefi, items, xefi_list) @@ -496,7 +396,26 @@ xfs_extent_free_create_done( struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item; + struct xfs_efi_log_item *efip = EFI_ITEM(intent); + struct xfs_efd_log_item *efdp; + + ASSERT(count > 0); + + if (count > XFS_EFD_MAX_FAST_EXTENTS) { + efdp = kzalloc(xfs_efd_log_item_sizeof(count), + GFP_KERNEL | __GFP_NOFAIL); + } else { + efdp = kmem_cache_zalloc(xfs_efd_cache, + GFP_KERNEL | __GFP_NOFAIL); + } + + xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, + &xfs_efd_item_ops); + efdp->efd_efip = efip; + efdp->efd_format.efd_nextents = count; + efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; + + return &efdp->efd_item; } /* Take a passive ref to the AG containing the space we're freeing. */ @@ -527,19 +446,49 @@ xfs_extent_free_finish_item( struct list_head *item, struct xfs_btree_cur **state) { + struct xfs_owner_info oinfo = { }; struct xfs_extent_free_item *xefi; - int error; + struct xfs_efd_log_item *efdp = EFD_ITEM(done); + struct xfs_mount *mp = tp->t_mountp; + struct xfs_extent *extp; + uint next_extent; + xfs_agblock_t agbno; + int error = 0; xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); + + oinfo.oi_owner = xefi->xefi_owner; + if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) + oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; + if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) + oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; - error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); + trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0, + agbno, xefi->xefi_blockcount); /* - * Don't free the XEFI if we need a new transaction to complete - * processing of it. + * If we need a new transaction to make progress, the caller will log a + * new EFI with the current contents. It will also log an EFD to cancel + * the existing EFI, and so we need to copy all the unprocessed extents + * in this EFI to the EFD so this works correctly. 
*/ - if (error == -EAGAIN) + if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) + error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, + xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, + xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); + if (error == -EAGAIN) { + xfs_efd_from_efi(efdp); return error; + } + + /* Add the work we finished to the EFD, even though nobody uses that */ + next_extent = efdp->efd_next_extent; + ASSERT(next_extent < efdp->efd_format.efd_nextents); + extp = &(efdp->efd_format.efd_extents[next_extent]); + extp->ext_start = xefi->xefi_startblock; + extp->ext_len = xefi->xefi_blockcount; + efdp->efd_next_extent++; xfs_extent_free_put_group(xefi); kmem_cache_free(xfs_extfree_item_cache, xefi); @@ -567,15 +516,6 @@ xfs_extent_free_cancel_item( kmem_cache_free(xfs_extfree_item_cache, xefi); } -const struct xfs_defer_op_type xfs_extent_free_defer_type = { - .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .create_intent = xfs_extent_free_create_intent, - .abort_intent = xfs_extent_free_abort_intent, - .create_done = xfs_extent_free_create_done, - .finish_item = xfs_extent_free_finish_item, - .cancel_item = xfs_extent_free_cancel_item, -}; - /* * AGFL blocks are accounted differently in the reserve pools and are not * inserted into the busy extent list. @@ -610,16 +550,6 @@ xfs_agfl_free_finish_item( error = xfs_free_agfl_block(tp, xefi->xefi_pag->pag_agno, agbno, agbp, &oinfo); - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the EFI and frees the EFD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); - next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); extp = &(efdp->efd_format.efd_extents[next_extent]); @@ -632,16 +562,6 @@ xfs_agfl_free_finish_item( return error; } -/* sub-type with special handling for AGFL deferred frees */ -const struct xfs_defer_op_type xfs_agfl_free_defer_type = { - .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .create_intent = xfs_extent_free_create_intent, - .abort_intent = xfs_extent_free_abort_intent, - .create_done = xfs_extent_free_create_done, - .finish_item = xfs_agfl_free_finish_item, - .cancel_item = xfs_extent_free_cancel_item, -}; - /* Is this recovered EFI ok? */ static inline bool xfs_efi_validate_ext( @@ -651,23 +571,41 @@ xfs_efi_validate_ext( return xfs_verify_fsbext(mp, extp->ext_start, extp->ext_len); } +static inline void +xfs_efi_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_extent *extp) +{ + struct xfs_extent_free_item *xefi; + + xefi = kmem_cache_zalloc(xfs_extfree_item_cache, + GFP_KERNEL | __GFP_NOFAIL); + xefi->xefi_startblock = extp->ext_start; + xefi->xefi_blockcount = extp->ext_len; + xefi->xefi_agresv = XFS_AG_RESV_NONE; + xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN; + xfs_extent_free_get_group(mp, xefi); + + xfs_defer_add_item(dfp, &xefi->xefi_list); +} + /* * Process an extent free intent item that was recovered from * the log. We need to free the extents that it describes. 
*/ STATIC int -xfs_efi_item_recover( - struct xfs_log_item *lip, +xfs_extent_free_recover_work( + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_efi_log_item *efip = EFI_ITEM(lip); struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_efd_log_item *efdp; struct xfs_trans *tp; int i; int error = 0; - bool requeue_only = false; /* * First check the validity of the extents described by the @@ -682,55 +620,22 @@ xfs_efi_item_recover( sizeof(efip->efi_format)); return -EFSCORRUPTED; } + + xfs_efi_recover_work(mp, dfp, &efip->efi_format.efi_extents[i]); } resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); error = xfs_trans_alloc(mp, &resv, 0, 0, 0, &tp); if (error) return error; - efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); - - for (i = 0; i < efip->efi_format.efi_nextents; i++) { - struct xfs_extent_free_item fake = { - .xefi_owner = XFS_RMAP_OWN_UNKNOWN, - .xefi_agresv = XFS_AG_RESV_NONE, - }; - struct xfs_extent *extp; - extp = &efip->efi_format.efi_extents[i]; - - fake.xefi_startblock = extp->ext_start; - fake.xefi_blockcount = extp->ext_len; - - if (!requeue_only) { - xfs_extent_free_get_group(mp, &fake); - error = xfs_trans_free_extent(tp, efdp, &fake); - xfs_extent_free_put_group(&fake); - } - - /* - * If we can't free the extent without potentially deadlocking, - * requeue the rest of the extents to a new so that they get - * run again later with a new transaction context. - */ - if (error == -EAGAIN || requeue_only) { - error = xfs_free_extent_later(tp, fake.xefi_startblock, - fake.xefi_blockcount, - &XFS_RMAP_OINFO_ANY_OWNER, - fake.xefi_agresv); - if (!error) { - requeue_only = true; - continue; - } - } - - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - extp, sizeof(*extp)); - if (error) - goto abort_error; - - } + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &efip->efi_format, + sizeof(efip->efi_format)); + if (error) + goto abort_error; return xfs_defer_ops_capture_and_commit(tp, capture_list); @@ -739,21 +644,14 @@ abort_error: return error; } -STATIC bool -xfs_efi_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return EFI_ITEM(lip)->efi_format.efi_id == intent_id; -} - /* Relog an intent item to push the log tail forward. 
*/ static struct xfs_log_item * -xfs_efi_item_relog( +xfs_extent_free_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_efd_log_item *efdp; + struct xfs_efd_log_item *efdp = EFD_ITEM(done_item); struct xfs_efi_log_item *efip; struct xfs_extent *extp; unsigned int count; @@ -761,29 +659,56 @@ xfs_efi_item_relog( count = EFI_ITEM(intent)->efi_format.efi_nextents; extp = EFI_ITEM(intent)->efi_format.efi_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count); efdp->efd_next_extent = count; memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp)); - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); efip = xfs_efi_init(tp->t_mountp, count); memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp)); atomic_set(&efip->efi_next_extent, count); - xfs_trans_add_item(tp, &efip->efi_item); - set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); + return &efip->efi_item; } +const struct xfs_defer_op_type xfs_extent_free_defer_type = { + .name = "extent_free", + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_extent_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, + .recover_work = xfs_extent_free_recover_work, + .relog_intent = xfs_extent_free_relog_intent, +}; + +/* sub-type with special handling for AGFL deferred frees */ +const struct xfs_defer_op_type xfs_agfl_free_defer_type = { + .name = "agfl_free", + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_agfl_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, + .recover_work = xfs_extent_free_recover_work, + .relog_intent = xfs_extent_free_relog_intent, +}; + +STATIC bool +xfs_efi_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return EFI_ITEM(lip)->efi_format.efi_id == intent_id; +} + static const struct xfs_item_ops xfs_efi_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_efi_item_size, .iop_format = xfs_efi_item_format, .iop_unpin = xfs_efi_item_unpin, .iop_release = xfs_efi_item_release, - .iop_recover = xfs_efi_item_recover, .iop_match = xfs_efi_item_match, - .iop_relog = xfs_efi_item_relog, }; /* @@ -820,12 +745,9 @@ xlog_recover_efi_commit_pass2( return error; } atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. 
- */ - xfs_trans_ail_insert(log->l_ailp, &efip->efi_item, lsn); - xfs_efi_release(efip); + + xlog_recover_intent_item(log, &efip->efi_item, lsn, + &xfs_extent_free_defer_type); return 0; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 57076a25f17d..83f708f62ed9 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -134,6 +134,10 @@ xfs_growfs_data_private( if (delta < 0 && nagcount < 2) return -EINVAL; + /* No work to do */ + if (delta == 0) + return 0; + oagcount = mp->m_sb.sb_agcount; /* allocate the new per-ag structures */ if (nagcount > oagcount) { @@ -153,7 +157,7 @@ xfs_growfs_data_private( error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, -delta, 0, 0, &tp); if (error) - return error; + goto out_free_unused_perag; last_pag = xfs_perag_get(mp, oagcount - 1); if (delta > 0) { @@ -227,6 +231,9 @@ xfs_growfs_data_private( out_trans_cancel: xfs_trans_cancel(tp); +out_free_unused_perag: + if (nagcount > oagcount) + xfs_free_unused_perag_range(mp, oagcount, nagcount); return error; } @@ -344,59 +351,20 @@ xfs_growfs_log( } /* - * exported through ioctl XFS_IOC_FSCOUNTS - */ - -void -xfs_fs_counts( - xfs_mount_t *mp, - xfs_fsop_counts_t *cnt) -{ - cnt->allocino = percpu_counter_read_positive(&mp->m_icount); - cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); - cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - xfs_fdblocks_unavailable(mp); - cnt->freertx = percpu_counter_read_positive(&mp->m_frextents); -} - -/* - * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS - * - * xfs_reserve_blocks is called to set m_resblks - * in the in-core mount table. The number of unused reserved blocks - * is kept in m_resblks_avail. - * * Reserve the requested number of blocks if available. Otherwise return * as many as possible to satisfy the request. The actual number - * reserved are returned in outval - * - * A null inval pointer indicates that only the current reserved blocks - * available should be returned no settings are changed. + * reserved are returned in outval. */ - int xfs_reserve_blocks( - xfs_mount_t *mp, - uint64_t *inval, - xfs_fsop_resblks_t *outval) + struct xfs_mount *mp, + uint64_t request) { int64_t lcounter, delta; int64_t fdblks_delta = 0; - uint64_t request; int64_t free; int error = 0; - /* If inval is null, report current values and return */ - if (inval == (uint64_t *)NULL) { - if (!outval) - return -EINVAL; - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; - return 0; - } - - request = *inval; - /* * With per-cpu counters, this becomes an interesting problem. 
we need * to work out if we are freeing or allocation blocks first, then we can @@ -466,11 +434,6 @@ xfs_reserve_blocks( spin_lock(&mp->m_sb_lock); } out: - if (outval) { - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; - } - spin_unlock(&mp->m_sb_lock); return error; } diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 2cffe51a31e8..44457b0a0593 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -6,14 +6,12 @@ #ifndef __XFS_FSOPS_H__ #define __XFS_FSOPS_H__ -extern int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); -extern int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); -extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); -extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval, - xfs_fsop_resblks_t *outval); -extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags); +int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); +int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); +int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request); +int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags); -extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); -extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); +int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); +int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 9edc1f2bc939..f18fec0adf66 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -44,4 +44,16 @@ struct xfs_globals xfs_globals = { .pwork_threads = -1, /* automatic thread detection */ .larp = false, /* log attribute replay */ #endif + + /* + * Leave this many record slots empty when bulk loading btrees. By + * default we load new btree leaf blocks 75% full. + */ + .bload_leaf_slack = -1, + + /* + * Leave this many key/ptr slots empty when bulk loading btrees. By + * default we load new btree node blocks 75% full. + */ + .bload_node_slack = -1, }; diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 72a075bb2c10..9a57afee9338 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -222,7 +222,7 @@ xfs_inode_mark_sick( struct xfs_inode *ip, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY)); + ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED))); trace_xfs_inode_mark_sick(ip, mask); spin_lock(&ip->i_flags_lock); @@ -246,7 +246,7 @@ xfs_inode_mark_healthy( struct xfs_inode *ip, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY)); + ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED))); trace_xfs_inode_mark_healthy(ip, mask); spin_lock(&ip->i_flags_lock); @@ -369,6 +369,10 @@ static const struct ioctl_sick_map ino_map[] = { { XFS_SICK_INO_XATTR, XFS_BS_SICK_XATTR }, { XFS_SICK_INO_SYMLINK, XFS_BS_SICK_SYMLINK }, { XFS_SICK_INO_PARENT, XFS_BS_SICK_PARENT }, + { XFS_SICK_INO_BMBTD_ZAPPED, XFS_BS_SICK_BMBTD }, + { XFS_SICK_INO_BMBTA_ZAPPED, XFS_BS_SICK_BMBTA }, + { XFS_SICK_INO_DIR_ZAPPED, XFS_BS_SICK_DIR }, + { XFS_SICK_INO_SYMLINK_ZAPPED, XFS_BS_SICK_SYMLINK }, { 0, 0 }, }; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c0f1c89786c2..1fd94958aa97 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -37,15 +37,10 @@ #include "xfs_reflink.h" #include "xfs_ag.h" #include "xfs_log_priv.h" +#include "xfs_health.h" struct kmem_cache *xfs_inode_cache; -/* - * Used in xfs_itruncate_extents(). 
This is the maximum number of extents - * freed from a file in a single transaction. - */ -#define XFS_ITRUNC_MAX_EXTENTS 2 - STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_inode *); @@ -661,6 +656,8 @@ xfs_lookup( if (xfs_is_shutdown(dp->i_mount)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); if (error) @@ -875,7 +872,7 @@ xfs_init_new_inode( case S_IFLNK: ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; ip->i_df.if_bytes = 0; - ip->i_df.if_u1.if_root = NULL; + ip->i_df.if_data = NULL; break; default: ASSERT(0); @@ -978,6 +975,8 @@ xfs_create( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; prid = xfs_get_initial_prid(dp); @@ -1217,6 +1216,8 @@ xfs_link( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(tdp, XFS_DATA_FORK)) + return -EIO; error = xfs_qm_dqattach(sip); if (error) @@ -1339,7 +1340,6 @@ xfs_itruncate_extents_flags( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; xfs_fileoff_t first_unmap_block; - xfs_filblks_t unmap_len; int error = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -1371,19 +1371,10 @@ xfs_itruncate_extents_flags( return 0; } - unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1; - while (unmap_len > 0) { - ASSERT(tp->t_highest_agno == NULLAGNUMBER); - error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, - flags, XFS_ITRUNC_MAX_EXTENTS); - if (error) - goto out; - - /* free the just unmapped extents */ - error = xfs_defer_finish(&tp); - if (error) - goto out; - } + error = xfs_bunmapi_range(&tp, ip, flags, first_unmap_block, + XFS_MAX_FILEOFF); + if (error) + goto out; if (whichfork == XFS_DATA_FORK) { /* Remove all pending CoW reservations. */ @@ -2387,8 +2378,8 @@ xfs_ifree( * already been freed by xfs_attr_inactive. */ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - kmem_free(ip->i_df.if_u1.if_data); - ip->i_df.if_u1.if_data = NULL; + kmem_free(ip->i_df.if_data); + ip->i_df.if_data = NULL; ip->i_df.if_bytes = 0; } @@ -2506,6 +2497,8 @@ xfs_remove( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; error = xfs_qm_dqattach(dp); if (error) @@ -3758,3 +3751,29 @@ xfs_inode_reload_unlinked( return error; } + +/* Has this inode fork been zapped by repair? 
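/*
 * Illustrative sketch, not part of the patch: the directory entry points
 * above bail out with -EIO when the data fork has been zapped by repair; a
 * hypothetical attr-side caller would apply the same guard to the attr fork.
 */
STATIC int
xfs_example_attr_entry_guard(
	struct xfs_inode	*ip)
{
	if (xfs_ifork_zapped(ip, XFS_ATTR_FORK))
		return -EIO;
	return 0;
}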
*/ +bool +xfs_ifork_zapped( + const struct xfs_inode *ip, + int whichfork) +{ + unsigned int datamask = 0; + + switch (whichfork) { + case XFS_DATA_FORK: + switch (ip->i_vnode.i_mode & S_IFMT) { + case S_IFDIR: + datamask = XFS_SICK_INO_DIR_ZAPPED; + break; + case S_IFLNK: + datamask = XFS_SICK_INO_SYMLINK_ZAPPED; + break; + } + return ip->i_sick & (XFS_SICK_INO_BMBTD_ZAPPED | datamask); + case XFS_ATTR_FORK: + return ip->i_sick & XFS_SICK_INO_BMBTA_ZAPPED; + default: + return false; + } +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 3beb470f1892..97f63bacd4c2 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -622,4 +622,6 @@ xfs_inode_unlinked_incomplete( int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip); int xfs_inode_reload_unlinked(struct xfs_inode *ip); +bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index cd7803fda8b1..0aee97ba0be8 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -352,11 +352,10 @@ xfs_inode_item_format_data_fork( ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV); if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { - ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_df.if_data != NULL); ASSERT(ip->i_disk_size > 0); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, - ip->i_df.if_u1.if_data, - ip->i_df.if_bytes); + ip->i_df.if_data, ip->i_df.if_bytes); ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes; ilf->ilf_size++; } else { @@ -431,10 +430,9 @@ xfs_inode_item_format_attr_fork( if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_af.if_bytes > 0) { - ASSERT(ip->i_af.if_u1.if_data != NULL); + ASSERT(ip->i_af.if_data != NULL); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, - ip->i_af.if_u1.if_data, - ip->i_af.if_bytes); + ip->i_af.if_data, ip->i_af.if_bytes); ilf->ilf_asize = (unsigned)ip->i_af.if_bytes; ilf->ilf_size++; } else { @@ -557,6 +555,9 @@ xfs_inode_to_log_dinode( memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_v3_pad = 0; + + /* dummy value for initialisation */ + to->di_crc = 0; } else { to->di_version = 2; to->di_flushiter = ip->i_flushiter; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 6c3919687ea6..f02b6e558af5 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1872,6 +1872,63 @@ xfs_fs_eofblocks_from_user( return 0; } +static int +xfs_ioctl_getset_resblocks( + struct file *filp, + unsigned int cmd, + void __user *arg) +{ + struct xfs_mount *mp = XFS_I(file_inode(filp))->i_mount; + struct xfs_fsop_resblks fsop = { }; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (cmd == XFS_IOC_SET_RESBLKS) { + if (xfs_is_readonly(mp)) + return -EROFS; + + if (copy_from_user(&fsop, arg, sizeof(fsop))) + return -EFAULT; + + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_reserve_blocks(mp, fsop.resblks); + mnt_drop_write_file(filp); + if (error) + return error; + } + + spin_lock(&mp->m_sb_lock); + fsop.resblks = mp->m_resblks; + fsop.resblks_avail = mp->m_resblks_avail; + spin_unlock(&mp->m_sb_lock); + + if (copy_to_user(arg, &fsop, sizeof(fsop))) + return -EFAULT; + return 0; +} + +static int +xfs_ioctl_fs_counts( + struct xfs_mount *mp, + struct xfs_fsop_counts __user *uarg) +{ + struct xfs_fsop_counts out = { + .allocino = percpu_counter_read_positive(&mp->m_icount), + .freeino = percpu_counter_read_positive(&mp->m_ifree), + 
.freedata = percpu_counter_read_positive(&mp->m_fdblocks) - + xfs_fdblocks_unavailable(mp), + .freertx = percpu_counter_read_positive(&mp->m_frextents), + }; + + if (copy_to_user(uarg, &out, sizeof(out))) + return -EFAULT; + return 0; +} + /* * These long-unused ioctls were removed from the official ioctl API in 5.17, * but retain these definitions so that we can log warnings about them. @@ -2008,60 +2065,12 @@ xfs_file_ioctl( return error; } - case XFS_IOC_FSCOUNTS: { - xfs_fsop_counts_t out; + case XFS_IOC_FSCOUNTS: + return xfs_ioctl_fs_counts(mp, arg); - xfs_fs_counts(mp, &out); - - if (copy_to_user(arg, &out, sizeof(out))) - return -EFAULT; - return 0; - } - - case XFS_IOC_SET_RESBLKS: { - xfs_fsop_resblks_t inout; - uint64_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (xfs_is_readonly(mp)) - return -EROFS; - - if (copy_from_user(&inout, arg, sizeof(inout))) - return -EFAULT; - - error = mnt_want_write_file(filp); - if (error) - return error; - - /* input parameter is passed in resblks field of structure */ - in = inout.resblks; - error = xfs_reserve_blocks(mp, &in, &inout); - mnt_drop_write_file(filp); - if (error) - return error; - - if (copy_to_user(arg, &inout, sizeof(inout))) - return -EFAULT; - return 0; - } - - case XFS_IOC_GET_RESBLKS: { - xfs_fsop_resblks_t out; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - error = xfs_reserve_blocks(mp, NULL, &out); - if (error) - return error; - - if (copy_to_user(arg, &out, sizeof(out))) - return -EFAULT; - - return 0; - } + case XFS_IOC_SET_RESBLKS: + case XFS_IOC_GET_RESBLKS: + return xfs_ioctl_getset_resblocks(filp, cmd, arg); case XFS_IOC_FSGROWFSDATA: { struct xfs_growfs_data in; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ee206facf0dc..a1650fc81382 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1542,6 +1542,7 @@ xlog_alloc_log( log->l_covered_state = XLOG_STATE_COVER_IDLE; set_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); + INIT_LIST_HEAD(&log->r_dfops); log->l_prev_block = -1; /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index fa3ad1d7b31c..e30c06ec20e3 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -407,6 +407,7 @@ struct xlog { long l_opstate; /* operational state */ uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ struct list_head *l_buf_cancel_table; + struct list_head r_dfops; /* recovered log intent items */ int l_iclog_hsize; /* size of iclog header */ int l_iclog_heads; /* # of iclog header sectors */ uint l_sectBBsize; /* sector size in BBs (2^n) */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a1e18b24971a..1251c81e55f9 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1723,30 +1723,24 @@ xlog_clear_stale_blocks( */ void xlog_recover_release_intent( - struct xlog *log, - unsigned short intent_type, - uint64_t intent_id) + struct xlog *log, + unsigned short intent_type, + uint64_t intent_id) { - struct xfs_ail_cursor cur; - struct xfs_log_item *lip; - struct xfs_ail *ailp = log->l_ailp; + struct xfs_defer_pending *dfp, *n; + + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + struct xfs_log_item *lip = dfp->dfp_intent; - spin_lock(&ailp->ail_lock); - for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; - lip = xfs_trans_ail_cursor_next(ailp, &cur)) { if (lip->li_type != intent_type) continue; if (!lip->li_ops->iop_match(lip, intent_id)) continue; - 
spin_unlock(&ailp->ail_lock); - lip->li_ops->iop_release(lip); - spin_lock(&ailp->ail_lock); - break; - } + ASSERT(xlog_item_is_intent(lip)); - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); + xfs_defer_cancel_recovery(log->l_mp, dfp); + } } int @@ -1939,6 +1933,29 @@ xlog_buf_readahead( xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops); } +/* + * Create a deferred work structure for resuming and tracking the progress of a + * log intent item that was found during recovery. + */ +void +xlog_recover_intent_item( + struct xlog *log, + struct xfs_log_item *lip, + xfs_lsn_t lsn, + const struct xfs_defer_op_type *ops) +{ + ASSERT(xlog_item_is_intent(lip)); + + xfs_defer_start_recovery(lip, &log->r_dfops, ops); + + /* + * Insert the intent into the AIL directly and drop one reference so + * that finishing or canceling the work will drop the other. + */ + xfs_trans_ail_insert(log->l_ailp, lip, lsn); + lip->li_ops->iop_unpin(lip, 0); +} + STATIC int xlog_recover_items_pass2( struct xlog *log, @@ -2533,36 +2550,26 @@ xlog_abort_defer_ops( */ STATIC int xlog_recover_process_intents( - struct xlog *log) + struct xlog *log) { LIST_HEAD(capture_list); - struct xfs_ail_cursor cur; - struct xfs_log_item *lip; - struct xfs_ail *ailp; - int error = 0; + struct xfs_defer_pending *dfp, *n; + int error = 0; #if defined(DEBUG) || defined(XFS_WARN) - xfs_lsn_t last_lsn; -#endif + xfs_lsn_t last_lsn; - ailp = log->l_ailp; - spin_lock(&ailp->ail_lock); -#if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); #endif - for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - lip != NULL; - lip = xfs_trans_ail_cursor_next(ailp, &cur)) { - const struct xfs_item_ops *ops; - if (!xlog_item_is_intent(lip)) - break; + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + ASSERT(xlog_item_is_intent(dfp->dfp_intent)); /* * We should never see a redo item with a LSN higher than * the last transaction we found in the log at the start * of recovery. */ - ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0); + ASSERT(XFS_LSN_CMP(last_lsn, dfp->dfp_intent->li_lsn) >= 0); /* * NOTE: If your intent processing routine can create more @@ -2571,21 +2578,14 @@ xlog_recover_process_intents( * replayed in the wrong order! * * The recovery function can free the log item, so we must not - * access lip after it returns. + * access dfp->dfp_intent after it returns. It must dispose of + * @dfp if it returns 0. 
*/ - spin_unlock(&ailp->ail_lock); - ops = lip->li_ops; - error = ops->iop_recover(lip, &capture_list); - spin_lock(&ailp->ail_lock); - if (error) { - trace_xlog_intent_recovery_failed(log->l_mp, error, - ops->iop_recover); + error = xfs_defer_finish_recovery(log->l_mp, dfp, + &capture_list); + if (error) break; - } } - - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); if (error) goto err; @@ -2606,27 +2606,34 @@ err: */ STATIC void xlog_recover_cancel_intents( - struct xlog *log) + struct xlog *log) { - struct xfs_log_item *lip; - struct xfs_ail_cursor cur; - struct xfs_ail *ailp; - - ailp = log->l_ailp; - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - while (lip != NULL) { - if (!xlog_item_is_intent(lip)) - break; + struct xfs_defer_pending *dfp, *n; - spin_unlock(&ailp->ail_lock); - lip->li_ops->iop_release(lip); - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_next(ailp, &cur); + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + ASSERT(xlog_item_is_intent(dfp->dfp_intent)); + + xfs_defer_cancel_recovery(log->l_mp, dfp); } +} - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); +/* + * Transfer ownership of the recovered pending work to the recovery transaction + * and try to finish the work. If there is more work to be done, the dfp will + * remain attached to the transaction. If not, the dfp is freed. + */ +int +xlog_recover_finish_intent( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + int error; + + list_move(&dfp->dfp_list, &tp->t_dfops); + error = xfs_defer_finish_one(tp, dfp); + if (error == -EAGAIN) + return 0; + return error; } /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index aed5be5508fe..aabb25dc3efa 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -637,7 +637,6 @@ xfs_mountfs( struct xfs_sb *sbp = &(mp->m_sb); struct xfs_inode *rip; struct xfs_ino_geometry *igeo = M_IGEO(mp); - uint64_t resblks; uint quotamount = 0; uint quotaflags = 0; int error = 0; @@ -974,8 +973,7 @@ xfs_mountfs( * we were already there on the last unmount. Warn if this occurs. */ if (!xfs_is_readonly(mp)) { - resblks = xfs_default_resblks(mp); - error = xfs_reserve_blocks(mp, &resblks, NULL); + error = xfs_reserve_blocks(mp, xfs_default_resblks(mp)); if (error) xfs_warn(mp, "Unable to allocate reserve blocks. Continuing without reserve pool."); @@ -1053,7 +1051,6 @@ void xfs_unmountfs( struct xfs_mount *mp) { - uint64_t resblks; int error; /* @@ -1090,8 +1087,7 @@ xfs_unmountfs( * we only every apply deltas to the superblock and hence the incore * value does not matter.... */ - resblks = 0; - error = xfs_reserve_blocks(mp, &resblks, NULL); + error = xfs_reserve_blocks(mp, 0); if (error) xfs_warn(mp, "Unable to free reserved block pool. " "Freespace may not be correct on next mount."); diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index a7daa522e00f..fa50e5308292 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -22,6 +22,7 @@ #include <linux/mm.h> #include <linux/dax.h> +#include <linux/fs.h> struct xfs_failure_info { xfs_agblock_t startblock; @@ -73,10 +74,16 @@ xfs_dax_failure_fn( struct xfs_mount *mp = cur->bc_mp; struct xfs_inode *ip; struct xfs_failure_info *notify = data; + struct address_space *mapping; + pgoff_t pgoff; + unsigned long pgcnt; int error = 0; if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) { + /* Continue the query because this isn't a failure. 
*/ + if (notify->mf_flags & MF_MEM_PRE_REMOVE) + return 0; notify->want_shutdown = true; return 0; } @@ -92,15 +99,61 @@ xfs_dax_failure_fn( return 0; } - error = mf_dax_kill_procs(VFS_I(ip)->i_mapping, - xfs_failure_pgoff(mp, rec, notify), - xfs_failure_pgcnt(mp, rec, notify), - notify->mf_flags); + mapping = VFS_I(ip)->i_mapping; + pgoff = xfs_failure_pgoff(mp, rec, notify); + pgcnt = xfs_failure_pgcnt(mp, rec, notify); + + /* Continue the rmap query if the inode isn't a dax file. */ + if (dax_mapping(mapping)) + error = mf_dax_kill_procs(mapping, pgoff, pgcnt, + notify->mf_flags); + + /* Invalidate the cache in dax pages. */ + if (notify->mf_flags & MF_MEM_PRE_REMOVE) + invalidate_inode_pages2_range(mapping, pgoff, + pgoff + pgcnt - 1); + xfs_irele(ip); return error; } static int +xfs_dax_notify_failure_freeze( + struct xfs_mount *mp) +{ + struct super_block *sb = mp->m_super; + int error; + + error = freeze_super(sb, FREEZE_HOLDER_KERNEL); + if (error) + xfs_emerg(mp, "already frozen by kernel, err=%d", error); + + return error; +} + +static void +xfs_dax_notify_failure_thaw( + struct xfs_mount *mp, + bool kernel_frozen) +{ + struct super_block *sb = mp->m_super; + int error; + + if (kernel_frozen) { + error = thaw_super(sb, FREEZE_HOLDER_KERNEL); + if (error) + xfs_emerg(mp, "still frozen after notify failure, err=%d", + error); + } + + /* + * Also thaw userspace call anyway because the device is about to be + * removed immediately. + */ + thaw_super(sb, FREEZE_HOLDER_USERSPACE); +} + +static int xfs_dax_notify_ddev_failure( struct xfs_mount *mp, xfs_daddr_t daddr, @@ -112,15 +165,29 @@ xfs_dax_notify_ddev_failure( struct xfs_btree_cur *cur = NULL; struct xfs_buf *agf_bp = NULL; int error = 0; + bool kernel_frozen = false; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr); xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1); xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno); + if (mf_flags & MF_MEM_PRE_REMOVE) { + xfs_info(mp, "Device is about to be removed!"); + /* + * Freeze fs to prevent new mappings from being created. + * - Keep going on if others already hold the kernel forzen. + * - Keep going on if other errors too because this device is + * starting to fail. + * - If kernel frozen state is hold successfully here, thaw it + * here as well at the end. + */ + kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0; + } + error = xfs_trans_alloc_empty(mp, &tp); if (error) - return error; + goto out; for (; agno <= end_agno; agno++) { struct xfs_rmap_irec ri_low = { }; @@ -165,11 +232,26 @@ xfs_dax_notify_ddev_failure( } xfs_trans_cancel(tp); - if (error || notify.want_shutdown) { + + /* + * Shutdown fs from a force umount in pre-remove case which won't fail, + * so errors can be ignored. Otherwise, shutdown the filesystem with + * CORRUPT flag if error occured or notify.want_shutdown was set during + * RMAP querying. + */ + if (mf_flags & MF_MEM_PRE_REMOVE) + xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); + else if (error || notify.want_shutdown) { xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); if (!error) error = -EFSCORRUPTED; } + +out: + /* Thaw the fs if it has been frozen before. 
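
For orientation, a hedged outline of the MF_MEM_PRE_REMOVE handling added to xfs_notify_failure.c above; this summarizes the calls shown in the diff rather than being literal code:

/*
 * Rough pre-removal sequence for a failing pmem device:
 *
 *	xfs_dax_notify_ddev_failure(..., MF_MEM_PRE_REMOVE)
 *	    freeze_super(sb, FREEZE_HOLDER_KERNEL)         block new mappings
 *	    walk the rmap btrees:
 *	        mf_dax_kill_procs()                        notify mapping owners
 *	        invalidate_inode_pages2_range()            drop cached dax pages
 *	    xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT)
 *	    thaw_super(sb, FREEZE_HOLDER_KERNEL / _USERSPACE)
 */
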
*/ + if (mf_flags & MF_MEM_PRE_REMOVE) + xfs_dax_notify_failure_thaw(mp, kernel_frozen); + return error; } @@ -197,6 +279,14 @@ xfs_dax_notify_failure( if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev && mp->m_logdev_targp != mp->m_ddev_targp) { + /* + * In the pre-remove case the failure notification is attempting + * to trigger a force unmount. The expectation is that the + * device is still present, but its removal is in progress and + * can not be cancelled, proceed with accessing the log device. + */ + if (mf_flags & MF_MEM_PRE_REMOVE) + return 0; xfs_err(mp, "ondisk log corrupt, shutting down fs!"); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); return -EFSCORRUPTED; @@ -210,6 +300,12 @@ xfs_dax_notify_failure( ddev_start = mp->m_ddev_targp->bt_dax_part_off; ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1; + /* Notify failure on the whole device. */ + if (offset == 0 && len == U64_MAX) { + offset = ddev_start; + len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev); + } + /* Ignore the range out of filesystem area */ if (offset + len - 1 < ddev_start) return -ENXIO; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index dcc785fdd345..e0d56489f3b2 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -127,7 +127,10 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, } #define xfs_trans_dup_dqinfo(tp, tp2) #define xfs_trans_free_dqinfo(tp) -#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) do { } while (0) +static inline void xfs_trans_mod_dquot_byino(struct xfs_trans *tp, + struct xfs_inode *ip, uint field, int64_t delta) +{ +} #define xfs_trans_apply_dquot_deltas(tp) #define xfs_trans_unreserve_and_mod_dquots(tp) static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 2d4444d61e98..20ad8086da60 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -227,52 +227,6 @@ static const struct xfs_item_ops xfs_cud_item_ops = { .iop_intent = xfs_cud_item_intent, }; -static struct xfs_cud_log_item * -xfs_trans_get_cud( - struct xfs_trans *tp, - struct xfs_cui_log_item *cuip) -{ - struct xfs_cud_log_item *cudp; - - cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, - &xfs_cud_item_ops); - cudp->cud_cuip = cuip; - cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; - - xfs_trans_add_item(tp, &cudp->cud_item); - return cudp; -} - -/* - * Finish an refcount update and log it to the CUD. Note that the - * transaction is marked dirty regardless of whether the refcount - * update succeeds or fails to support the CUI/CUD lifecycle rules. - */ -static int -xfs_trans_log_finish_refcount_update( - struct xfs_trans *tp, - struct xfs_cud_log_item *cudp, - struct xfs_refcount_intent *ri, - struct xfs_btree_cur **pcur) -{ - int error; - - error = xfs_refcount_finish_one(tp, ri, pcur); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the CUI and frees the CUD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); - - return error; -} - /* Sort refcount intents by AG. 
*/ static int xfs_refcount_update_diff_items( @@ -318,9 +272,6 @@ xfs_refcount_update_log_item( uint next_extent; struct xfs_phys_extent *pmap; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from @@ -347,7 +298,6 @@ xfs_refcount_update_create_intent( ASSERT(count > 0); - xfs_trans_add_item(tp, &cuip->cui_item); if (sort) list_sort(mp, items, xfs_refcount_update_diff_items); list_for_each_entry(ri, items, ri_list) @@ -362,7 +312,16 @@ xfs_refcount_update_create_done( struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item; + struct xfs_cui_log_item *cuip = CUI_ITEM(intent); + struct xfs_cud_log_item *cudp; + + cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, + &xfs_cud_item_ops); + cudp->cud_cuip = cuip; + cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; + + return &cudp->cud_item; } /* Take a passive ref to the AG containing the space we're refcounting. */ @@ -397,10 +356,9 @@ xfs_refcount_update_finish_item( int error; ri = container_of(item, struct xfs_refcount_intent, ri_list); - error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri, - state); /* Did we run out of reservation? Requeue what we didn't finish. */ + error = xfs_refcount_finish_one(tp, ri, state); if (!error && ri->ri_blockcount > 0) { ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE || ri->ri_type == XFS_REFCOUNT_DECREASE); @@ -433,16 +391,6 @@ xfs_refcount_update_cancel_item( kmem_cache_free(xfs_refcount_intent_cache, ri); } -const struct xfs_defer_op_type xfs_refcount_update_defer_type = { - .max_items = XFS_CUI_MAX_FAST_EXTENTS, - .create_intent = xfs_refcount_update_create_intent, - .abort_intent = xfs_refcount_update_abort_intent, - .create_done = xfs_refcount_update_create_done, - .finish_item = xfs_refcount_update_finish_item, - .finish_cleanup = xfs_refcount_finish_one_cleanup, - .cancel_item = xfs_refcount_update_cancel_item, -}; - /* Is this recovered CUI ok? */ static inline bool xfs_cui_validate_phys( @@ -468,23 +416,38 @@ xfs_cui_validate_phys( return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len); } +static inline void +xfs_cui_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_phys_extent *pmap) +{ + struct xfs_refcount_intent *ri; + + ri = kmem_cache_alloc(xfs_refcount_intent_cache, + GFP_NOFS | __GFP_NOFAIL); + ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; + ri->ri_startblock = pmap->pe_startblock; + ri->ri_blockcount = pmap->pe_len; + xfs_refcount_update_get_group(mp, ri); + + xfs_defer_add_item(dfp, &ri->ri_list); +} + /* * Process a refcount update intent item that was recovered from the log. * We need to update the refcountbt. 
*/ STATIC int -xfs_cui_item_recover( - struct xfs_log_item *lip, +xfs_refcount_recover_work( + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_cui_log_item *cuip = CUI_ITEM(lip); - struct xfs_cud_log_item *cudp; struct xfs_trans *tp; - struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; - unsigned int refc_type; - bool requeue_only = false; int i; int error = 0; @@ -501,6 +464,8 @@ xfs_cui_item_recover( sizeof(cuip->cui_format)); return -EFSCORRUPTED; } + + xfs_cui_recover_work(mp, dfp, &cuip->cui_format.cui_extents[i]); } /* @@ -521,100 +486,28 @@ xfs_cui_item_recover( if (error) return error; - cudp = xfs_trans_get_cud(tp, cuip); - - for (i = 0; i < cuip->cui_format.cui_nextents; i++) { - struct xfs_refcount_intent fake = { }; - struct xfs_phys_extent *pmap; - - pmap = &cuip->cui_format.cui_extents[i]; - refc_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; - switch (refc_type) { - case XFS_REFCOUNT_INCREASE: - case XFS_REFCOUNT_DECREASE: - case XFS_REFCOUNT_ALLOC_COW: - case XFS_REFCOUNT_FREE_COW: - fake.ri_type = refc_type; - break; - default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &cuip->cui_format, - sizeof(cuip->cui_format)); - error = -EFSCORRUPTED; - goto abort_error; - } - - fake.ri_startblock = pmap->pe_startblock; - fake.ri_blockcount = pmap->pe_len; - - if (!requeue_only) { - xfs_refcount_update_get_group(mp, &fake); - error = xfs_trans_log_finish_refcount_update(tp, cudp, - &fake, &rcur); - xfs_refcount_update_put_group(&fake); - } - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &cuip->cui_format, - sizeof(cuip->cui_format)); - if (error) - goto abort_error; - - /* Requeue what we didn't finish. */ - if (fake.ri_blockcount > 0) { - struct xfs_bmbt_irec irec = { - .br_startblock = fake.ri_startblock, - .br_blockcount = fake.ri_blockcount, - }; - - switch (fake.ri_type) { - case XFS_REFCOUNT_INCREASE: - xfs_refcount_increase_extent(tp, &irec); - break; - case XFS_REFCOUNT_DECREASE: - xfs_refcount_decrease_extent(tp, &irec); - break; - case XFS_REFCOUNT_ALLOC_COW: - xfs_refcount_alloc_cow_extent(tp, - irec.br_startblock, - irec.br_blockcount); - break; - case XFS_REFCOUNT_FREE_COW: - xfs_refcount_free_cow_extent(tp, - irec.br_startblock, - irec.br_blockcount); - break; - default: - ASSERT(0); - } - requeue_only = true; - } - } + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &cuip->cui_format, + sizeof(cuip->cui_format)); + if (error) + goto abort_error; - xfs_refcount_finish_one_cleanup(tp, rcur, error); return xfs_defer_ops_capture_and_commit(tp, capture_list); abort_error: - xfs_refcount_finish_one_cleanup(tp, rcur, error); xfs_trans_cancel(tp); return error; } -STATIC bool -xfs_cui_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return CUI_ITEM(lip)->cui_format.cui_id == intent_id; -} - /* Relog an intent item to push the log tail forward. 
*/ static struct xfs_log_item * -xfs_cui_item_relog( +xfs_refcount_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_cud_log_item *cudp; struct xfs_cui_log_item *cuip; struct xfs_phys_extent *pmap; unsigned int count; @@ -622,27 +515,41 @@ xfs_cui_item_relog( count = CUI_ITEM(intent)->cui_format.cui_nextents; pmap = CUI_ITEM(intent)->cui_format.cui_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); - cuip = xfs_cui_init(tp->t_mountp, count); memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); atomic_set(&cuip->cui_next_extent, count); - xfs_trans_add_item(tp, &cuip->cui_item); - set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); + return &cuip->cui_item; } +const struct xfs_defer_op_type xfs_refcount_update_defer_type = { + .name = "refcount", + .max_items = XFS_CUI_MAX_FAST_EXTENTS, + .create_intent = xfs_refcount_update_create_intent, + .abort_intent = xfs_refcount_update_abort_intent, + .create_done = xfs_refcount_update_create_done, + .finish_item = xfs_refcount_update_finish_item, + .finish_cleanup = xfs_refcount_finish_one_cleanup, + .cancel_item = xfs_refcount_update_cancel_item, + .recover_work = xfs_refcount_recover_work, + .relog_intent = xfs_refcount_relog_intent, +}; + +STATIC bool +xfs_cui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return CUI_ITEM(lip)->cui_format.cui_id == intent_id; +} + static const struct xfs_item_ops xfs_cui_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_cui_item_size, .iop_format = xfs_cui_item_format, .iop_unpin = xfs_cui_item_unpin, .iop_release = xfs_cui_item_release, - .iop_recover = xfs_cui_item_recover, .iop_match = xfs_cui_item_match, - .iop_relog = xfs_cui_item_relog, }; static inline void @@ -696,12 +603,9 @@ xlog_recover_cui_commit_pass2( cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); xfs_cui_copy_format(&cuip->cui_format, cui_formatp); atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. 
- */ - xfs_trans_ail_insert(log->l_ailp, &cuip->cui_item, lsn); - xfs_cui_release(cuip); + + xlog_recover_intent_item(log, &cuip->cui_item, lsn, + &xfs_refcount_update_defer_type); return 0; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e5b62dc28466..d5ca8bcae65b 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -618,7 +618,7 @@ xfs_reflink_cancel_cow_blocks( error = xfs_free_extent_later(*tpp, del.br_startblock, del.br_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) break; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 0e0e747028da..79ad0087aeca 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -225,23 +225,6 @@ static const struct xfs_item_ops xfs_rud_item_ops = { .iop_intent = xfs_rud_item_intent, }; -static struct xfs_rud_log_item * -xfs_trans_get_rud( - struct xfs_trans *tp, - struct xfs_rui_log_item *ruip) -{ - struct xfs_rud_log_item *rudp; - - rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, - &xfs_rud_item_ops); - rudp->rud_ruip = ruip; - rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; - - xfs_trans_add_item(tp, &rudp->rud_item); - return rudp; -} - /* Set the map extent flags for this reverse mapping. */ static void xfs_trans_set_rmap_flags( @@ -285,35 +268,6 @@ xfs_trans_set_rmap_flags( } } -/* - * Finish an rmap update and log it to the RUD. Note that the transaction is - * marked dirty regardless of whether the rmap update succeeds or fails to - * support the RUI/RUD lifecycle rules. - */ -static int -xfs_trans_log_finish_rmap_update( - struct xfs_trans *tp, - struct xfs_rud_log_item *rudp, - struct xfs_rmap_intent *ri, - struct xfs_btree_cur **pcur) -{ - int error; - - error = xfs_rmap_finish_one(tp, ri, pcur); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the RUI and frees the RUD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); - - return error; -} - /* Sort rmap intents by AG. */ static int xfs_rmap_update_diff_items( @@ -340,9 +294,6 @@ xfs_rmap_update_log_item( uint next_extent; struct xfs_map_extent *map; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from @@ -372,7 +323,6 @@ xfs_rmap_update_create_intent( ASSERT(count > 0); - xfs_trans_add_item(tp, &ruip->rui_item); if (sort) list_sort(mp, items, xfs_rmap_update_diff_items); list_for_each_entry(ri, items, ri_list) @@ -387,7 +337,16 @@ xfs_rmap_update_create_done( struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item; + struct xfs_rui_log_item *ruip = RUI_ITEM(intent); + struct xfs_rud_log_item *rudp; + + rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, + &xfs_rud_item_ops); + rudp->rud_ruip = ruip; + rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; + + return &rudp->rud_item; } /* Take a passive ref to the AG containing the space we're rmapping. 
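
The refcount item conversion just shown (the rmap item follows the same pattern below) illustrates the new split of responsibilities for log intent items: recovery and relogging move off the log item ops (->iop_recover and ->iop_relog are gone) and onto the defer op type, and pass-2 recovery registers the intent through xlog_recover_intent_item() instead of inserting it into the AIL by hand. A hedged sketch of the resulting table; every example_* identifier is a placeholder rather than code from the patch:

const struct xfs_defer_op_type example_update_defer_type = {
	.name		= "example",
	.max_items	= 16,
	.create_intent	= example_create_intent,
	.abort_intent	= example_abort_intent,
	.create_done	= example_create_done,
	.finish_item	= example_finish_item,
	.cancel_item	= example_cancel_item,
	.recover_work	= example_recover_work,	/* rebuild and finish recovered work */
	.relog_intent	= example_relog_intent,	/* reinsert the intent to push the log tail */
};
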
*/ @@ -423,8 +382,7 @@ xfs_rmap_update_finish_item( ri = container_of(item, struct xfs_rmap_intent, ri_list); - error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri, - state); + error = xfs_rmap_finish_one(tp, ri, state); xfs_rmap_update_put_group(ri); kmem_cache_free(xfs_rmap_intent_cache, ri); @@ -452,16 +410,6 @@ xfs_rmap_update_cancel_item( kmem_cache_free(xfs_rmap_intent_cache, ri); } -const struct xfs_defer_op_type xfs_rmap_update_defer_type = { - .max_items = XFS_RUI_MAX_FAST_EXTENTS, - .create_intent = xfs_rmap_update_create_intent, - .abort_intent = xfs_rmap_update_abort_intent, - .create_done = xfs_rmap_update_create_done, - .finish_item = xfs_rmap_update_finish_item, - .finish_cleanup = xfs_rmap_finish_one_cleanup, - .cancel_item = xfs_rmap_update_cancel_item, -}; - /* Is this recovered RUI ok? */ static inline bool xfs_rui_validate_map( @@ -498,20 +446,72 @@ xfs_rui_validate_map( return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } +static inline void +xfs_rui_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + const struct xfs_map_extent *map) +{ + struct xfs_rmap_intent *ri; + + ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); + + switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + case XFS_RMAP_EXTENT_MAP: + ri->ri_type = XFS_RMAP_MAP; + break; + case XFS_RMAP_EXTENT_MAP_SHARED: + ri->ri_type = XFS_RMAP_MAP_SHARED; + break; + case XFS_RMAP_EXTENT_UNMAP: + ri->ri_type = XFS_RMAP_UNMAP; + break; + case XFS_RMAP_EXTENT_UNMAP_SHARED: + ri->ri_type = XFS_RMAP_UNMAP_SHARED; + break; + case XFS_RMAP_EXTENT_CONVERT: + ri->ri_type = XFS_RMAP_CONVERT; + break; + case XFS_RMAP_EXTENT_CONVERT_SHARED: + ri->ri_type = XFS_RMAP_CONVERT_SHARED; + break; + case XFS_RMAP_EXTENT_ALLOC: + ri->ri_type = XFS_RMAP_ALLOC; + break; + case XFS_RMAP_EXTENT_FREE: + ri->ri_type = XFS_RMAP_FREE; + break; + default: + ASSERT(0); + return; + } + + ri->ri_owner = map->me_owner; + ri->ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + ri->ri_bmap.br_startblock = map->me_startblock; + ri->ri_bmap.br_startoff = map->me_startoff; + ri->ri_bmap.br_blockcount = map->me_len; + ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + xfs_rmap_update_get_group(mp, ri); + + xfs_defer_add_item(dfp, &ri->ri_list); +} + /* * Process an rmap update intent item that was recovered from the log. * We need to update the rmapbt. 
*/ STATIC int -xfs_rui_item_recover( - struct xfs_log_item *lip, +xfs_rmap_recover_work( + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_rui_log_item *ruip = RUI_ITEM(lip); - struct xfs_rud_log_item *rudp; struct xfs_trans *tp; - struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; int i; int error = 0; @@ -529,6 +529,8 @@ xfs_rui_item_recover( sizeof(ruip->rui_format)); return -EFSCORRUPTED; } + + xfs_rui_recover_work(mp, dfp, &ruip->rui_format.rui_extents[i]); } resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); @@ -536,91 +538,29 @@ xfs_rui_item_recover( XFS_TRANS_RESERVE, &tp); if (error) return error; - rudp = xfs_trans_get_rud(tp, ruip); - for (i = 0; i < ruip->rui_format.rui_nextents; i++) { - struct xfs_rmap_intent fake = { }; - struct xfs_map_extent *map; - - map = &ruip->rui_format.rui_extents[i]; - switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { - case XFS_RMAP_EXTENT_MAP: - fake.ri_type = XFS_RMAP_MAP; - break; - case XFS_RMAP_EXTENT_MAP_SHARED: - fake.ri_type = XFS_RMAP_MAP_SHARED; - break; - case XFS_RMAP_EXTENT_UNMAP: - fake.ri_type = XFS_RMAP_UNMAP; - break; - case XFS_RMAP_EXTENT_UNMAP_SHARED: - fake.ri_type = XFS_RMAP_UNMAP_SHARED; - break; - case XFS_RMAP_EXTENT_CONVERT: - fake.ri_type = XFS_RMAP_CONVERT; - break; - case XFS_RMAP_EXTENT_CONVERT_SHARED: - fake.ri_type = XFS_RMAP_CONVERT_SHARED; - break; - case XFS_RMAP_EXTENT_ALLOC: - fake.ri_type = XFS_RMAP_ALLOC; - break; - case XFS_RMAP_EXTENT_FREE: - fake.ri_type = XFS_RMAP_FREE; - break; - default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &ruip->rui_format, - sizeof(ruip->rui_format)); - error = -EFSCORRUPTED; - goto abort_error; - } - - fake.ri_owner = map->me_owner; - fake.ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - fake.ri_bmap.br_startblock = map->me_startblock; - fake.ri_bmap.br_startoff = map->me_startoff; - fake.ri_bmap.br_blockcount = map->me_len; - fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - - xfs_rmap_update_get_group(mp, &fake); - error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake, - &rcur); - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - map, sizeof(*map)); - xfs_rmap_update_put_group(&fake); - if (error) - goto abort_error; - - } + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &ruip->rui_format, + sizeof(ruip->rui_format)); + if (error) + goto abort_error; - xfs_rmap_finish_one_cleanup(tp, rcur, error); return xfs_defer_ops_capture_and_commit(tp, capture_list); abort_error: - xfs_rmap_finish_one_cleanup(tp, rcur, error); xfs_trans_cancel(tp); return error; } -STATIC bool -xfs_rui_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return RUI_ITEM(lip)->rui_format.rui_id == intent_id; -} - /* Relog an intent item to push the log tail forward. 
*/ static struct xfs_log_item * -xfs_rui_item_relog( +xfs_rmap_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_rud_log_item *rudp; struct xfs_rui_log_item *ruip; struct xfs_map_extent *map; unsigned int count; @@ -628,27 +568,41 @@ xfs_rui_item_relog( count = RUI_ITEM(intent)->rui_format.rui_nextents; map = RUI_ITEM(intent)->rui_format.rui_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); - ruip = xfs_rui_init(tp->t_mountp, count); memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); atomic_set(&ruip->rui_next_extent, count); - xfs_trans_add_item(tp, &ruip->rui_item); - set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); + return &ruip->rui_item; } +const struct xfs_defer_op_type xfs_rmap_update_defer_type = { + .name = "rmap", + .max_items = XFS_RUI_MAX_FAST_EXTENTS, + .create_intent = xfs_rmap_update_create_intent, + .abort_intent = xfs_rmap_update_abort_intent, + .create_done = xfs_rmap_update_create_done, + .finish_item = xfs_rmap_update_finish_item, + .finish_cleanup = xfs_rmap_finish_one_cleanup, + .cancel_item = xfs_rmap_update_cancel_item, + .recover_work = xfs_rmap_recover_work, + .relog_intent = xfs_rmap_relog_intent, +}; + +STATIC bool +xfs_rui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return RUI_ITEM(lip)->rui_format.rui_id == intent_id; +} + static const struct xfs_item_ops xfs_rui_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_rui_item_size, .iop_format = xfs_rui_item_format, .iop_unpin = xfs_rui_item_unpin, .iop_release = xfs_rui_item_release, - .iop_recover = xfs_rui_item_recover, .iop_match = xfs_rui_item_match, - .iop_relog = xfs_rui_item_relog, }; static inline void @@ -702,12 +656,9 @@ xlog_recover_rui_commit_pass2( ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); xfs_rui_copy_format(&ruip->rui_format, rui_formatp); atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. - */ - xfs_trans_ail_insert(log->l_ailp, &ruip->rui_item, lsn); - xfs_rui_release(ruip); + + xlog_recover_intent_item(log, &ruip->rui_item, lsn, + &xfs_rmap_update_defer_type); return 0; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 88c48de5c9c8..8649d981a097 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -14,28 +14,14 @@ #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bmap_btree.h" +#include "xfs_bmap_util.h" #include "xfs_trans.h" #include "xfs_trans_space.h" #include "xfs_icache.h" #include "xfs_rtalloc.h" #include "xfs_sb.h" #include "xfs_rtbitmap.h" - -/* - * Read and return the summary information for a given extent size, - * bitmap block combination. - * Keeps track of a current summary block, so we don't keep reading - * it from the buffer cache. - */ -static int -xfs_rtget_summary( - struct xfs_rtalloc_args *args, - int log, /* log2 of extent size */ - xfs_fileoff_t bbno, /* bitmap block number */ - xfs_suminfo_t *sum) /* out: summary info for this block */ -{ - return xfs_rtmodify_summary_int(args, log, bbno, 0, sum); -} +#include "xfs_quota.h" /* * Return whether there are any free extents in the size range given @@ -154,56 +140,55 @@ xfs_rtallocate_range( * properly update the summary. 
*/ error = xfs_rtfind_back(args, start, 0, &preblock); - if (error) { + if (error) return error; - } + /* * Find the next allocated block (end of free extent). */ error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1, &postblock); - if (error) { + if (error) return error; - } + /* * Decrement the summary information corresponding to the entire * (old) free extent. */ error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(postblock + 1 - preblock), + xfs_highbit64(postblock + 1 - preblock), xfs_rtx_to_rbmblock(mp, preblock), -1); - if (error) { + if (error) return error; - } + /* * If there are blocks not being allocated at the front of the * old extent, add summary data for them to be free. */ if (preblock < start) { error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(start - preblock), + xfs_highbit64(start - preblock), xfs_rtx_to_rbmblock(mp, preblock), 1); - if (error) { + if (error) return error; - } } + /* * If there are blocks not being allocated at the end of the * old extent, add summary data for them to be free. */ if (postblock > end) { error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(postblock - end), + xfs_highbit64(postblock - end), xfs_rtx_to_rbmblock(mp, end + 1), 1); - if (error) { + if (error) return error; - } } + /* * Modify the bitmap to mark this extent allocated. */ - error = xfs_rtmodify_range(args, start, len, 0); - return error; + return xfs_rtmodify_range(args, start, len, 0); } /* @@ -265,21 +250,17 @@ xfs_rtallocate_extent_block( * If it's not so then next will contain the first non-free. */ error = xfs_rtcheck_range(args, i, maxlen, 1, &next, &stat); - if (error) { + if (error) return error; - } if (stat) { /* * i for maxlen is all free, allocate and return that. */ - error = xfs_rtallocate_range(args, i, maxlen); - if (error) { - return error; - } - *len = maxlen; - *rtx = i; - return 0; + bestlen = maxlen; + besti = i; + goto allocate; } + /* * In the case where we have a variable-sized allocation * request, figure out how big this free piece is, @@ -298,45 +279,44 @@ xfs_rtallocate_extent_block( /* * If not done yet, find the start of the next free space. */ - if (next < end) { - error = xfs_rtfind_forw(args, next, end, &i); - if (error) { - return error; - } - } else + if (next >= end) break; + error = xfs_rtfind_forw(args, next, end, &i); + if (error) + return error; } + /* * Searched the whole thing & didn't find a maxlen free extent. */ - if (minlen < maxlen && besti != -1) { - xfs_rtxlen_t p; /* amount to trim length by */ - + if (minlen > maxlen || besti == -1) { /* - * If size should be a multiple of prod, make that so. + * Allocation failed. Set *nextp to the next block to try. */ - if (prod > 1) { - div_u64_rem(bestlen, prod, &p); - if (p) - bestlen -= p; - } + *nextp = next; + return -ENOSPC; + } - /* - * Allocate besti for bestlen & return that. - */ - error = xfs_rtallocate_range(args, besti, bestlen); - if (error) { - return error; - } - *len = bestlen; - *rtx = besti; - return 0; + /* + * If size should be a multiple of prod, make that so. + */ + if (prod > 1) { + xfs_rtxlen_t p; /* amount to trim length by */ + + div_u64_rem(bestlen, prod, &p); + if (p) + bestlen -= p; } + /* - * Allocation failed. Set *nextp to the next block to try. + * Allocate besti for bestlen & return that. 
*/ - *nextp = next; - *rtx = NULLRTEXTNO; +allocate: + error = xfs_rtallocate_range(args, besti, bestlen); + if (error) + return error; + *len = bestlen; + *rtx = besti; return 0; } @@ -367,52 +347,33 @@ xfs_rtallocate_extent_exact( * Check if the range in question (for maxlen) is free. */ error = xfs_rtcheck_range(args, start, maxlen, 1, &next, &isfree); - if (error) { + if (error) return error; - } - if (isfree) { + + if (!isfree) { /* - * If it is, allocate it and return success. + * If not, allocate what there is, if it's at least minlen. */ - error = xfs_rtallocate_range(args, start, maxlen); - if (error) { - return error; - } - *len = maxlen; - *rtx = start; - return 0; - } - /* - * If not, allocate what there is, if it's at least minlen. - */ - maxlen = next - start; - if (maxlen < minlen) { + maxlen = next - start; + if (maxlen < minlen) + return -ENOSPC; + /* - * Failed, return failure status. + * Trim off tail of extent, if prod is specified. */ - *rtx = NULLRTEXTNO; - return 0; - } - /* - * Trim off tail of extent, if prod is specified. - */ - if (prod > 1 && (i = maxlen % prod)) { - maxlen -= i; - if (maxlen < minlen) { - /* - * Now we can't do it, return failure status. - */ - *rtx = NULLRTEXTNO; - return 0; + if (prod > 1 && (i = maxlen % prod)) { + maxlen -= i; + if (maxlen < minlen) + return -ENOSPC; } } + /* * Allocate what we can and return it. */ error = xfs_rtallocate_range(args, start, maxlen); - if (error) { + if (error) return error; - } *len = maxlen; *rtx = start; return 0; @@ -441,7 +402,6 @@ xfs_rtallocate_extent_near( int j; /* secondary loop control */ int log2len; /* log2 of minlen */ xfs_rtxnum_t n; /* next rtext to try */ - xfs_rtxnum_t r; /* result rtext */ ASSERT(minlen % prod == 0); ASSERT(maxlen % prod == 0); @@ -455,26 +415,18 @@ xfs_rtallocate_extent_near( /* Make sure we don't run off the end of the rt volume. */ maxlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod); - if (maxlen < minlen) { - *rtx = NULLRTEXTNO; - return 0; - } + if (maxlen < minlen) + return -ENOSPC; /* * Try the exact allocation first. */ error = xfs_rtallocate_extent_exact(args, start, minlen, maxlen, len, - prod, &r); - if (error) { + prod, rtx); + if (error != -ENOSPC) return error; - } - /* - * If the exact allocation worked, return that. - */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } + + bbno = xfs_rtx_to_rbmblock(mp, start); i = 0; j = -1; @@ -490,9 +442,9 @@ xfs_rtallocate_extent_near( */ error = xfs_rtany_summary(args, log2len, mp->m_rsumlevels - 1, bbno + i, &maxlog); - if (error) { + if (error) return error; - } + /* * If there are any useful extents starting here, try * allocating one. @@ -511,17 +463,9 @@ xfs_rtallocate_extent_near( */ error = xfs_rtallocate_extent_block(args, bbno + i, minlen, maxavail, len, - &n, prod, &r); - if (error) { + &n, prod, rtx); + if (error != -ENOSPC) return error; - } - /* - * If it worked, return it. - */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } } /* * On the negative side of the starting location. @@ -555,17 +499,9 @@ xfs_rtallocate_extent_near( error = xfs_rtallocate_extent_block(args, bbno + j, minlen, maxavail, len, &n, prod, - &r); - if (error) { + rtx); + if (error != -ENOSPC) return error; - } - /* - * If it works, return the extent. 
- */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } } } } @@ -599,8 +535,53 @@ xfs_rtallocate_extent_near( else break; } - *rtx = NULLRTEXTNO; - return 0; + return -ENOSPC; +} + +static int +xfs_rtalloc_sumlevel( + struct xfs_rtalloc_args *args, + int l, /* level number */ + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxlen_t *len, /* out: actual length allocated */ + xfs_rtxnum_t *rtx) /* out: start rtext allocated */ +{ + xfs_fileoff_t i; /* bitmap block number */ + + for (i = 0; i < args->mp->m_sb.sb_rbmblocks; i++) { + xfs_suminfo_t sum; /* summary information for extents */ + xfs_rtxnum_t n; /* next rtext to be tried */ + int error; + + error = xfs_rtget_summary(args, l, i, &sum); + if (error) + return error; + + /* + * Nothing there, on to the next block. + */ + if (!sum) + continue; + + /* + * Try allocating the extent. + */ + error = xfs_rtallocate_extent_block(args, i, minlen, maxlen, + len, &n, prod, rtx); + if (error != -ENOSPC) + return error; + + /* + * If the "next block to try" returned from the allocator is + * beyond the next bitmap block, skip to that bitmap block. + */ + if (xfs_rtx_to_rbmblock(args->mp, n) > i + 1) + i = xfs_rtx_to_rbmblock(args->mp, n) - 1; + } + + return -ENOSPC; } /* @@ -617,13 +598,8 @@ xfs_rtallocate_extent_size( xfs_rtxlen_t prod, /* extent product factor */ xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { - struct xfs_mount *mp = args->mp; int error; - xfs_fileoff_t i; /* bitmap block number */ int l; /* level number (loop control) */ - xfs_rtxnum_t n; /* next rtext to be tried */ - xfs_rtxnum_t r; /* result rtext number */ - xfs_suminfo_t sum; /* summary information for extents */ ASSERT(minlen % prod == 0); ASSERT(maxlen % prod == 0); @@ -631,119 +607,46 @@ xfs_rtallocate_extent_size( /* * Loop over all the levels starting with maxlen. - * At each level, look at all the bitmap blocks, to see if there - * are extents starting there that are long enough (>= maxlen). - * Note, only on the initial level can the allocation fail if - * the summary says there's an extent. + * + * At each level, look at all the bitmap blocks, to see if there are + * extents starting there that are long enough (>= maxlen). + * + * Note, only on the initial level can the allocation fail if the + * summary says there's an extent. */ - for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) { - /* - * Loop over all the bitmap blocks. - */ - for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { - /* - * Get the summary for this level/block. - */ - error = xfs_rtget_summary(args, l, i, &sum); - if (error) { - return error; - } - /* - * Nothing there, on to the next block. - */ - if (!sum) - continue; - /* - * Try allocating the extent. - */ - error = xfs_rtallocate_extent_block(args, i, maxlen, - maxlen, len, &n, prod, &r); - if (error) { - return error; - } - /* - * If it worked, return that. - */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } - /* - * If the "next block to try" returned from the - * allocator is beyond the next bitmap block, - * skip to that bitmap block. - */ - if (xfs_rtx_to_rbmblock(mp, n) > i + 1) - i = xfs_rtx_to_rbmblock(mp, n) - 1; - } + for (l = xfs_highbit32(maxlen); l < args->mp->m_rsumlevels; l++) { + error = xfs_rtalloc_sumlevel(args, l, minlen, maxlen, prod, len, + rtx); + if (error != -ENOSPC) + return error; } + /* - * Didn't find any maxlen blocks. 
Try smaller ones, unless - * we're asking for a fixed size extent. + * Didn't find any maxlen blocks. Try smaller ones, unless we are + * looking for a fixed size extent. */ - if (minlen > --maxlen) { - *rtx = NULLRTEXTNO; - return 0; - } + if (minlen > --maxlen) + return -ENOSPC; ASSERT(minlen != 0); ASSERT(maxlen != 0); /* * Loop over sizes, from maxlen down to minlen. - * This time, when we do the allocations, allow smaller ones - * to succeed. + * + * This time, when we do the allocations, allow smaller ones to succeed, + * but make sure the specified minlen/maxlen are in the possible range + * for this summary level. */ for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) { - /* - * Loop over all the bitmap blocks, try an allocation - * starting in that block. - */ - for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { - /* - * Get the summary information for this level/block. - */ - error = xfs_rtget_summary(args, l, i, &sum); - if (error) { - return error; - } - /* - * If nothing there, go on to next. - */ - if (!sum) - continue; - /* - * Try the allocation. Make sure the specified - * minlen/maxlen are in the possible range for - * this summary level. - */ - error = xfs_rtallocate_extent_block(args, i, - XFS_RTMAX(minlen, 1 << l), - XFS_RTMIN(maxlen, (1 << (l + 1)) - 1), - len, &n, prod, &r); - if (error) { - return error; - } - /* - * If it worked, return that extent. - */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } - /* - * If the "next block to try" returned from the - * allocator is beyond the next bitmap block, - * skip to that bitmap block. - */ - if (xfs_rtx_to_rbmblock(mp, n) > i + 1) - i = xfs_rtx_to_rbmblock(mp, n) - 1; - } + error = xfs_rtalloc_sumlevel(args, l, + max_t(xfs_rtxlen_t, minlen, 1 << l), + min_t(xfs_rtxlen_t, maxlen, (1 << (l + 1)) - 1), + prod, len, rtx); + if (error != -ENOSPC) + return error; } - /* - * Got nothing, return failure. - */ - *rtx = NULLRTEXTNO; - return 0; + + return -ENOSPC; } /* @@ -963,8 +866,10 @@ xfs_growfs_rt( */ nrextents = nrblocks; do_div(nrextents, in->extsize); + if (!xfs_validate_rtextents(nrextents)) + return -EINVAL; nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents); - nrextslog = xfs_highbit32(nrextents); + nrextslog = xfs_compute_rextslog(nrextents); nrsumlevels = nrextslog + 1; nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nrbmblocks); nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); @@ -1031,11 +936,14 @@ xfs_growfs_rt( nsbp->sb_rblocks = min(nrblocks, nrblocks_step); nsbp->sb_rextents = xfs_rtb_to_rtx(nmp, nsbp->sb_rblocks); ASSERT(nsbp->sb_rextents != 0); - nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); + nsbp->sb_rextslog = xfs_compute_rextslog(nsbp->sb_rextents); nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nsbp->sb_rbmblocks); nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); + /* recompute growfsrt reservation from new rsumsize */ + xfs_trans_resv_calc(nmp, &nmp->m_resv); + /* * Start a transaction, get the log reservation. */ @@ -1122,6 +1030,8 @@ error_cancel: */ mp->m_rsumlevels = nrsumlevels; mp->m_rsumsize = nrsumsize; + /* recompute growfsrt reservation from new rsumsize */ + xfs_trans_resv_calc(mp, &mp->m_resv); error = xfs_trans_commit(tp); if (error) @@ -1160,81 +1070,6 @@ out_free: } /* - * Allocate an extent in the realtime subvolume, with the usual allocation - * parameters. The length units are all in realtime extents, as is the - * result block number. 
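
A consequence of the realtime allocator rework above: xfs_rtallocate_extent_exact(), _block(), _near() and _size() now report "no suitable extent" as -ENOSPC instead of storing NULLRTEXTNO through the result pointer, so callers can tell a soft miss from a hard error and fall back to another strategy. A simplified, hedged illustration of such a cascade; the real fallback logic (alignment and locality retries) lives in xfs_bmap_rtalloc() further down:

	error = xfs_rtallocate_extent_near(&args, start, raminlen, ralen,
			&ralen, prod, &rtx);
	if (error == -ENOSPC) {
		/* Nothing near the locality hint; search by size instead. */
		error = xfs_rtallocate_extent_size(&args, raminlen, ralen,
				&ralen, prod, &rtx);
	}
	if (error)
		return error;	/* a final -ENOSPC means the rt volume is full */
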
- */ -int -xfs_rtallocate_extent( - struct xfs_trans *tp, - xfs_rtxnum_t start, /* starting rtext number to allocate */ - xfs_rtxlen_t minlen, /* minimum length to allocate */ - xfs_rtxlen_t maxlen, /* maximum length to allocate */ - xfs_rtxlen_t *len, /* out: actual length allocated */ - int wasdel, /* was a delayed allocation extent */ - xfs_rtxlen_t prod, /* extent product factor */ - xfs_rtxnum_t *rtblock) /* out: start rtext allocated */ -{ - struct xfs_rtalloc_args args = { - .mp = tp->t_mountp, - .tp = tp, - }; - int error; /* error value */ - xfs_rtxnum_t r; /* result allocated rtext */ - - ASSERT(xfs_isilocked(args.mp->m_rbmip, XFS_ILOCK_EXCL)); - ASSERT(minlen > 0 && minlen <= maxlen); - - /* - * If prod is set then figure out what to do to minlen and maxlen. - */ - if (prod > 1) { - xfs_rtxlen_t i; - - if ((i = maxlen % prod)) - maxlen -= i; - if ((i = minlen % prod)) - minlen += prod - i; - if (maxlen < minlen) { - *rtblock = NULLRTEXTNO; - return 0; - } - } - -retry: - if (start == 0) { - error = xfs_rtallocate_extent_size(&args, minlen, - maxlen, len, prod, &r); - } else { - error = xfs_rtallocate_extent_near(&args, start, minlen, - maxlen, len, prod, &r); - } - - xfs_rtbuf_cache_relse(&args); - if (error) - return error; - - /* - * If it worked, update the superblock. - */ - if (r != NULLRTEXTNO) { - long slen = (long)*len; - - ASSERT(*len >= minlen && *len <= maxlen); - if (wasdel) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen); - else - xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen); - } else if (prod > 1) { - prod = 1; - goto retry; - } - - *rtblock = r; - return 0; -} - -/* * Initialize realtime fields in the mount structure. */ int /* error */ @@ -1412,7 +1247,7 @@ xfs_rtunmount_inodes( * of rtextents and the fraction. * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... */ -int /* error */ +static int xfs_rtpick_extent( xfs_mount_t *mp, /* file system mount point */ xfs_trans_t *tp, /* transaction pointer */ @@ -1451,3 +1286,177 @@ xfs_rtpick_extent( *pick = b; return 0; } + +static void +xfs_rtalloc_align_minmax( + xfs_rtxlen_t *raminlen, + xfs_rtxlen_t *ramaxlen, + xfs_rtxlen_t *prod) +{ + xfs_rtxlen_t newmaxlen = *ramaxlen; + xfs_rtxlen_t newminlen = *raminlen; + xfs_rtxlen_t slack; + + slack = newmaxlen % *prod; + if (slack) + newmaxlen -= slack; + slack = newminlen % *prod; + if (slack) + newminlen += *prod - slack; + + /* + * If adjusting for extent size hint alignment produces an invalid + * min/max len combination, go ahead without it. 
+ */ + if (newmaxlen < newminlen) { + *prod = 1; + return; + } + *ramaxlen = newmaxlen; + *raminlen = newminlen; +} + +int +xfs_bmap_rtalloc( + struct xfs_bmalloca *ap) +{ + struct xfs_mount *mp = ap->ip->i_mount; + xfs_fileoff_t orig_offset = ap->offset; + xfs_rtxnum_t start; /* allocation hint rtextent no */ + xfs_rtxnum_t rtx; /* actually allocated rtextent no */ + xfs_rtxlen_t prod = 0; /* product factor for allocators */ + xfs_extlen_t mod = 0; /* product factor for allocators */ + xfs_rtxlen_t ralen = 0; /* realtime allocation length */ + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_extlen_t orig_length = ap->length; + xfs_extlen_t minlen = mp->m_sb.sb_rextsize; + xfs_rtxlen_t raminlen; + bool rtlocked = false; + bool ignore_locality = false; + struct xfs_rtalloc_args args = { + .mp = mp, + .tp = ap->tp, + }; + int error; + + align = xfs_get_extsz_hint(ap->ip); +retry: + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, + align, 1, ap->eof, 0, + ap->conv, &ap->offset, &ap->length); + if (error) + return error; + ASSERT(ap->length); + ASSERT(xfs_extlen_to_rtxmod(mp, ap->length) == 0); + + /* + * If we shifted the file offset downward to satisfy an extent size + * hint, increase minlen by that amount so that the allocator won't + * give us an allocation that's too short to cover at least one of the + * blocks that the caller asked for. + */ + if (ap->offset != orig_offset) + minlen += orig_offset - ap->offset; + + /* + * Set ralen to be the actual requested length in rtextents. + * + * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that + * we rounded up to it, cut it back so it's valid again. + * Note that if it's a really large request (bigger than + * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't + * adjust the starting point to match it. + */ + ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN)); + raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen)); + ASSERT(raminlen > 0); + ASSERT(raminlen <= ralen); + + /* + * Lock out modifications to both the RT bitmap and summary inodes + */ + if (!rtlocked) { + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); + xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); + xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); + rtlocked = true; + } + + if (ignore_locality) { + start = 0; + } else if (xfs_bmap_adjacent(ap)) { + start = xfs_rtb_to_rtx(mp, ap->blkno); + } else if (ap->eof && ap->offset == 0) { + /* + * If it's an allocation to an empty file at offset 0, pick an + * extent that will space things out in the rt area. + */ + error = xfs_rtpick_extent(mp, ap->tp, ralen, &start); + if (error) + return error; + } else { + start = 0; + } + + /* + * Only bother calculating a real prod factor if offset & length are + * perfectly aligned, otherwise it will just get us in trouble. + */ + div_u64_rem(ap->offset, align, &mod); + if (mod || ap->length % align) { + prod = 1; + } else { + prod = xfs_extlen_to_rtxlen(mp, align); + if (prod > 1) + xfs_rtalloc_align_minmax(&raminlen, &ralen, &prod); + } + + if (start) { + error = xfs_rtallocate_extent_near(&args, start, raminlen, + ralen, &ralen, prod, &rtx); + } else { + error = xfs_rtallocate_extent_size(&args, raminlen, + ralen, &ralen, prod, &rtx); + } + xfs_rtbuf_cache_relse(&args); + + if (error == -ENOSPC) { + if (align > mp->m_sb.sb_rextsize) { + /* + * We previously enlarged the request length to try to + * satisfy an extent size hint. 
The allocator didn't + * return anything, so reset the parameters to the + * original values and try again without alignment + * criteria. + */ + ap->offset = orig_offset; + ap->length = orig_length; + minlen = align = mp->m_sb.sb_rextsize; + goto retry; + } + + if (!ignore_locality && start != 0) { + /* + * If we can't allocate near a specific rt extent, try + * again without locality criteria. + */ + ignore_locality = true; + goto retry; + } + + ap->blkno = NULLFSBLOCK; + ap->length = 0; + return 0; + } + if (error) + return error; + + xfs_trans_mod_sb(ap->tp, ap->wasdel ? + XFS_TRANS_SB_RES_FREXTENTS : XFS_TRANS_SB_FREXTENTS, + -(long)ralen); + ap->blkno = xfs_rtx_to_rtb(mp, rtx); + ap->length = xfs_rtxlen_to_extlen(mp, ralen); + xfs_bmap_alloc_account(ap); + return 0; +} diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index f7cb9ffe51ca..a6836da9bebe 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -13,27 +13,6 @@ struct xfs_trans; #ifdef CONFIG_XFS_RT /* - * Function prototypes for exported functions. - */ - -/* - * Allocate an extent in the realtime subvolume, with the usual allocation - * parameters. The length units are all in realtime extents, as is the - * result block number. - */ -int /* error */ -xfs_rtallocate_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_rtxnum_t start, /* starting rtext number to allocate */ - xfs_rtxlen_t minlen, /* minimum length to allocate */ - xfs_rtxlen_t maxlen, /* maximum length to allocate */ - xfs_rtxlen_t *len, /* out: actual length allocated */ - int wasdel, /* was a delayed allocation extent */ - xfs_rtxlen_t prod, /* extent product factor */ - xfs_rtxnum_t *rtblock); /* out: start rtext allocated */ - - -/* * Initialize realtime fields in the mount structure. */ int /* error */ @@ -52,20 +31,6 @@ xfs_rtmount_inodes( struct xfs_mount *mp); /* file system mount structure */ /* - * Pick an extent for allocation at the start of a new realtime file. - * Use the sequence number stored in the atime field of the bitmap inode. - * Translate this to a fraction of the rtextents, and return the product - * of rtextents and the fraction. - * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... - */ -int /* error */ -xfs_rtpick_extent( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_rtxlen_t len, /* allocation length (rtextents) */ - xfs_rtxnum_t *pick); /* result rt extent */ - -/* * Grow the realtime area of the filesystem. 
*/ int @@ -75,8 +40,6 @@ xfs_growfs_rt( int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp); #else -# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (-ENOSYS) -# define xfs_rtpick_extent(m,t,l,rb) (-ENOSYS) # define xfs_growfs_rt(mp,in) (-ENOSYS) # define xfs_rtalloc_reinit_frextents(m) (0) static inline int /* error */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 07857d967ee8..aff20ddd4a9f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -894,10 +894,8 @@ xfs_fs_statfs( STATIC void xfs_save_resvblks(struct xfs_mount *mp) { - uint64_t resblks = 0; - mp->m_resblks_save = mp->m_resblks; - xfs_reserve_blocks(mp, &resblks, NULL); + xfs_reserve_blocks(mp, 0); } STATIC void @@ -911,7 +909,7 @@ xfs_restore_resvblks(struct xfs_mount *mp) } else resblks = xfs_default_resblks(mp); - xfs_reserve_blocks(mp, &resblks, NULL); + xfs_reserve_blocks(mp, resblks); } /* diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 85e433df6a3f..92974a4414c8 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -23,6 +23,7 @@ #include "xfs_trans.h" #include "xfs_ialloc.h" #include "xfs_error.h" +#include "xfs_health.h" /* ----- Kernel only functions below ----- */ int @@ -108,6 +109,8 @@ xfs_readlink( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(ip, XFS_DATA_FORK)) + return -EIO; xfs_ilock(ip, XFS_ILOCK_SHARED); @@ -128,10 +131,10 @@ xfs_readlink( * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED * if if_data is junk. */ - if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_u1.if_data)) + if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_data)) goto out; - memcpy(link, ip->i_df.if_u1.if_data, pathlen + 1); + memcpy(link, ip->i_df.if_data, pathlen + 1); error = 0; } else { error = xfs_readlink_bmap_ilocked(ip, link); diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index f78ad6b10ea5..276696a07040 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -85,6 +85,8 @@ struct xfs_globals { int pwork_threads; /* parallel workqueue threads */ bool larp; /* log attribute replay */ #endif + int bload_leaf_slack; /* btree bulk load leaf slack */ + int bload_node_slack; /* btree bulk load node slack */ int log_recovery_delay; /* log recovery delay (secs) */ int mount_delay; /* mount setup delay (secs) */ bool bug_on_assert; /* BUG() the kernel on assert failure */ diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index a3c6b1548723..17485666b672 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -229,6 +229,15 @@ pwork_threads_show( } XFS_SYSFS_ATTR_RW(pwork_threads); +/* + * The "LARP" (Logged extended Attribute Recovery Persistence) debugging knob + * sets the XFS_DA_OP_LOGGED flag on all xfs_attr_set operations performed on + * V5 filesystems. As a result, the intermediate progress of all setxattr and + * removexattr operations are tracked via the log and can be restarted during + * recovery. This is useful for testing xattr recovery prior to merging of the + * parent pointer feature which requires it to maintain consistency, and may be + * enabled for userspace xattrs in the future. 
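 *
 * (Illustrative usage, inferred rather than stated in this hunk: on DEBUG
 * kernels this knob is expected to surface as /sys/fs/xfs/debug/larp, next
 * to the bload_leaf_slack and bload_node_slack controls added below, e.g.
 * "echo 1 > /sys/fs/xfs/debug/larp" to force logged xattr updates while
 * testing recovery.)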
+ */ static ssize_t larp_store( struct kobject *kobject, @@ -253,6 +262,58 @@ larp_show( XFS_SYSFS_ATTR_RW(larp); #endif /* DEBUG */ +STATIC ssize_t +bload_leaf_slack_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + xfs_globals.bload_leaf_slack = val; + return count; +} + +STATIC ssize_t +bload_leaf_slack_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_leaf_slack); +} +XFS_SYSFS_ATTR_RW(bload_leaf_slack); + +STATIC ssize_t +bload_node_slack_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + xfs_globals.bload_node_slack = val; + return count; +} + +STATIC ssize_t +bload_node_slack_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_node_slack); +} +XFS_SYSFS_ATTR_RW(bload_node_slack); + static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), @@ -262,6 +323,8 @@ static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(pwork_threads), ATTR_LIST(larp), #endif + ATTR_LIST(bload_leaf_slack), + ATTR_LIST(bload_node_slack), NULL, }; ATTRIBUTE_GROUPS(xfs_dbg); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 3926cf7f2a6e..0984a1c884c7 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -67,6 +67,7 @@ struct xfs_buf_log_format; struct xfs_inode_log_format; struct xfs_bmbt_irec; struct xfs_btree_cur; +struct xfs_defer_op_type; struct xfs_refcount_irec; struct xfs_fsmap; struct xfs_rmap_irec; @@ -145,21 +146,23 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list); DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list); TRACE_EVENT(xlog_intent_recovery_failed, - TP_PROTO(struct xfs_mount *mp, int error, void *function), - TP_ARGS(mp, error, function), + TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops, + int error), + TP_ARGS(mp, ops, error), TP_STRUCT__entry( __field(dev_t, dev) + __string(name, ops->name) __field(int, error) - __field(void *, function) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; + __assign_str(name, ops->name); __entry->error = error; - __entry->function = function; ), - TP_printk("dev %d:%d error %d function %pS", + TP_printk("dev %d:%d optype %s error %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->error, __entry->function) + __get_str(name), + __entry->error) ); DECLARE_EVENT_CLASS(xfs_perag_class, @@ -2549,22 +2552,25 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, TP_ARGS(mp, dfp), TP_STRUCT__entry( __field(dev_t, dev) - __field(int, type) + __string(name, dfp->dfp_ops->name) __field(void *, intent) + __field(unsigned int, flags) __field(char, committed) __field(int, nr) ), TP_fast_assign( __entry->dev = mp ? 
mp->m_super->s_dev : 0; - __entry->type = dfp->dfp_type; + __assign_str(name, dfp->dfp_ops->name); __entry->intent = dfp->dfp_intent; + __entry->flags = dfp->dfp_flags; __entry->committed = dfp->dfp_done != NULL; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p committed %d nr %d", + TP_printk("dev %d:%d optype %s intent %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __get_str(name), __entry->intent, + __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), __entry->committed, __entry->nr) ) @@ -2675,6 +2681,9 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_isolate_paused); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause); #define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer); @@ -2688,25 +2697,28 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, TP_ARGS(mp, dfp, item), TP_STRUCT__entry( __field(dev_t, dev) - __field(int, type) + __string(name, dfp->dfp_ops->name) __field(void *, intent) __field(void *, item) __field(char, committed) + __field(unsigned int, flags) __field(int, nr) ), TP_fast_assign( __entry->dev = mp ? mp->m_super->s_dev : 0; - __entry->type = dfp->dfp_type; + __assign_str(name, dfp->dfp_ops->name); __entry->intent = dfp->dfp_intent; __entry->item = item; __entry->committed = dfp->dfp_done != NULL; + __entry->flags = dfp->dfp_flags; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p item %p committed %d nr %d", + TP_printk("dev %d:%d optype %s intent %p item %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __get_str(name), __entry->intent, __entry->item, + __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), __entry->committed, __entry->nr) ) @@ -4399,8 +4411,6 @@ DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return); DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add); -DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace); -DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove); TRACE_EVENT(xfs_force_shutdown, diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 305c9d07bf1b..12d45e93f07d 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1237,6 +1237,68 @@ out_cancel: } /* + * Try to reserve more blocks for a transaction. + * + * This is for callers that need to attach resources to a transaction, scan + * those resources to determine the space reservation requirements, and then + * modify the attached resources. In other words, online repair. This can + * fail due to ENOSPC, so the caller must be able to cancel the transaction + * without shutting down the fs. + */ +int +xfs_trans_reserve_more( + struct xfs_trans *tp, + unsigned int blocks, + unsigned int rtextents) +{ + struct xfs_trans_res resv = { }; + + return xfs_trans_reserve(tp, &resv, blocks, rtextents); +} + +/* + * Try to reserve more blocks and file quota for a transaction. Same + * conditions of usage as xfs_trans_reserve_more. 
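 *
 * Illustrative calling pattern (an editor's sketch, not code from this
 * patch), for a caller that discovers mid-transaction that it needs more
 * space:
 *
 *	error = xfs_trans_reserve_more_inode(tp, ip, dblocks, rblocks, false);
 *	if (error)
 *		return error;
 *
 * As with xfs_trans_reserve_more() above, an -ENOSPC here is meant to be
 * handled by cancelling the transaction, not by shutting down the
 * filesystem.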
+ */ +int +xfs_trans_reserve_more_inode( + struct xfs_trans *tp, + struct xfs_inode *ip, + unsigned int dblocks, + unsigned int rblocks, + bool force_quota) +{ + struct xfs_trans_res resv = { }; + struct xfs_mount *mp = ip->i_mount; + unsigned int rtx = xfs_extlen_to_rtxlen(mp, rblocks); + int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + error = xfs_trans_reserve(tp, &resv, dblocks, rtx); + if (error) + return error; + + if (!XFS_IS_QUOTA_ON(mp) || xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) + return 0; + + if (tp->t_flags & XFS_TRANS_RESERVE) + force_quota = true; + + error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, + force_quota); + if (!error) + return 0; + + /* Quota failed, give back the new reservation. */ + xfs_mod_fdblocks(mp, dblocks, tp->t_flags & XFS_TRANS_RESERVE); + tp->t_blk_res -= dblocks; + xfs_mod_frextents(mp, rtx); + tp->t_rtx_res -= rtx; + return error; +} + +/* * Allocate an transaction in preparation for inode creation by reserving quota * against the given dquots. Callers are not required to hold any inode locks. */ diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 6e3646d524ce..08ce757c7454 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -78,11 +78,7 @@ struct xfs_item_ops { xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); uint (*iop_push)(struct xfs_log_item *, struct list_head *); void (*iop_release)(struct xfs_log_item *); - int (*iop_recover)(struct xfs_log_item *lip, - struct list_head *capture_list); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); - struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, - struct xfs_trans *tp); struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done); }; @@ -168,6 +164,8 @@ typedef struct xfs_trans { int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp, uint blocks, uint rtextents, uint flags, struct xfs_trans **tpp); +int xfs_trans_reserve_more(struct xfs_trans *tp, + unsigned int blocks, unsigned int rtextents); int xfs_trans_alloc_empty(struct xfs_mount *mp, struct xfs_trans **tpp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); @@ -247,19 +245,13 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, extern struct kmem_cache *xfs_trans_cache; -static inline struct xfs_log_item * -xfs_trans_item_relog( - struct xfs_log_item *lip, - struct xfs_trans *tp) -{ - return lip->li_ops->iop_relog(lip, tp); -} - struct xfs_dquot; int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv, unsigned int dblocks, unsigned int rblocks, bool force, struct xfs_trans **tpp); +int xfs_trans_reserve_more_inode(struct xfs_trans *tp, struct xfs_inode *ip, + unsigned int dblocks, unsigned int rblocks, bool force_quota); int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, unsigned int dblocks, diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 987843f84d03..364104e1b38a 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -136,6 +136,9 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, }; int error; + if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK)) + return -EIO; + error = xfs_attr_get(&args); if (error) return error; @@ -294,6 +297,9 @@ xfs_vn_listxattr( struct inode *inode = d_inode(dentry); int error; + if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK)) + return -EIO; + /* * First read the regular on-disk attributes. 
*/ diff --git a/include/linux/mm.h b/include/linux/mm.h index 896c0079f64f..f5a97dec5169 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3942,6 +3942,7 @@ enum mf_flags { MF_UNPOISON = 1 << 4, MF_SW_SIMULATED = 1 << 5, MF_NO_RETRY = 1 << 6, + MF_MEM_PRE_REMOVE = 1 << 7, }; int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, unsigned long count, int mf_flags); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a0d9b4ac7d54..4f9b61f4a668 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -678,7 +678,7 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p, */ static void collect_procs_fsdax(struct page *page, struct address_space *mapping, pgoff_t pgoff, - struct list_head *to_kill) + struct list_head *to_kill, bool pre_remove) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -686,8 +686,15 @@ static void collect_procs_fsdax(struct page *page, i_mmap_lock_read(mapping); rcu_read_lock(); for_each_process(tsk) { - struct task_struct *t = task_early_kill(tsk, true); + struct task_struct *t = tsk; + /* + * Search for all tasks while MF_MEM_PRE_REMOVE is set, because + * the current may not be the one accessing the fsdax page. + * Otherwise, search for the current task. + */ + if (!pre_remove) + t = task_early_kill(tsk, true); if (!t) continue; vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1793,6 +1800,7 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, dax_entry_t cookie; struct page *page; size_t end = index + count; + bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE; mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; @@ -1804,9 +1812,14 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, if (!page) goto unlock; - SetPageHWPoison(page); + if (!pre_remove) + SetPageHWPoison(page); - collect_procs_fsdax(page, mapping, index, &to_kill); + /* + * The pre_remove case is revoking access, the memory is still + * good and could theoretically be put back into service. + */ + collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove); unmap_and_kill(&to_kill, page_to_pfn(page), mapping, index, mf_flags); unlock: |
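
A minimal sketch of how the new MF_MEM_PRE_REMOVE flag might be driven
(editor's illustration; the helper name and the surrounding holder/driver
plumbing are assumptions, not taken from this diff): a caller that is about
to lose a pmem device can revoke fsdax mappings without marking the pages
hwpoisoned, since, as the comment above notes, the memory itself is still
good.

	#include <linux/fs.h>
	#include <linux/mm.h>

	/* Hypothetical helper: revoke fsdax mappings ahead of device removal. */
	static int example_revoke_before_remove(struct address_space *mapping,
			pgoff_t index, unsigned long count)
	{
		/*
		 * With MF_MEM_PRE_REMOVE, mf_dax_kill_procs() skips
		 * SetPageHWPoison() and collect_procs_fsdax() searches all
		 * tasks mapping the range, per the changes above.
		 */
		return mf_dax_kill_procs(mapping, index, count,
				MF_MEM_PRE_REMOVE);
	}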