146 files changed, 12727 insertions, 2943 deletions
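One of the larger functional additions in the diff below is the pausable "autoreap" extent-free mechanism added to fs/xfs/libxfs/xfs_alloc.c, which lets online repair roll back a fresh space allocation automatically if the system goes down. A minimal sketch of the intended call pattern, assuming an xfs_alloc_arg whose tp, fsbno, len, oinfo and resv fields have already been filled in; the wrapper function, its keep_space flag and the elided mapping step are illustrative only, while the xfs_alloc_autoreap structure and the schedule/cancel/commit helpers are the ones introduced by the patch:

/*
 * Illustrative sketch (not part of the patch): scheduling an autoreap EFI
 * after an allocation, then either keeping or releasing the space.
 */
static int example_alloc_with_autoreap(struct xfs_trans *tp,
		struct xfs_alloc_arg *args, bool keep_space)
{
	struct xfs_alloc_autoreap	aarp = { };
	int				error;

	/* Log a paused EFI so log recovery frees the space after a crash. */
	error = xfs_alloc_schedule_autoreap(args, false, &aarp);
	if (error)
		return error;

	/* ... map the new blocks into the ondisk filesystem here ... */

	if (keep_space) {
		/* Space was used: mark the EFI stale so nothing is freed. */
		xfs_alloc_cancel_autoreap(tp, &aarp);
	} else {
		/* Nothing used the space: unpause the EFI and free it. */
		xfs_alloc_commit_autoreap(tp, &aarp);
	}
	return 0;
}

Per the comments in the patch, a caller that only partially used the reserved space would cancel the autoreap and then call xfs_free_extent_later() on the unused remainder.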
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 09cade7eaefc..e18bc5ae3b35 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -121,8 +121,5 @@ Documentation for filesystem implementations. udf virtiofs vfat - xfs-delayed-logging-design - xfs-maintainer-entry-profile - xfs-self-describing-metadata - xfs-online-fsck-design + xfs/index zonefs diff --git a/Documentation/filesystems/xfs/index.rst b/Documentation/filesystems/xfs/index.rst new file mode 100644 index 000000000000..ab66c57a5d18 --- /dev/null +++ b/Documentation/filesystems/xfs/index.rst @@ -0,0 +1,14 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================ +XFS Filesystem Documentation +============================ + +.. toctree:: + :maxdepth: 2 + :numbered: + + xfs-delayed-logging-design + xfs-maintainer-entry-profile + xfs-self-describing-metadata + xfs-online-fsck-design diff --git a/Documentation/filesystems/xfs-delayed-logging-design.rst b/Documentation/filesystems/xfs/xfs-delayed-logging-design.rst index 6402ab8e370c..6402ab8e370c 100644 --- a/Documentation/filesystems/xfs-delayed-logging-design.rst +++ b/Documentation/filesystems/xfs/xfs-delayed-logging-design.rst diff --git a/Documentation/filesystems/xfs-maintainer-entry-profile.rst b/Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst index 32b6ac4ca9d6..32b6ac4ca9d6 100644 --- a/Documentation/filesystems/xfs-maintainer-entry-profile.rst +++ b/Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst diff --git a/Documentation/filesystems/xfs-online-fsck-design.rst b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst index a0678101a7d0..352516feef6f 100644 --- a/Documentation/filesystems/xfs-online-fsck-design.rst +++ b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst @@ -962,7 +962,7 @@ disk, but these buffer verifiers cannot provide any consistency checking between metadata structures. For more information, please see the documentation for -Documentation/filesystems/xfs-self-describing-metadata.rst +Documentation/filesystems/xfs/xfs-self-describing-metadata.rst Reverse Mapping --------------- diff --git a/Documentation/filesystems/xfs-self-describing-metadata.rst b/Documentation/filesystems/xfs/xfs-self-describing-metadata.rst index a10c4ae6955e..a10c4ae6955e 100644 --- a/Documentation/filesystems/xfs-self-describing-metadata.rst +++ b/Documentation/filesystems/xfs/xfs-self-describing-metadata.rst diff --git a/Documentation/maintainer/maintainer-entry-profile.rst b/Documentation/maintainer/maintainer-entry-profile.rst index 7ad4bfc2cc03..18cee1edaecb 100644 --- a/Documentation/maintainer/maintainer-entry-profile.rst +++ b/Documentation/maintainer/maintainer-entry-profile.rst @@ -105,4 +105,4 @@ to do something different in the near future. 
../driver-api/media/maintainer-entry-profile ../driver-api/vfio-pci-device-specific-driver-acceptance ../nvme/feature-and-quirk-policy - ../filesystems/xfs-maintainer-entry-profile + ../filesystems/xfs/xfs-maintainer-entry-profile diff --git a/MAINTAINERS b/MAINTAINERS index 40c754b4c39c..83240dcd4065 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23846,10 +23846,10 @@ S: Supported W: http://xfs.org/ C: irc://irc.oftc.net/xfs T: git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git -P: Documentation/filesystems/xfs-maintainer-entry-profile.rst +P: Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst F: Documentation/ABI/testing/sysfs-fs-xfs F: Documentation/admin-guide/xfs.rst -F: Documentation/filesystems/xfs-* +F: Documentation/filesystems/xfs/* F: fs/xfs/ F: include/uapi/linux/dqblk_xfs.h F: include/uapi/linux/fsmap.h diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 0da9232ea175..f4b635526345 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -326,7 +326,8 @@ void kill_dax(struct dax_device *dax_dev) return; if (dax_dev->holder_data != NULL) - dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0); + dax_holder_notify_failure(dax_dev, 0, U64_MAX, + MF_MEM_PRE_REMOVE); clear_bit(DAXDEV_ALIVE, &dax_dev->flags); synchronize_srcu(&dax_srcu); diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 7762c01a85cf..fbe3cdc79036 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -145,6 +145,7 @@ ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) xfs-y += $(addprefix scrub/, \ trace.o \ + agb_bitmap.o \ agheader.o \ alloc.o \ attr.o \ @@ -175,14 +176,32 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ rtsummary.o \ ) -xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o +xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \ + dqiterate.o \ + quota.o \ + ) # online repair ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ + alloc_repair.o \ + bmap_repair.o \ + cow_repair.o \ + ialloc_repair.o \ + inode_repair.o \ + newbt.o \ reap.o \ + refcount_repair.o \ repair.o \ ) + +xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ + rtbitmap_repair.o \ + ) + +xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \ + quota_repair.o \ + ) endif endif diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index f9f4d694640d..39d9525270b7 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -332,6 +332,31 @@ xfs_agino_range( return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last); } +/* + * Free perag within the specified AG range, it is only used to free unused + * perags under the error handling path. 
+ */ +void +xfs_free_unused_perag_range( + struct xfs_mount *mp, + xfs_agnumber_t agstart, + xfs_agnumber_t agend) +{ + struct xfs_perag *pag; + xfs_agnumber_t index; + + for (index = agstart; index < agend; index++) { + spin_lock(&mp->m_perag_lock); + pag = radix_tree_delete(&mp->m_perag_tree, index); + spin_unlock(&mp->m_perag_lock); + if (!pag) + break; + xfs_buf_hash_destroy(pag); + xfs_defer_drain_free(&pag->pag_intents_drain); + kmem_free(pag); + } +} + int xfs_initialize_perag( struct xfs_mount *mp, @@ -424,19 +449,14 @@ xfs_initialize_perag( out_remove_pag: xfs_defer_drain_free(&pag->pag_intents_drain); + spin_lock(&mp->m_perag_lock); radix_tree_delete(&mp->m_perag_tree, index); + spin_unlock(&mp->m_perag_lock); out_free_pag: kmem_free(pag); out_unwind_new_pags: /* unwind any prior newly initialized pags */ - for (index = first_initialised; index < agcount; index++) { - pag = radix_tree_delete(&mp->m_perag_tree, index); - if (!pag) - break; - xfs_buf_hash_destroy(pag); - xfs_defer_drain_free(&pag->pag_intents_drain); - kmem_free(pag); - } + xfs_free_unused_perag_range(mp, first_initialised, agcount); return error; } @@ -984,7 +1004,7 @@ xfs_ag_shrink_space( if (err2 != -ENOSPC) goto resv_err; - err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, + err2 = xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, XFS_AG_RESV_NONE, true); if (err2) goto resv_err; diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 2e0aef87d633..4b343c4fac28 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -80,6 +80,16 @@ struct xfs_perag { */ uint16_t pag_checked; uint16_t pag_sick; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Alternate btree heights so that online repair won't trip the write + * verifiers while rebuilding the AG btrees. + */ + uint8_t pagf_repair_levels[XFS_BTNUM_AGF]; + uint8_t pagf_repair_refcount_level; +#endif + spinlock_t pag_state_lock; spinlock_t pagb_lock; /* lock for pagb_tree */ @@ -133,6 +143,8 @@ __XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA) __XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES) __XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET) +void xfs_free_unused_perag_range(struct xfs_mount *mp, xfs_agnumber_t agstart, + xfs_agnumber_t agend); int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount, xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi); int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 7fd1fea95552..da1057bd0e60 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -411,6 +411,8 @@ xfs_ag_resv_free_extent( fallthrough; case XFS_AG_RESV_NONE: xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len); + fallthrough; + case XFS_AG_RESV_IGNORE: return; } diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 100ab5931b31..3bd0a33fee0a 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -246,11 +246,9 @@ xfs_alloc_btrec_to_irec( /* Simple checks for free space records. 
*/ xfs_failaddr_t xfs_alloc_check_irec( - struct xfs_btree_cur *cur, - const struct xfs_alloc_rec_incore *irec) + struct xfs_perag *pag, + const struct xfs_alloc_rec_incore *irec) { - struct xfs_perag *pag = cur->bc_ag.pag; - if (irec->ar_blockcount == 0) return __this_address; @@ -299,7 +297,7 @@ xfs_alloc_get_rec( return error; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur, &irec); + fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); @@ -2514,7 +2512,7 @@ xfs_defer_agfl_block( trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); xfs_extent_free_get_group(mp, xefi); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); + xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type); return 0; } @@ -2522,14 +2520,15 @@ xfs_defer_agfl_block( * Add the extent to the list of extents to be free at transaction end. * The list is maintained sorted (by block number). */ -int -__xfs_free_extent_later( +static int +xfs_defer_extent_free( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, - bool skip_discard) + bool skip_discard, + struct xfs_defer_pending **dfpp) { struct xfs_extent_free_item *xefi; struct xfs_mount *mp = tp->t_mountp; @@ -2577,10 +2576,105 @@ __xfs_free_extent_later( XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); xfs_extent_free_get_group(mp, xefi); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); + *dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type); return 0; } +int +xfs_free_extent_later( + struct xfs_trans *tp, + xfs_fsblock_t bno, + xfs_filblks_t len, + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type, + bool skip_discard) +{ + struct xfs_defer_pending *dontcare = NULL; + + return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard, + &dontcare); +} + +/* + * Set up automatic freeing of unwritten space in the filesystem. + * + * This function attached a paused deferred extent free item to the + * transaction. Pausing means that the EFI will be logged in the next + * transaction commit, but the pending EFI will not be finished until the + * pending item is unpaused. + * + * If the system goes down after the EFI has been persisted to the log but + * before the pending item is unpaused, log recovery will find the EFI, fail to + * find the EFD, and free the space. + * + * If the pending item is unpaused, the next transaction commit will log an EFD + * without freeing the space. + * + * Caller must ensure that the tp, fsbno, len, oinfo, and resv flags of the + * @args structure are set to the relevant values. + */ +int +xfs_alloc_schedule_autoreap( + const struct xfs_alloc_arg *args, + bool skip_discard, + struct xfs_alloc_autoreap *aarp) +{ + int error; + + error = xfs_defer_extent_free(args->tp, args->fsbno, args->len, + &args->oinfo, args->resv, skip_discard, &aarp->dfp); + if (error) + return error; + + xfs_defer_item_pause(args->tp, aarp->dfp); + return 0; +} + +/* + * Cancel automatic freeing of unwritten space in the filesystem. + * + * Earlier, we created a paused deferred extent free item and attached it to + * this transaction so that we could automatically roll back a new space + * allocation if the system went down. Now we want to cancel the paused work + * item by marking the EFI stale so we don't actually free the space, unpausing + * the pending item and logging an EFD. 
+ * + * The caller generally should have already mapped the space into the ondisk + * filesystem. If the reserved space was partially used, the caller must call + * xfs_free_extent_later to create a new EFI to free the unused space. + */ +void +xfs_alloc_cancel_autoreap( + struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp) +{ + struct xfs_defer_pending *dfp = aarp->dfp; + struct xfs_extent_free_item *xefi; + + if (!dfp) + return; + + list_for_each_entry(xefi, &dfp->dfp_work, xefi_list) + xefi->xefi_flags |= XFS_EFI_CANCELLED; + + xfs_defer_item_unpause(tp, dfp); +} + +/* + * Commit automatic freeing of unwritten space in the filesystem. + * + * This unpauses an earlier _schedule_autoreap and commits to freeing the + * allocated space. Call this if none of the reserved space was used. + */ +void +xfs_alloc_commit_autoreap( + struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp) +{ + if (aarp->dfp) + xfs_defer_item_unpause(tp, aarp->dfp); +} + #ifdef DEBUG /* * Check if an AGF has a free extent record whose length is equal to @@ -3848,7 +3942,7 @@ xfs_alloc_query_range_helper( xfs_failaddr_t fa; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur, &irec); + fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 6bb8d295c321..0b956f8b9d5a 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -185,7 +185,7 @@ xfs_alloc_get_rec( union xfs_btree_rec; void xfs_alloc_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_alloc_rec_incore *irec); -xfs_failaddr_t xfs_alloc_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_alloc_check_irec(struct xfs_perag *pag, const struct xfs_alloc_rec_incore *irec); int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, @@ -231,7 +231,7 @@ xfs_buf_to_agfl_bno( return bp->b_addr; } -int __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, +int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, bool skip_discard); @@ -255,18 +255,18 @@ void xfs_extent_free_get_group(struct xfs_mount *mp, #define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */ #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ +#define XFS_EFI_CANCELLED (1U << 3) /* dont actually free the space */ -static inline int -xfs_free_extent_later( - struct xfs_trans *tp, - xfs_fsblock_t bno, - xfs_filblks_t len, - const struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type type) -{ - return __xfs_free_extent_later(tp, bno, len, oinfo, type, false); -} +struct xfs_alloc_autoreap { + struct xfs_defer_pending *dfp; +}; +int xfs_alloc_schedule_autoreap(const struct xfs_alloc_arg *args, + bool skip_discard, struct xfs_alloc_autoreap *aarp); +void xfs_alloc_cancel_autoreap(struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp); +void xfs_alloc_commit_autoreap(struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp); extern struct kmem_cache *xfs_extfree_item_cache; diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index c65228efed4a..a7032bf0cd37 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -323,7 +323,18 @@ xfs_allocbt_verify( if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) btnum = XFS_BTNUM_CNTi; if (pag && xfs_perag_initialised_agf(pag)) { 
- if (level >= pag->pagf_levels[btnum]) + unsigned int maxlevel = pag->pagf_levels[btnum]; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Online repair could be rewriting the free space btrees, so + * we'll validate against the larger of either tree while this + * is going on. + */ + maxlevel = max_t(unsigned int, maxlevel, + pag->pagf_repair_levels[btnum]); +#endif + if (level >= maxlevel) return __this_address; } else if (level >= mp->m_alloc_maxlevels) return __this_address; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index e28d93d232de..9976a00a73f9 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -862,8 +862,11 @@ xfs_attr_lookup( if (!xfs_inode_hasattr(dp)) return -ENOATTR; - if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) - return xfs_attr_sf_findname(args, NULL, NULL); + if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) { + if (xfs_attr_sf_findname(args)) + return -EEXIST; + return -ENOATTR; + } if (xfs_attr_is_leaf(dp)) { error = xfs_attr_leaf_hasname(args, &bp); @@ -880,11 +883,10 @@ xfs_attr_lookup( return error; } -static int -xfs_attr_intent_init( +static void +xfs_attr_defer_add( struct xfs_da_args *args, - unsigned int op_flags, /* op flag (set or remove) */ - struct xfs_attr_intent **attr) /* new xfs_attr_intent */ + unsigned int op_flags) { struct xfs_attr_intent *new; @@ -893,66 +895,22 @@ xfs_attr_intent_init( new->xattri_op_flags = op_flags; new->xattri_da_args = args; - *attr = new; - return 0; -} - -/* Sets an attribute for an inode as a deferred operation */ -static int -xfs_attr_defer_add( - struct xfs_da_args *args) -{ - struct xfs_attr_intent *new; - int error = 0; - - error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new); - if (error) - return error; + switch (op_flags) { + case XFS_ATTRI_OP_FLAGS_SET: + new->xattri_dela_state = xfs_attr_init_add_state(args); + break; + case XFS_ATTRI_OP_FLAGS_REPLACE: + new->xattri_dela_state = xfs_attr_init_replace_state(args); + break; + case XFS_ATTRI_OP_FLAGS_REMOVE: + new->xattri_dela_state = xfs_attr_init_remove_state(args); + break; + default: + ASSERT(0); + } - new->xattri_dela_state = xfs_attr_init_add_state(args); - xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type); trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); - - return 0; -} - -/* Sets an attribute for an inode as a deferred operation */ -static int -xfs_attr_defer_replace( - struct xfs_da_args *args) -{ - struct xfs_attr_intent *new; - int error = 0; - - error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new); - if (error) - return error; - - new->xattri_dela_state = xfs_attr_init_replace_state(args); - xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); - trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp); - - return 0; -} - -/* Removes an attribute for an inode as a deferred operation */ -static int -xfs_attr_defer_remove( - struct xfs_da_args *args) -{ - - struct xfs_attr_intent *new; - int error; - - error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new); - if (error) - return error; - - new->xattri_dela_state = xfs_attr_init_remove_state(args); - xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); - trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp); - - return 0; } /* @@ -1038,16 +996,16 @@ xfs_attr_set( error = xfs_attr_lookup(args); switch (error) { case -EEXIST: - /* if no value, we are performing a remove operation 
*/ if (!args->value) { - error = xfs_attr_defer_remove(args); + /* if no value, we are performing a remove operation */ + xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REMOVE); break; } + /* Pure create fails if the attr already exists */ if (args->attr_flags & XATTR_CREATE) goto out_trans_cancel; - - error = xfs_attr_defer_replace(args); + xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REPLACE); break; case -ENOATTR: /* Can't remove what isn't there. */ @@ -1057,14 +1015,11 @@ xfs_attr_set( /* Pure replace fails if no existing attr to replace. */ if (args->attr_flags & XATTR_REPLACE) goto out_trans_cancel; - - error = xfs_attr_defer_add(args); + xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_SET); break; default: goto out_trans_cancel; } - if (error) - goto out_trans_cancel; /* * If this is a synchronous mount, make sure that the @@ -1097,10 +1052,9 @@ out_trans_cancel: static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) { - struct xfs_attr_shortform *sf; + struct xfs_attr_sf_hdr *sf = dp->i_af.if_data; - sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; - return be16_to_cpu(sf->hdr.totsize); + return be16_to_cpu(sf->totsize); } /* @@ -1112,19 +1066,13 @@ xfs_attr_shortform_addname( struct xfs_da_args *args) { int newsize, forkoff; - int error; trace_xfs_attr_sf_addname(args); - error = xfs_attr_shortform_lookup(args); - switch (error) { - case -ENOATTR: - if (args->op_flags & XFS_DA_OP_REPLACE) - return error; - break; - case -EEXIST: - if (!(args->op_flags & XFS_DA_OP_REPLACE)) - return error; + if (xfs_attr_sf_findname(args)) { + int error; + + ASSERT(args->op_flags & XFS_DA_OP_REPLACE); error = xfs_attr_sf_removename(args); if (error) @@ -1137,11 +1085,8 @@ xfs_attr_shortform_addname( * around. */ args->op_flags &= ~XFS_DA_OP_REPLACE; - break; - case 0: - break; - default: - return error; + } else { + ASSERT(!(args->op_flags & XFS_DA_OP_REPLACE)); } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 2580ae47209a..6374bf107242 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -690,56 +690,32 @@ xfs_attr_shortform_create( ASSERT(ifp->if_bytes == 0); if (ifp->if_format == XFS_DINODE_FMT_EXTENTS) ifp->if_format = XFS_DINODE_FMT_LOCAL; - xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); - hdr = (struct xfs_attr_sf_hdr *)ifp->if_u1.if_data; + + hdr = xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); memset(hdr, 0, sizeof(*hdr)); hdr->totsize = cpu_to_be16(sizeof(*hdr)); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } /* - * Return -EEXIST if attr is found, or -ENOATTR if not - * args: args containing attribute name and namelen - * sfep: If not null, pointer will be set to the last attr entry found on - -EEXIST. On -ENOATTR pointer is left at the last entry in the list - * basep: If not null, pointer is set to the byte offset of the entry in the - * list on -EEXIST. On -ENOATTR, pointer is left at the byte offset of - * the last entry in the list + * Return the entry if the attr in args is found, or NULL if not. 
*/ -int +struct xfs_attr_sf_entry * xfs_attr_sf_findname( - struct xfs_da_args *args, - struct xfs_attr_sf_entry **sfep, - unsigned int *basep) + struct xfs_da_args *args) { - struct xfs_attr_shortform *sf; - struct xfs_attr_sf_entry *sfe; - unsigned int base = sizeof(struct xfs_attr_sf_hdr); - int size = 0; - int end; - int i; + struct xfs_attr_sf_hdr *sf = args->dp->i_af.if_data; + struct xfs_attr_sf_entry *sfe; - sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data; - sfe = &sf->list[0]; - end = sf->hdr.count; - for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe), - base += size, i++) { - size = xfs_attr_sf_entsize(sfe); - if (!xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)) - continue; - break; + for (sfe = xfs_attr_sf_firstentry(sf); + sfe < xfs_attr_sf_endptr(sf); + sfe = xfs_attr_sf_nextentry(sfe)) { + if (xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + return sfe; } - if (sfep != NULL) - *sfep = sfe; - - if (basep != NULL) - *basep = base; - - if (i == end) - return -ENOATTR; - return -EEXIST; + return NULL; } /* @@ -751,38 +727,31 @@ xfs_attr_shortform_add( struct xfs_da_args *args, int forkoff) { - struct xfs_attr_shortform *sf; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_ifork *ifp = &dp->i_af; + struct xfs_attr_sf_hdr *sf = ifp->if_data; struct xfs_attr_sf_entry *sfe; - int offset, size; - struct xfs_mount *mp; - struct xfs_inode *dp; - struct xfs_ifork *ifp; + int size; trace_xfs_attr_sf_add(args); - dp = args->dp; - mp = dp->i_mount; dp->i_forkoff = forkoff; - ifp = &dp->i_af; ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST) - ASSERT(0); + ASSERT(!xfs_attr_sf_findname(args)); - offset = (char *)sfe - (char *)sf; size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); - xfs_idata_realloc(dp, size, XFS_ATTR_FORK); - sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset); + sf = xfs_idata_realloc(dp, size, XFS_ATTR_FORK); + sfe = xfs_attr_sf_endptr(sf); sfe->namelen = args->namelen; sfe->valuelen = args->valuelen; sfe->flags = args->attr_filter; memcpy(sfe->nameval, args->name, args->namelen); memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); - sf->hdr.count++; - be16_add_cpu(&sf->hdr.totsize, size); + sf->count++; + be16_add_cpu(&sf->totsize, size); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); xfs_sbversion_add_attr2(mp, args->trans); @@ -811,48 +780,43 @@ int xfs_attr_sf_removename( struct xfs_da_args *args) { - struct xfs_attr_shortform *sf; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_attr_sf_hdr *sf = dp->i_af.if_data; struct xfs_attr_sf_entry *sfe; - int size = 0, end, totsize; - unsigned int base; - struct xfs_mount *mp; - struct xfs_inode *dp; - int error; + uint16_t totsize = be16_to_cpu(sf->totsize); + void *next, *end; + int size = 0; trace_xfs_attr_sf_remove(args); - dp = args->dp; - mp = dp->i_mount; - sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; - - error = xfs_attr_sf_findname(args, &sfe, &base); - - /* - * If we are recovering an operation, finding nothing to - * remove is not an error - it just means there was nothing - * to clean up. 
- */ - if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY)) - return 0; - if (error != -EEXIST) - return error; - size = xfs_attr_sf_entsize(sfe); + sfe = xfs_attr_sf_findname(args); + if (!sfe) { + /* + * If we are recovering an operation, finding nothing to remove + * is not an error, it just means there was nothing to clean up. + */ + if (args->op_flags & XFS_DA_OP_RECOVERY) + return 0; + return -ENOATTR; + } /* * Fix up the attribute fork data, covering the hole */ - end = base + size; - totsize = be16_to_cpu(sf->hdr.totsize); - if (end != totsize) - memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end); - sf->hdr.count--; - be16_add_cpu(&sf->hdr.totsize, -size); + size = xfs_attr_sf_entsize(sfe); + next = xfs_attr_sf_nextentry(sfe); + end = xfs_attr_sf_endptr(sf); + if (next < end) + memmove(sfe, next, end - next); + sf->count--; + totsize -= size; + sf->totsize = cpu_to_be16(totsize); /* * Fix up the start offset of the attribute fork */ - totsize -= size; - if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) && + if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) { xfs_attr_fork_remove(dp, args->trans); @@ -860,7 +824,7 @@ xfs_attr_sf_removename( xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); dp->i_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); ASSERT(dp->i_forkoff); - ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || + ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) || (args->op_flags & XFS_DA_OP_ADDNAME) || !xfs_has_attr2(mp) || dp->i_df.if_format == XFS_DINODE_FMT_BTREE); @@ -874,33 +838,6 @@ xfs_attr_sf_removename( } /* - * Look up a name in a shortform attribute list structure. - */ -/*ARGSUSED*/ -int -xfs_attr_shortform_lookup(xfs_da_args_t *args) -{ - struct xfs_attr_shortform *sf; - struct xfs_attr_sf_entry *sfe; - int i; - struct xfs_ifork *ifp; - - trace_xfs_attr_sf_lookup(args); - - ifp = &args->dp->i_af; - ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - sfe = &sf->list[0]; - for (i = 0; i < sf->hdr.count; - sfe = xfs_attr_sf_nextentry(sfe), i++) { - if (xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)) - return -EEXIST; - } - return -ENOATTR; -} - -/* * Retrieve the attribute value and length. * * If args->valuelen is zero, only the length needs to be returned. Unlike a @@ -909,23 +846,19 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) */ int xfs_attr_shortform_getvalue( - struct xfs_da_args *args) + struct xfs_da_args *args) { - struct xfs_attr_shortform *sf; - struct xfs_attr_sf_entry *sfe; - int i; + struct xfs_attr_sf_entry *sfe; ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL); - sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data; - sfe = &sf->list[0]; - for (i = 0; i < sf->hdr.count; - sfe = xfs_attr_sf_nextentry(sfe), i++) { - if (xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)) - return xfs_attr_copy_value(args, - &sfe->nameval[args->namelen], sfe->valuelen); - } - return -ENOATTR; + + trace_xfs_attr_sf_lookup(args); + + sfe = xfs_attr_sf_findname(args); + if (!sfe) + return -ENOATTR; + return xfs_attr_copy_value(args, &sfe->nameval[args->namelen], + sfe->valuelen); } /* Convert from using the shortform to the leaf format. 
*/ @@ -933,26 +866,23 @@ int xfs_attr_shortform_to_leaf( struct xfs_da_args *args) { - struct xfs_inode *dp; - struct xfs_attr_shortform *sf; + struct xfs_inode *dp = args->dp; + struct xfs_ifork *ifp = &dp->i_af; + struct xfs_attr_sf_hdr *sf = ifp->if_data; struct xfs_attr_sf_entry *sfe; + int size = be16_to_cpu(sf->totsize); struct xfs_da_args nargs; char *tmpbuffer; - int error, i, size; + int error, i; xfs_dablk_t blkno; struct xfs_buf *bp; - struct xfs_ifork *ifp; trace_xfs_attr_sf_to_leaf(args); - dp = args->dp; - ifp = &dp->i_af; - sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - size = be16_to_cpu(sf->hdr.totsize); tmpbuffer = kmem_alloc(size, 0); ASSERT(tmpbuffer != NULL); - memcpy(tmpbuffer, ifp->if_u1.if_data, size); - sf = (struct xfs_attr_shortform *)tmpbuffer; + memcpy(tmpbuffer, ifp->if_data, size); + sf = (struct xfs_attr_sf_hdr *)tmpbuffer; xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK); @@ -975,8 +905,8 @@ xfs_attr_shortform_to_leaf( nargs.trans = args->trans; nargs.op_flags = XFS_DA_OP_OKNOENT; - sfe = &sf->list[0]; - for (i = 0; i < sf->hdr.count; i++) { + sfe = xfs_attr_sf_firstentry(sf); + for (i = 0; i < sf->count; i++) { nargs.name = sfe->nameval; nargs.namelen = sfe->namelen; nargs.value = &sfe->nameval[nargs.namelen]; @@ -1040,23 +970,16 @@ xfs_attr_shortform_allfit( return xfs_attr_shortform_bytesfit(dp, bytes); } -/* Verify the consistency of an inline attribute fork. */ +/* Verify the consistency of a raw inline attribute fork. */ xfs_failaddr_t xfs_attr_shortform_verify( - struct xfs_inode *ip) + struct xfs_attr_sf_hdr *sfp, + size_t size) { - struct xfs_attr_shortform *sfp; - struct xfs_attr_sf_entry *sfep; + struct xfs_attr_sf_entry *sfep = xfs_attr_sf_firstentry(sfp); struct xfs_attr_sf_entry *next_sfep; char *endp; - struct xfs_ifork *ifp; int i; - int64_t size; - - ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL); - ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK); - sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - size = ifp->if_bytes; /* * Give up if the attribute is way too short. @@ -1067,8 +990,7 @@ xfs_attr_shortform_verify( endp = (char *)sfp + size; /* Check all reported entries */ - sfep = &sfp->list[0]; - for (i = 0; i < sfp->hdr.count; i++) { + for (i = 0; i < sfp->count; i++) { /* * struct xfs_attr_sf_entry has a variable length. * Check the fixed-offset parts of the structure are @@ -1244,14 +1166,10 @@ xfs_attr3_leaf_to_node( if (error) goto out; - /* copy leaf to new buffer, update identifiers */ - xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF); - bp2->b_ops = bp1->b_ops; - memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize); - if (xfs_has_crc(mp)) { - struct xfs_da3_blkinfo *hdr3 = bp2->b_addr; - hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp2)); - } + /* + * Copy leaf to new buffer and log it. 
+ */ + xfs_da_buf_copy(bp2, bp1, args->geo->blksize); xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1); /* diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 368f4d9fa1d5..9b9948639c0f 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -47,16 +47,14 @@ struct xfs_attr3_icleaf_hdr { */ void xfs_attr_shortform_create(struct xfs_da_args *args); void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff); -int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_sf_removename(struct xfs_da_args *args); -int xfs_attr_sf_findname(struct xfs_da_args *args, - struct xfs_attr_sf_entry **sfep, - unsigned int *basep); +struct xfs_attr_sf_entry *xfs_attr_sf_findname(struct xfs_da_args *args); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); -xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip); +xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_attr_sf_hdr *sfp, + size_t size); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); /* diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index 37578b369d9b..bc4422223024 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -7,14 +7,6 @@ #define __XFS_ATTR_SF_H__ /* - * Attribute storage when stored inside the inode. - * - * Small attribute lists are packed as tightly as possible so as - * to fit into the literal area of the inode. - */ -typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t; - -/* * We generate this then sort it, attr_list() must return things in hash-order. */ typedef struct xfs_attr_sf_sort { @@ -41,11 +33,25 @@ static inline int xfs_attr_sf_entsize(struct xfs_attr_sf_entry *sfep) return struct_size(sfep, nameval, sfep->namelen + sfep->valuelen); } -/* next entry in struct */ +/* first entry in the SF attr fork */ +static inline struct xfs_attr_sf_entry * +xfs_attr_sf_firstentry(struct xfs_attr_sf_hdr *hdr) +{ + return (struct xfs_attr_sf_entry *)(hdr + 1); +} + +/* next entry after sfep */ static inline struct xfs_attr_sf_entry * xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep) { return (void *)sfep + xfs_attr_sf_entsize(sfep); } +/* pointer to the space after the last entry, e.g. 
for adding a new one */ +static inline struct xfs_attr_sf_entry * +xfs_attr_sf_endptr(struct xfs_attr_sf_hdr *sf) +{ + return (void *)sf + be16_to_cpu(sf->totsize); +} + #endif /* __XFS_ATTR_SF_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index be62acffad6c..98aaca933bdd 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -575,7 +575,7 @@ xfs_bmap_btree_to_extents( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) return error; @@ -747,7 +747,7 @@ xfs_bmap_local_to_extents_empty( ASSERT(ifp->if_nextents == 0); xfs_bmap_forkoff_reset(ip, whichfork); - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; ifp->if_format = XFS_DINODE_FMT_EXTENTS; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -832,7 +832,7 @@ xfs_bmap_local_to_extents( xfs_bmap_local_to_extents_empty(tp, ip, whichfork); flags |= XFS_ILOG_CORE; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; rec.br_startoff = 0; @@ -3044,7 +3044,8 @@ xfs_bmap_extsize_align( #define XFS_ALLOC_GAP_UNITS 4 -void +/* returns true if ap->blkno was modified */ +bool xfs_bmap_adjacent( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { @@ -3079,13 +3080,14 @@ xfs_bmap_adjacent( if (adjust && ISVALID(ap->blkno + adjust, ap->prev.br_startblock)) ap->blkno += adjust; + return true; } /* * If not at eof, then compare the two neighbor blocks. * Figure out whether either one gives us a good starting point, * and pick the better one. */ - else if (!ap->eof) { + if (!ap->eof) { xfs_fsblock_t gotbno; /* right side block number */ xfs_fsblock_t gotdiff=0; /* right side difference */ xfs_fsblock_t prevbno; /* left side block number */ @@ -3165,14 +3167,21 @@ xfs_bmap_adjacent( * If both valid, pick the better one, else the only good * one, else ap->blkno is already set (to 0 or the inode block). */ - if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) + if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) { ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno; - else if (prevbno != NULLFSBLOCK) + return true; + } + if (prevbno != NULLFSBLOCK) { ap->blkno = prevbno; - else if (gotbno != NULLFSBLOCK) + return true; + } + if (gotbno != NULLFSBLOCK) { ap->blkno = gotbno; + return true; + } } #undef ISVALID + return false; } int @@ -3263,11 +3272,14 @@ xfs_bmap_btalloc_select_lengths( } /* Update all inode and quota accounting for the allocation we just did. */ -static void -xfs_bmap_btalloc_accounting( - struct xfs_bmalloca *ap, - struct xfs_alloc_arg *args) +void +xfs_bmap_alloc_account( + struct xfs_bmalloca *ap) { + bool isrt = XFS_IS_REALTIME_INODE(ap->ip) && + (ap->flags & XFS_BMAPI_ATTRFORK); + uint fld; + if (ap->flags & XFS_BMAPI_COWFORK) { /* * COW fork blocks are in-core only and thus are treated as @@ -3279,7 +3291,7 @@ xfs_bmap_btalloc_accounting( * yet. */ if (ap->wasdel) { - xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len); + xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length); return; } @@ -3291,22 +3303,25 @@ xfs_bmap_btalloc_accounting( * This essentially transfers the transaction quota reservation * to that of a delalloc extent. */ - ap->ip->i_delayed_blks += args->len; - xfs_trans_mod_dquot_byino(ap->tp, ap->ip, XFS_TRANS_DQ_RES_BLKS, - -(long)args->len); + ap->ip->i_delayed_blks += ap->length; + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, isrt ? 
+ XFS_TRANS_DQ_RES_RTBLKS : XFS_TRANS_DQ_RES_BLKS, + -(long)ap->length); return; } /* data/attr fork only */ - ap->ip->i_nblocks += args->len; + ap->ip->i_nblocks += ap->length; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) { - ap->ip->i_delayed_blks -= args->len; - xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len); + ap->ip->i_delayed_blks -= ap->length; + xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length); + fld = isrt ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_DELBCOUNT; + } else { + fld = isrt ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; } - xfs_trans_mod_dquot_byino(ap->tp, ap->ip, - ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT, - args->len); + + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, fld, ap->length); } static int @@ -3380,7 +3395,7 @@ xfs_bmap_process_allocated_extent( ap->offset = orig_offset; else if (ap->offset + ap->length < orig_offset + orig_length) ap->offset = orig_offset + orig_length - ap->length; - xfs_bmap_btalloc_accounting(ap, args); + xfs_bmap_alloc_account(ap); } #ifdef DEBUG @@ -5010,7 +5025,6 @@ xfs_bmap_del_extent_real( xfs_fileoff_t del_endoff; /* first offset past del */ int do_fx; /* free extent at end of routine */ int error; /* error return value */ - int flags = 0;/* inode logging flags */ struct xfs_bmbt_irec got; /* current extent entry */ xfs_fileoff_t got_endoff; /* first offset past got */ int i; /* temp state */ @@ -5023,6 +5037,8 @@ xfs_bmap_del_extent_real( uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_bmbt_irec old; + *logflagsp = 0; + mp = ip->i_mount; XFS_STATS_INC(mp, xs_del_exlist); @@ -5035,7 +5051,6 @@ xfs_bmap_del_extent_real( ASSERT(got_endoff >= del_endoff); ASSERT(!isnullstartblock(got.br_startblock)); qfield = 0; - error = 0; /* * If it's the case where the directory code is running with no block @@ -5051,13 +5066,13 @@ xfs_bmap_del_extent_real( del->br_startoff > got.br_startoff && del_endoff < got_endoff) return -ENOSPC; - flags = XFS_ILOG_CORE; + *logflagsp = XFS_ILOG_CORE; if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { if (!(bflags & XFS_BMAPI_REMAP)) { error = xfs_rtfree_blocks(tp, del->br_startblock, del->br_blockcount); if (error) - goto done; + return error; } do_fx = 0; @@ -5072,11 +5087,9 @@ xfs_bmap_del_extent_real( if (cur) { error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) - goto done; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; - } + return error; + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } if (got.br_startoff == del->br_startoff) @@ -5093,17 +5106,15 @@ xfs_bmap_del_extent_real( xfs_iext_prev(ifp, icur); ifp->if_nextents--; - flags |= XFS_ILOG_CORE; + *logflagsp |= XFS_ILOG_CORE; if (!cur) { - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); break; } if ((error = xfs_btree_delete(cur, &i))) - goto done; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; - } + return error; + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; break; case BMAP_LEFT_FILLING: /* @@ -5114,12 +5125,12 @@ xfs_bmap_del_extent_real( got.br_blockcount -= del->br_blockcount; xfs_iext_update_extent(ip, state, icur, &got); if (!cur) { - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); break; } error = xfs_bmbt_update(cur, &got); if (error) - goto done; + return error; break; case BMAP_RIGHT_FILLING: /* @@ -5128,12 +5139,12 @@ xfs_bmap_del_extent_real( got.br_blockcount -= del->br_blockcount; xfs_iext_update_extent(ip, state, icur, &got); if 
(!cur) { - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); break; } error = xfs_bmbt_update(cur, &got); if (error) - goto done; + return error; break; case 0: /* @@ -5150,18 +5161,18 @@ xfs_bmap_del_extent_real( new.br_state = got.br_state; new.br_startblock = del_endblock; - flags |= XFS_ILOG_CORE; + *logflagsp |= XFS_ILOG_CORE; if (cur) { error = xfs_bmbt_update(cur, &got); if (error) - goto done; + return error; error = xfs_btree_increment(cur, 0, &i); if (error) - goto done; + return error; cur->bc_rec.b = new; error = xfs_btree_insert(cur, &i); if (error && error != -ENOSPC) - goto done; + return error; /* * If get no-space back from btree insert, it tried a * split, and we have a zero block reservation. Fix up @@ -5174,33 +5185,28 @@ xfs_bmap_del_extent_real( */ error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) - goto done; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; - } + return error; + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; /* * Update the btree record back * to the original value. */ error = xfs_bmbt_update(cur, &old); if (error) - goto done; + return error; /* * Reset the extent record back * to the original value. */ xfs_iext_update_extent(ip, state, icur, &old); - flags = 0; - error = -ENOSPC; - goto done; - } - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; + *logflagsp = 0; + return -ENOSPC; } + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } else - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); ifp->if_nextents++; xfs_iext_next(ifp, icur); @@ -5218,13 +5224,13 @@ xfs_bmap_del_extent_real( if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { xfs_refcount_decrease_extent(tp, del); } else { - error = __xfs_free_extent_later(tp, del->br_startblock, + error = xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, XFS_AG_RESV_NONE, ((bflags & XFS_BMAPI_NODISCARD) || del->br_state == XFS_EXT_UNWRITTEN)); if (error) - goto done; + return error; } } @@ -5239,9 +5245,7 @@ xfs_bmap_del_extent_real( if (qfield && !(bflags & XFS_BMAPI_REMAP)) xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); -done: - *logflagsp = flags; - return error; + return 0; } /* @@ -5250,7 +5254,7 @@ done: * that value. If not all extents in the block range can be removed then * *done is set. */ -int /* error */ +static int __xfs_bunmapi( struct xfs_trans *tp, /* transaction pointer */ struct xfs_inode *ip, /* incore inode */ @@ -6102,7 +6106,7 @@ __xfs_bmap_add( bi->bi_bmap = *bmap; xfs_bmap_update_get_group(tp->t_mountp, bi); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list); + xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type); return 0; } @@ -6179,19 +6183,18 @@ xfs_bmap_finish_one( return error; } -/* Check that an inode's extent does not have invalid flags or bad ranges. */ +/* Check that an extent does not have invalid flags or bad ranges. 
*/ xfs_failaddr_t -xfs_bmap_validate_extent( - struct xfs_inode *ip, +xfs_bmap_validate_extent_raw( + struct xfs_mount *mp, + bool rtfile, int whichfork, struct xfs_bmbt_irec *irec) { - struct xfs_mount *mp = ip->i_mount; - if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount)) return __this_address; - if (XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK) { + if (rtfile && whichfork == XFS_DATA_FORK) { if (!xfs_verify_rtbext(mp, irec->br_startblock, irec->br_blockcount)) return __this_address; @@ -6221,3 +6224,53 @@ xfs_bmap_intent_destroy_cache(void) kmem_cache_destroy(xfs_bmap_intent_cache); xfs_bmap_intent_cache = NULL; } + +/* Check that an inode's extent does not have invalid flags or bad ranges. */ +xfs_failaddr_t +xfs_bmap_validate_extent( + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *irec) +{ + return xfs_bmap_validate_extent_raw(ip->i_mount, + XFS_IS_REALTIME_INODE(ip), whichfork, irec); +} + +/* + * Used in xfs_itruncate_extents(). This is the maximum number of extents + * freed from a file in a single transaction. + */ +#define XFS_ITRUNC_MAX_EXTENTS 2 + +/* + * Unmap every extent in part of an inode's fork. We don't do any higher level + * invalidation work at all. + */ +int +xfs_bunmapi_range( + struct xfs_trans **tpp, + struct xfs_inode *ip, + uint32_t flags, + xfs_fileoff_t startoff, + xfs_fileoff_t endoff) +{ + xfs_filblks_t unmap_len = endoff - startoff + 1; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + while (unmap_len > 0) { + ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); + error = __xfs_bunmapi(*tpp, ip, startoff, &unmap_len, flags, + XFS_ITRUNC_MAX_EXTENTS); + if (error) + goto out; + + /* free the just unmapped extents */ + error = xfs_defer_finish(tpp); + if (error) + goto out; + } +out: + return error; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index e33470e39728..f6b73f1bad5f 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -116,6 +116,8 @@ static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags) return XFS_DATA_FORK; } +void xfs_bmap_alloc_account(struct xfs_bmalloca *ap); + /* * Special values for xfs_bmbt_irec_t br_startblock field. 
*/ @@ -190,9 +192,6 @@ int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap); -int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags, - xfs_extnum_t nexts); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done); @@ -263,6 +262,8 @@ static inline uint32_t xfs_bmap_fork_to_state(int whichfork) } } +xfs_failaddr_t xfs_bmap_validate_extent_raw(struct xfs_mount *mp, bool rtfile, + int whichfork, struct xfs_bmbt_irec *irec); xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *irec); int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork, @@ -271,6 +272,8 @@ int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork, int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, uint32_t flags); +int xfs_bunmapi_range(struct xfs_trans **tpp, struct xfs_inode *ip, + uint32_t flags, xfs_fileoff_t startoff, xfs_fileoff_t endoff); extern struct kmem_cache *xfs_bmap_intent_cache; diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index bf3f1b36fdd2..71f2d50f7823 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -15,6 +15,7 @@ #include "xfs_trans.h" #include "xfs_alloc.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_error.h" @@ -272,7 +273,7 @@ xfs_bmbt_free_block( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) return error; @@ -288,10 +289,7 @@ xfs_bmbt_get_minrecs( int level) { if (level == cur->bc_nlevels - 1) { - struct xfs_ifork *ifp; - - ifp = xfs_ifork_ptr(cur->bc_ino.ip, - cur->bc_ino.whichfork); + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0) / 2; @@ -306,10 +304,7 @@ xfs_bmbt_get_maxrecs( int level) { if (level == cur->bc_nlevels - 1) { - struct xfs_ifork *ifp; - - ifp = xfs_ifork_ptr(cur->bc_ino.ip, - cur->bc_ino.whichfork); + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0); @@ -543,23 +538,19 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .keys_contiguous = xfs_bmbt_keys_contiguous, }; -/* - * Allocate a new bmap btree cursor. 
- */ -struct xfs_btree_cur * /* new bmap btree cursor */ -xfs_bmbt_init_cursor( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* inode owning the btree */ - int whichfork) /* data or attr fork */ +static struct xfs_btree_cur * +xfs_bmbt_init_common( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork) { - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; + ASSERT(whichfork != XFS_COW_FORK); cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP, mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache); - cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); cur->bc_ops = &xfs_bmbt_ops; @@ -567,10 +558,30 @@ xfs_bmbt_init_cursor( if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); cur->bc_ino.ip = ip; cur->bc_ino.allocated = 0; cur->bc_ino.flags = 0; + + return cur; +} + +/* + * Allocate a new bmap btree cursor. + */ +struct xfs_btree_cur * +xfs_bmbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + struct xfs_btree_cur *cur; + + cur = xfs_bmbt_init_common(mp, tp, ip, whichfork); + + cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; + cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); cur->bc_ino.whichfork = whichfork; return cur; @@ -588,6 +599,76 @@ xfs_bmbt_block_maxrecs( } /* + * Allocate a new bmap btree cursor for reloading an inode block mapping data + * structure. Note that callers can use the staged cursor to reload extents + * format inode forks if they rebuild the iext tree and commit the staged + * cursor immediately. + */ +struct xfs_btree_cur * +xfs_bmbt_stage_cursor( + struct xfs_mount *mp, + struct xfs_inode *ip, + struct xbtree_ifakeroot *ifake) +{ + struct xfs_btree_cur *cur; + struct xfs_btree_ops *ops; + + /* data fork always has larger maxheight */ + cur = xfs_bmbt_init_common(mp, NULL, ip, XFS_DATA_FORK); + cur->bc_nlevels = ifake->if_levels; + cur->bc_ino.forksize = ifake->if_fork_size; + + /* Don't let anyone think we're attached to the real fork yet. */ + cur->bc_ino.whichfork = -1; + xfs_btree_stage_ifakeroot(cur, ifake, &ops); + ops->update_cursor = NULL; + return cur; +} + +/* + * Swap in the new inode fork root. Once we pass this point the newly rebuilt + * mappings are in place and we have to kill off any old btree blocks. + */ +void +xfs_bmbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + int whichfork) +{ + struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake; + struct xfs_ifork *ifp; + static const short brootflag[2] = {XFS_ILOG_DBROOT, XFS_ILOG_ABROOT}; + static const short extflag[2] = {XFS_ILOG_DEXT, XFS_ILOG_AEXT}; + int flags = XFS_ILOG_CORE; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(whichfork != XFS_COW_FORK); + + /* + * Free any resources hanging off the real fork, then shallow-copy the + * staging fork's contents into the real fork to transfer everything + * we just built. 
+ */ + ifp = xfs_ifork_ptr(cur->bc_ino.ip, whichfork); + xfs_idestroy_fork(ifp); + memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork)); + + switch (ifp->if_format) { + case XFS_DINODE_FMT_EXTENTS: + flags |= extflag[whichfork]; + break; + case XFS_DINODE_FMT_BTREE: + flags |= brootflag[whichfork]; + break; + default: + ASSERT(0); + break; + } + xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); + xfs_btree_commit_ifakeroot(cur, tp, whichfork, &xfs_bmbt_ops); +} + +/* * Calculate number of records in a bmap btree block. */ int diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 3e7a40a83835..151b8491f60e 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -11,6 +11,7 @@ struct xfs_btree_block; struct xfs_mount; struct xfs_inode; struct xfs_trans; +struct xbtree_ifakeroot; /* * Btree block header size depends on a superblock flag. @@ -106,6 +107,10 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); +struct xfs_btree_cur *xfs_bmbt_stage_cursor(struct xfs_mount *mp, + struct xfs_inode *ip, struct xbtree_ifakeroot *ifake); +void xfs_bmbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, int whichfork); extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp, unsigned long long len); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 6a6503ab0cd7..ea8d3659df20 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1330,7 +1330,7 @@ xfs_btree_get_buf_block( * Read in the buffer at the given ptr and return the buffer and * the block pointer within the buffer. */ -STATIC int +int xfs_btree_read_buf_block( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, @@ -5212,3 +5212,29 @@ xfs_btree_destroy_cur_caches(void) xfs_rmapbt_destroy_cur_cache(); xfs_refcountbt_destroy_cur_cache(); } + +/* Move the btree cursor before the first record. 
*/ +int +xfs_btree_goto_left_edge( + struct xfs_btree_cur *cur) +{ + int stat = 0; + int error; + + memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); + error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); + if (error) + return error; + if (!stat) + return 0; + + error = xfs_btree_decrement(cur, 0, &stat); + if (error) + return error; + if (stat != 0) { + ASSERT(0); + return -EFSCORRUPTED; + } + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 4d68a58be160..d906324e25c8 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -700,6 +700,9 @@ void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur, int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, struct xfs_btree_block **block, struct xfs_buf **bpp); +int xfs_btree_read_buf_block(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, int flags, + struct xfs_btree_block **block, struct xfs_buf **bpp); void xfs_btree_set_sibling(struct xfs_btree_cur *cur, struct xfs_btree_block *block, const union xfs_btree_ptr *ptr, int lr); @@ -735,4 +738,6 @@ xfs_btree_alloc_cursor( int __init xfs_btree_init_cur_caches(void); void xfs_btree_destroy_cur_caches(void); +int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index dd75e208b543..e276eba87cb1 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -333,20 +333,41 @@ xfs_btree_commit_ifakeroot( /* * Put a btree block that we're loading onto the ordered list and release it. * The btree blocks will be written to disk when bulk loading is finished. + * If we reach the dirty buffer threshold, flush them to disk before + * continuing. */ -static void +static int xfs_btree_bload_drop_buf( - struct list_head *buffers_list, - struct xfs_buf **bpp) + struct xfs_btree_bload *bbl, + struct list_head *buffers_list, + struct xfs_buf **bpp) { - if (*bpp == NULL) - return; + struct xfs_buf *bp = *bpp; + int error; + + if (!bp) + return 0; - if (!xfs_buf_delwri_queue(*bpp, buffers_list)) - ASSERT(0); + /* + * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent + * xfs_buf_read will not pointlessly reread the contents from the disk. + */ + bp->b_flags |= XBF_DONE; - xfs_buf_relse(*bpp); + xfs_buf_delwri_queue_here(bp, buffers_list); + xfs_buf_relse(bp); *bpp = NULL; + bbl->nr_dirty++; + + if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty) + return 0; + + error = xfs_buf_delwri_submit(buffers_list); + if (error) + return error; + + bbl->nr_dirty = 0; + return 0; } /* @@ -384,7 +405,7 @@ xfs_btree_bload_prep_block( ASSERT(*bpp == NULL); /* Allocate a new incore btree root block. */ - new_size = bbl->iroot_size(cur, nr_this_block, priv); + new_size = bbl->iroot_size(cur, level, nr_this_block, priv); ifp->if_broot = kmem_zalloc(new_size, 0); ifp->if_broot_bytes = (int)new_size; @@ -418,7 +439,10 @@ xfs_btree_bload_prep_block( */ if (*blockp) xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB); - xfs_btree_bload_drop_buf(buffers_list, bpp); + + ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp); + if (ret) + return ret; /* Initialize the new btree block. 
*/ xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block); @@ -436,22 +460,19 @@ STATIC int xfs_btree_bload_leaf( struct xfs_btree_cur *cur, unsigned int recs_this_block, - xfs_btree_bload_get_record_fn get_record, + xfs_btree_bload_get_records_fn get_records, struct xfs_btree_block *block, void *priv) { - unsigned int j; + unsigned int j = 1; int ret; /* Fill the leaf block with records. */ - for (j = 1; j <= recs_this_block; j++) { - union xfs_btree_rec *block_rec; - - ret = get_record(cur, priv); - if (ret) + while (j <= recs_this_block) { + ret = get_records(cur, j, block, recs_this_block - j + 1, priv); + if (ret < 0) return ret; - block_rec = xfs_btree_rec_addr(cur, j, block); - cur->bc_ops->init_rec_from_cur(cur, block_rec); + j += ret; } return 0; @@ -485,7 +506,12 @@ xfs_btree_bload_node( ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr)); - ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block, + /* + * Read the lower-level block in case the buffer for it has + * been reclaimed. LRU refs will be set on the block, which is + * desirable if the new btree commits. + */ + ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block, &child_bp); if (ret) return ret; @@ -570,7 +596,14 @@ xfs_btree_bload_level_geometry( unsigned int desired_npb; unsigned int maxnr; - maxnr = cur->bc_ops->get_maxrecs(cur, level); + /* + * Compute the absolute maximum number of records that we can store in + * the ondisk block or inode root. + */ + if (cur->bc_ops->get_dmaxrecs) + maxnr = cur->bc_ops->get_dmaxrecs(cur, level); + else + maxnr = cur->bc_ops->get_maxrecs(cur, level); /* * Compute the number of blocks we need to fill each block with the @@ -764,6 +797,7 @@ xfs_btree_bload( cur->bc_nlevels = bbl->btree_height; xfs_btree_set_ptr_null(cur, &child_ptr); xfs_btree_set_ptr_null(cur, &ptr); + bbl->nr_dirty = 0; xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, &avg_per_block, &blocks, &blocks_with_extra); @@ -789,7 +823,7 @@ xfs_btree_bload( trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr, nr_this_block); - ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record, + ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records, block, priv); if (ret) goto out; @@ -802,7 +836,10 @@ xfs_btree_bload( xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1); } total_blocks += blocks; - xfs_btree_bload_drop_buf(&buffers_list, &bp); + + ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp); + if (ret) + goto out; /* Populate the internal btree nodes. */ for (level = 1; level < cur->bc_nlevels; level++) { @@ -844,7 +881,11 @@ xfs_btree_bload( xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1); } total_blocks += blocks; - xfs_btree_bload_drop_buf(&buffers_list, &bp); + + ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp); + if (ret) + goto out; + xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1); } diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h index f0d2976050ae..055ea43b1e18 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.h +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -37,12 +37,6 @@ struct xbtree_ifakeroot { /* Number of bytes available for this fork in the inode. */ unsigned int if_fork_size; - - /* Fork format. */ - unsigned int if_format; - - /* Number of records. */ - unsigned int if_extents; }; /* Cursor interactions with fake roots for inode-rooted btrees. 
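 *
 * Rough sketch of the intended lifecycle for an inode-rooted (bmbt) btree,
 * using helpers added elsewhere in this series; error handling elided:
 *
 *	cur = xfs_bmbt_stage_cursor(mp, ip, ifake);
 *	error = xfs_btree_bload_compute_geometry(cur, &bbl, nr_records);
 *	error = xfs_btree_bload(cur, &bbl, priv);
 *	xfs_bmbt_commit_staged_btree(cur, tp, whichfork);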
*/ @@ -53,19 +47,24 @@ void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, int whichfork, const struct xfs_btree_ops *ops); /* Bulk loading of staged btrees. */ -typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv); +typedef int (*xfs_btree_bload_get_records_fn)(struct xfs_btree_cur *cur, + unsigned int idx, struct xfs_btree_block *block, + unsigned int nr_wanted, void *priv); typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, void *priv); typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur, - unsigned int nr_this_level, void *priv); + unsigned int level, unsigned int nr_this_level, void *priv); struct xfs_btree_bload { /* - * This function will be called nr_records times to load records into - * the btree. The function does this by setting the cursor's bc_rec - * field in in-core format. Records must be returned in sort order. + * This function will be called to load @nr_wanted records into the + * btree. The implementation does this by setting the cursor's bc_rec + * field in in-core format and using init_rec_from_cur to set the + * records in the btree block. Records must be returned in sort order. + * The function must return the number of records loaded or the usual + * negative errno. */ - xfs_btree_bload_get_record_fn get_record; + xfs_btree_bload_get_records_fn get_records; /* * This function will be called nr_blocks times to obtain a pointer @@ -113,6 +112,16 @@ struct xfs_btree_bload { * height of the new btree. */ unsigned int btree_height; + + /* + * Flush the new btree block buffer list to disk after this many blocks + * have been formatted. Zero prohibits writing any buffers until all + * blocks have been formatted. + */ + uint16_t max_dirty; + + /* Number of dirty buffers. */ + uint16_t nr_dirty; }; int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur, diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index e576560b46e9..5457188bb4de 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -421,6 +421,25 @@ xfs_da3_node_read_mapped( return xfs_da3_node_set_type(tp, *bpp); } +/* + * Copy src directory/attr leaf/node buffer to the dst. + * For v5 file systems make sure the right blkno is stamped in. + */ +void +xfs_da_buf_copy( + struct xfs_buf *dst, + struct xfs_buf *src, + size_t size) +{ + struct xfs_da3_blkinfo *da3 = dst->b_addr; + + memcpy(dst->b_addr, src->b_addr, size); + dst->b_ops = src->b_ops; + xfs_trans_buf_copy_type(dst, src); + if (xfs_has_crc(dst->b_mount)) + da3->blkno = cpu_to_be64(xfs_buf_daddr(dst)); +} + /*======================================================================== * Routines used for growing the Btree. *========================================================================*/ @@ -690,12 +709,6 @@ xfs_da3_root_split( btree = icnodehdr.btree; size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot); level = icnodehdr.level; - - /* - * we are about to copy oldroot to bp, so set up the type - * of bp while we know exactly what it will be. - */ - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); } else { struct xfs_dir3_icleaf_hdr leafhdr; @@ -707,31 +720,17 @@ xfs_da3_root_split( size = (int)((char *)&leafhdr.ents[leafhdr.count] - (char *)leaf); level = 0; - - /* - * we are about to copy oldroot to bp, so set up the type - * of bp while we know exactly what it will be. 
- */ - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); } /* - * we can copy most of the information in the node from one block to - * another, but for CRC enabled headers we have to make sure that the - * block specific identifiers are kept intact. We update the buffer - * directly for this. + * Copy old root to new buffer and log it. */ - memcpy(node, oldroot, size); - if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || - oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { - struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node; - - node3->hdr.info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); - } + xfs_da_buf_copy(bp, blk1->bp, size); xfs_trans_log_buf(tp, bp, 0, size - 1); - bp->b_ops = blk1->bp->b_ops; - xfs_trans_buf_copy_type(bp, blk1->bp); + /* + * Update blk1 to point to new buffer. + */ blk1->bp = bp; blk1->blkno = blkno; @@ -1220,21 +1219,14 @@ xfs_da3_root_join( xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level); /* - * This could be copying a leaf back into the root block in the case of - * there only being a single leaf block left in the tree. Hence we have - * to update the b_ops pointer as well to match the buffer type change - * that could occur. For dir3 blocks we also need to update the block - * number in the buffer header. + * Copy child to root buffer and log it. */ - memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize); - root_blk->bp->b_ops = bp->b_ops; - xfs_trans_buf_copy_type(root_blk->bp, bp); - if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) { - struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr; - da3->blkno = cpu_to_be64(xfs_buf_daddr(root_blk->bp)); - } + xfs_da_buf_copy(root_blk->bp, bp, args->geo->blksize); xfs_trans_log_buf(args->trans, root_blk->bp, 0, args->geo->blksize - 1); + /* + * Now we can drop the child buffer. + */ error = xfs_da_shrink_inode(args, child, bp); return error; } @@ -2317,9 +2309,10 @@ xfs_da3_swap_lastblock( /* * Copy the last block into the dead buffer and log it. */ - memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize); + xfs_da_buf_copy(dead_buf, last_buf, args->geo->blksize); xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1); dead_info = dead_buf->b_addr; + /* * Get values from the moved block. */ diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index ffa3df5b2893..706baf36e175 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -219,6 +219,8 @@ int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); +void xfs_da_buf_copy(struct xfs_buf *dst, struct xfs_buf *src, + size_t size); uint xfs_da_hashname(const uint8_t *name_string, int name_length); enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index f9015f88eca7..24f9d1461f9a 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -578,20 +578,25 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp) #define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ /* - * Entries are packed toward the top as tight as possible. 
- */ -struct xfs_attr_shortform { - struct xfs_attr_sf_hdr { /* constant-structure header block */ - __be16 totsize; /* total bytes in shortform list */ - __u8 count; /* count of active entries */ - __u8 padding; - } hdr; - struct xfs_attr_sf_entry { - uint8_t namelen; /* actual length of name (no NULL) */ - uint8_t valuelen; /* actual length of value (no NULL) */ - uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ - uint8_t nameval[]; /* name & value bytes concatenated */ - } list[]; /* variable sized array */ + * Attribute storage when stored inside the inode. + * + * Small attribute lists are packed as tightly as possible so as to fit into the + * literal area of the inode. + * + * These "shortform" attribute forks consist of a single xfs_attr_sf_hdr header + * followed by zero or more xfs_attr_sf_entry structures. + */ +struct xfs_attr_sf_hdr { /* constant-structure header block */ + __be16 totsize; /* total bytes in shortform list */ + __u8 count; /* count of active entries */ + __u8 padding; +}; + +struct xfs_attr_sf_entry { + __u8 namelen; /* actual length of name (no NULL) */ + __u8 valuelen; /* actual length of value (no NULL) */ + __u8 flags; /* flags bits (XFS_ATTR_*) */ + __u8 nameval[]; /* name & value bytes concatenated */ }; typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index f71679ce23b9..66a17910d021 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -26,6 +26,7 @@ #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr.h" +#include "xfs_trans_priv.h" static struct kmem_cache *xfs_defer_pending_cache; @@ -181,16 +182,89 @@ static struct kmem_cache *xfs_defer_pending_cache; * Note that the continuation requested between t2 and t3 is likely to * reoccur. */ +STATIC struct xfs_log_item * +xfs_defer_barrier_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return NULL; +} -static const struct xfs_defer_op_type *defer_op_types[] = { - [XFS_DEFER_OPS_TYPE_BMAP] = &xfs_bmap_update_defer_type, - [XFS_DEFER_OPS_TYPE_REFCOUNT] = &xfs_refcount_update_defer_type, - [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type, - [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type, - [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, - [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type, +STATIC void +xfs_defer_barrier_abort_intent( + struct xfs_log_item *intent) +{ + /* empty */ +} + +STATIC struct xfs_log_item * +xfs_defer_barrier_create_done( + struct xfs_trans *tp, + struct xfs_log_item *intent, + unsigned int count) +{ + return NULL; +} + +STATIC int +xfs_defer_barrier_finish_item( + struct xfs_trans *tp, + struct xfs_log_item *done, + struct list_head *item, + struct xfs_btree_cur **state) +{ + ASSERT(0); + return -EFSCORRUPTED; +} + +STATIC void +xfs_defer_barrier_cancel_item( + struct list_head *item) +{ + ASSERT(0); +} + +static const struct xfs_defer_op_type xfs_barrier_defer_type = { + .max_items = 1, + .create_intent = xfs_defer_barrier_create_intent, + .abort_intent = xfs_defer_barrier_abort_intent, + .create_done = xfs_defer_barrier_create_done, + .finish_item = xfs_defer_barrier_finish_item, + .cancel_item = xfs_defer_barrier_cancel_item, }; +/* Create a log intent done item for a log intent item. 
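+ *
+ * Note: defer op types whose ->create_intent returns NULL (such as the
+ * barrier type above) never log a done item either, because dfp_intent
+ * stays NULL and this helper returns immediately.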
*/ +static inline void +xfs_defer_create_done( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + struct xfs_log_item *lip; + + /* If there is no log intent item, there can be no log done item. */ + if (!dfp->dfp_intent) + return; + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the log intent item and frees the log done item + * 2.) shuts down the filesystem + */ + tp->t_flags |= XFS_TRANS_DIRTY; + lip = dfp->dfp_ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); + if (!lip) + return; + + tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE; + xfs_trans_add_item(tp, lip); + set_bit(XFS_LI_DIRTY, &lip->li_flags); + dfp->dfp_done = lip; +} + /* * Ensure there's a log intent item associated with this deferred work item if * the operation must be restarted on crash. Returns 1 if there's a log item; @@ -202,18 +276,21 @@ xfs_defer_create_intent( struct xfs_defer_pending *dfp, bool sort) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; struct xfs_log_item *lip; if (dfp->dfp_intent) return 1; - lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort); + lip = dfp->dfp_ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, + sort); if (!lip) return 0; if (IS_ERR(lip)) return PTR_ERR(lip); + tp->t_flags |= XFS_TRANS_DIRTY; + xfs_trans_add_item(tp, lip); + set_bit(XFS_LI_DIRTY, &lip->li_flags); dfp->dfp_intent = lip; return 1; } @@ -245,23 +322,50 @@ xfs_defer_create_intents( return ret; } -STATIC void +static inline void xfs_defer_pending_abort( struct xfs_mount *mp, + struct xfs_defer_pending *dfp) +{ + trace_xfs_defer_pending_abort(mp, dfp); + + if (dfp->dfp_intent && !dfp->dfp_done) { + dfp->dfp_ops->abort_intent(dfp->dfp_intent); + dfp->dfp_intent = NULL; + } +} + +static inline void +xfs_defer_pending_cancel_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp) +{ + struct list_head *pwi; + struct list_head *n; + + trace_xfs_defer_cancel_list(mp, dfp); + + list_del(&dfp->dfp_list); + list_for_each_safe(pwi, n, &dfp->dfp_work) { + list_del(pwi); + dfp->dfp_count--; + trace_xfs_defer_cancel_item(mp, dfp, pwi); + dfp->dfp_ops->cancel_item(pwi); + } + ASSERT(dfp->dfp_count == 0); + kmem_cache_free(xfs_defer_pending_cache, dfp); +} + +STATIC void +xfs_defer_pending_abort_list( + struct xfs_mount *mp, struct list_head *dop_list) { struct xfs_defer_pending *dfp; - const struct xfs_defer_op_type *ops; /* Abort intent items that don't have a done item. */ - list_for_each_entry(dfp, dop_list, dfp_list) { - ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_pending_abort(mp, dfp); - if (dfp->dfp_intent && !dfp->dfp_done) { - ops->abort_intent(dfp->dfp_intent); - dfp->dfp_intent = NULL; - } - } + list_for_each_entry(dfp, dop_list, dfp_list) + xfs_defer_pending_abort(mp, dfp); } /* Abort all the intents that were committed. */ @@ -271,7 +375,7 @@ xfs_defer_trans_abort( struct list_head *dop_pending) { trace_xfs_defer_trans_abort(tp, _RET_IP_); - xfs_defer_pending_abort(tp->t_mountp, dop_pending); + xfs_defer_pending_abort_list(tp->t_mountp, dop_pending); } /* @@ -389,27 +493,31 @@ xfs_defer_cancel_list( { struct xfs_defer_pending *dfp; struct xfs_defer_pending *pli; - struct list_head *pwi; - struct list_head *n; - const struct xfs_defer_op_type *ops; /* * Free the pending items. Caller should already have arranged * for the intent items to be released. 
*/ - list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) { - ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_cancel_list(mp, dfp); - list_del(&dfp->dfp_list); - list_for_each_safe(pwi, n, &dfp->dfp_work) { - list_del(pwi); - dfp->dfp_count--; - trace_xfs_defer_cancel_item(mp, dfp, pwi); - ops->cancel_item(pwi); - } - ASSERT(dfp->dfp_count == 0); - kmem_cache_free(xfs_defer_pending_cache, dfp); + list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) + xfs_defer_pending_cancel_work(mp, dfp); +} + +static inline void +xfs_defer_relog_intent( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + struct xfs_log_item *lip; + + xfs_defer_create_done(tp, dfp); + + lip = dfp->dfp_ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done); + if (lip) { + xfs_trans_add_item(tp, lip); + set_bit(XFS_LI_DIRTY, &lip->li_flags); } + dfp->dfp_done = NULL; + dfp->dfp_intent = lip; } /* @@ -417,7 +525,7 @@ xfs_defer_cancel_list( * done item to release the intent item; and then log a new intent item. * The caller should provide a fresh transaction and roll it after we're done. */ -static int +static void xfs_defer_relog( struct xfs_trans **tpp, struct list_head *dfops) @@ -456,31 +564,28 @@ xfs_defer_relog( trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); XFS_STATS_INC((*tpp)->t_mountp, defer_relog); - dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp); - } - if ((*tpp)->t_flags & XFS_TRANS_DIRTY) - return xfs_defer_trans_roll(tpp); - return 0; + xfs_defer_relog_intent(*tpp, dfp); + } } /* * Log an intent-done item for the first pending intent, and finish the work * items. */ -static int +int xfs_defer_finish_one( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + const struct xfs_defer_op_type *ops = dfp->dfp_ops; struct xfs_btree_cur *state = NULL; struct list_head *li, *n; int error; trace_xfs_defer_pending_finish(tp->t_mountp, dfp); - dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); + xfs_defer_create_done(tp, dfp); list_for_each_safe(li, n, &dfp->dfp_work) { list_del(li); dfp->dfp_count--; @@ -517,6 +622,24 @@ out: return error; } +/* Move all paused deferred work from @tp to @paused_list. */ +static void +xfs_defer_isolate_paused( + struct xfs_trans *tp, + struct list_head *paused_list) +{ + struct xfs_defer_pending *dfp; + struct xfs_defer_pending *pli; + + list_for_each_entry_safe(dfp, pli, &tp->t_dfops, dfp_list) { + if (!(dfp->dfp_flags & XFS_DEFER_PAUSED)) + continue; + + list_move_tail(&dfp->dfp_list, paused_list); + trace_xfs_defer_isolate_paused(tp->t_mountp, dfp); + } +} + /* * Finish all the pending work. This involves logging intent items for * any work items that wandered in since the last transaction roll (if @@ -532,6 +655,7 @@ xfs_defer_finish_noroll( struct xfs_defer_pending *dfp = NULL; int error = 0; LIST_HEAD(dop_pending); + LIST_HEAD(dop_paused); ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -550,6 +674,8 @@ xfs_defer_finish_noroll( */ int has_intents = xfs_defer_create_intents(*tp); + xfs_defer_isolate_paused(*tp, &dop_paused); + list_splice_init(&(*tp)->t_dfops, &dop_pending); if (has_intents < 0) { @@ -562,22 +688,33 @@ xfs_defer_finish_noroll( goto out_shutdown; /* Relog intent items to keep the log moving. 
*/ - error = xfs_defer_relog(tp, &dop_pending); - if (error) - goto out_shutdown; + xfs_defer_relog(tp, &dop_pending); + xfs_defer_relog(tp, &dop_paused); + + if ((*tp)->t_flags & XFS_TRANS_DIRTY) { + error = xfs_defer_trans_roll(tp); + if (error) + goto out_shutdown; + } } - dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, - dfp_list); + dfp = list_first_entry_or_null(&dop_pending, + struct xfs_defer_pending, dfp_list); + if (!dfp) + break; error = xfs_defer_finish_one(*tp, dfp); if (error && error != -EAGAIN) goto out_shutdown; } + /* Requeue the paused items in the outgoing transaction. */ + list_splice_tail_init(&dop_paused, &(*tp)->t_dfops); + trace_xfs_defer_finish_done(*tp, _RET_IP_); return 0; out_shutdown: + list_splice_tail_init(&dop_paused, &dop_pending); xfs_defer_trans_abort(*tp, &dop_pending); xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); trace_xfs_defer_finish_error(*tp, error); @@ -590,6 +727,9 @@ int xfs_defer_finish( struct xfs_trans **tp) { +#ifdef DEBUG + struct xfs_defer_pending *dfp; +#endif int error; /* @@ -609,7 +749,10 @@ xfs_defer_finish( } /* Reset LOWMODE now that we've finished all the dfops. */ - ASSERT(list_empty(&(*tp)->t_dfops)); +#ifdef DEBUG + list_for_each_entry(dfp, &(*tp)->t_dfops, dfp_list) + ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); +#endif (*tp)->t_flags &= ~XFS_TRANS_LOWMODE; return 0; } @@ -621,48 +764,165 @@ xfs_defer_cancel( struct xfs_mount *mp = tp->t_mountp; trace_xfs_defer_cancel(tp, _RET_IP_); + xfs_defer_trans_abort(tp, &tp->t_dfops); xfs_defer_cancel_list(mp, &tp->t_dfops); } +/* + * Return the last pending work item attached to this transaction if it matches + * the deferred op type. + */ +static inline struct xfs_defer_pending * +xfs_defer_find_last( + struct xfs_trans *tp, + const struct xfs_defer_op_type *ops) +{ + struct xfs_defer_pending *dfp = NULL; + + /* No dfops at all? */ + if (list_empty(&tp->t_dfops)) + return NULL; + + dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, + dfp_list); + + /* Wrong type? */ + if (dfp->dfp_ops != ops) + return NULL; + return dfp; +} + +/* + * Decide if we can add a deferred work item to the last dfops item attached + * to the transaction. + */ +static inline bool +xfs_defer_can_append( + struct xfs_defer_pending *dfp, + const struct xfs_defer_op_type *ops) +{ + /* Already logged? */ + if (dfp->dfp_intent) + return false; + + /* Paused items cannot absorb more work */ + if (dfp->dfp_flags & XFS_DEFER_PAUSED) + return NULL; + + /* Already full? */ + if (ops->max_items && dfp->dfp_count >= ops->max_items) + return false; + + return true; +} + +/* Create a new pending item at the end of the transaction list. */ +static inline struct xfs_defer_pending * +xfs_defer_alloc( + struct xfs_trans *tp, + const struct xfs_defer_op_type *ops) +{ + struct xfs_defer_pending *dfp; + + dfp = kmem_cache_zalloc(xfs_defer_pending_cache, + GFP_NOFS | __GFP_NOFAIL); + dfp->dfp_ops = ops; + INIT_LIST_HEAD(&dfp->dfp_work); + list_add_tail(&dfp->dfp_list, &tp->t_dfops); + + return dfp; +} + /* Add an item for later deferred processing. 
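 *
 * A hypothetical caller now passes the defer op type directly rather than an
 * enum value, e.g. roughly:
 *
 *	xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type);
 *
 * and may keep the returned pending item around to pause or unpause it.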
*/ -void +struct xfs_defer_pending * xfs_defer_add( struct xfs_trans *tp, - enum xfs_defer_ops_type type, - struct list_head *li) + struct list_head *li, + const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp = NULL; - const struct xfs_defer_op_type *ops = defer_op_types[type]; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); - /* - * Add the item to a pending item at the end of the intake list. - * If the last pending item has the same type, reuse it. Else, - * create a new pending item at the end of the intake list. - */ - if (!list_empty(&tp->t_dfops)) { - dfp = list_last_entry(&tp->t_dfops, - struct xfs_defer_pending, dfp_list); - if (dfp->dfp_type != type || - (ops->max_items && dfp->dfp_count >= ops->max_items)) - dfp = NULL; - } - if (!dfp) { - dfp = kmem_cache_zalloc(xfs_defer_pending_cache, - GFP_NOFS | __GFP_NOFAIL); - dfp->dfp_type = type; - dfp->dfp_intent = NULL; - dfp->dfp_done = NULL; - dfp->dfp_count = 0; - INIT_LIST_HEAD(&dfp->dfp_work); - list_add_tail(&dfp->dfp_list, &tp->t_dfops); - } + dfp = xfs_defer_find_last(tp, ops); + if (!dfp || !xfs_defer_can_append(dfp, ops)) + dfp = xfs_defer_alloc(tp, ops); - list_add_tail(li, &dfp->dfp_work); + xfs_defer_add_item(dfp, li); trace_xfs_defer_add_item(tp->t_mountp, dfp, li); - dfp->dfp_count++; + return dfp; +} + +/* + * Add a defer ops barrier to force two otherwise adjacent deferred work items + * to be tracked separately and have separate log items. + */ +void +xfs_defer_add_barrier( + struct xfs_trans *tp) +{ + struct xfs_defer_pending *dfp; + + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + + /* If the last defer op added was a barrier, we're done. */ + dfp = xfs_defer_find_last(tp, &xfs_barrier_defer_type); + if (dfp) + return; + + xfs_defer_alloc(tp, &xfs_barrier_defer_type); + + trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL); +} + +/* + * Create a pending deferred work item to replay the recovered intent item + * and add it to the list. + */ +void +xfs_defer_start_recovery( + struct xfs_log_item *lip, + struct list_head *r_dfops, + const struct xfs_defer_op_type *ops) +{ + struct xfs_defer_pending *dfp; + + dfp = kmem_cache_zalloc(xfs_defer_pending_cache, + GFP_NOFS | __GFP_NOFAIL); + dfp->dfp_ops = ops; + dfp->dfp_intent = lip; + INIT_LIST_HEAD(&dfp->dfp_work); + list_add_tail(&dfp->dfp_list, r_dfops); +} + +/* + * Cancel a deferred work item created to recover a log intent item. @dfp + * will be freed after this function returns. + */ +void +xfs_defer_cancel_recovery( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp) +{ + xfs_defer_pending_abort(mp, dfp); + xfs_defer_pending_cancel_work(mp, dfp); +} + +/* Replay the deferred work item created from a recovered log intent item. 
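+ *
+ * Log recovery is expected to pair this with the helpers above: pending work
+ * is queued with xfs_defer_start_recovery(), replayed here, and torn down
+ * with xfs_defer_cancel_recovery() if recovery is abandoned.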
*/ +int +xfs_defer_finish_recovery( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct list_head *capture_list) +{ + const struct xfs_defer_op_type *ops = dfp->dfp_ops; + int error; + + /* dfp is freed by recover_work and must not be accessed afterwards */ + error = ops->recover_work(dfp, capture_list); + if (error) + trace_xlog_intent_recovery_failed(mp, ops, error); + return error; } /* @@ -769,7 +1029,7 @@ xfs_defer_ops_capture_abort( { unsigned short i; - xfs_defer_pending_abort(mp, &dfc->dfc_dfops); + xfs_defer_pending_abort_list(mp, &dfc->dfc_dfops); xfs_defer_cancel_list(mp, &dfc->dfc_dfops); for (i = 0; i < dfc->dfc_held.dr_bufs; i++) @@ -938,3 +1198,36 @@ xfs_defer_destroy_item_caches(void) xfs_rmap_intent_destroy_cache(); xfs_defer_destroy_cache(); } + +/* + * Mark a deferred work item so that it will be requeued indefinitely without + * being finished. Caller must ensure there are no data dependencies on this + * work item in the meantime. + */ +void +xfs_defer_item_pause( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + ASSERT(!(dfp->dfp_flags & XFS_DEFER_PAUSED)); + + dfp->dfp_flags |= XFS_DEFER_PAUSED; + + trace_xfs_defer_item_pause(tp->t_mountp, dfp); +} + +/* + * Release a paused deferred work item so that it will be finished during the + * next transaction roll. + */ +void +xfs_defer_item_unpause( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); + + dfp->dfp_flags &= ~XFS_DEFER_PAUSED; + + trace_xfs_defer_item_unpause(tp->t_mountp, dfp); +} diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 8788ad5f6a73..18a9fb92dde8 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -11,19 +11,6 @@ struct xfs_defer_op_type; struct xfs_defer_capture; /* - * Header for deferred operation list. - */ -enum xfs_defer_ops_type { - XFS_DEFER_OPS_TYPE_BMAP, - XFS_DEFER_OPS_TYPE_REFCOUNT, - XFS_DEFER_OPS_TYPE_RMAP, - XFS_DEFER_OPS_TYPE_FREE, - XFS_DEFER_OPS_TYPE_AGFL_FREE, - XFS_DEFER_OPS_TYPE_ATTR, - XFS_DEFER_OPS_TYPE_MAX, -}; - -/* * Save a log intent item and a list of extents, so that we can replay * whatever action had to happen to the extent list and file the log done * item. @@ -33,19 +20,35 @@ struct xfs_defer_pending { struct list_head dfp_work; /* work items */ struct xfs_log_item *dfp_intent; /* log intent item */ struct xfs_log_item *dfp_done; /* log done item */ + const struct xfs_defer_op_type *dfp_ops; unsigned int dfp_count; /* # extent items */ - enum xfs_defer_ops_type dfp_type; + unsigned int dfp_flags; }; -void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type, - struct list_head *h); +/* + * Create a log intent item for this deferred item, but don't actually finish + * the work. Caller must clear this before the final transaction commit. 
+ */ +#define XFS_DEFER_PAUSED (1U << 0) + +#define XFS_DEFER_PENDING_STRINGS \ + { XFS_DEFER_PAUSED, "paused" } + +void xfs_defer_item_pause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); +void xfs_defer_item_unpause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); + +struct xfs_defer_pending *xfs_defer_add(struct xfs_trans *tp, struct list_head *h, + const struct xfs_defer_op_type *ops); int xfs_defer_finish_noroll(struct xfs_trans **tp); int xfs_defer_finish(struct xfs_trans **tp); +int xfs_defer_finish_one(struct xfs_trans *tp, struct xfs_defer_pending *dfp); void xfs_defer_cancel(struct xfs_trans *); void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp); /* Description of a deferred type. */ struct xfs_defer_op_type { + const char *name; + unsigned int max_items; struct xfs_log_item *(*create_intent)(struct xfs_trans *tp, struct list_head *items, unsigned int count, bool sort); void (*abort_intent)(struct xfs_log_item *intent); @@ -56,7 +59,11 @@ struct xfs_defer_op_type { void (*finish_cleanup)(struct xfs_trans *tp, struct xfs_btree_cur *state, int error); void (*cancel_item)(struct list_head *item); - unsigned int max_items; + int (*recover_work)(struct xfs_defer_pending *dfp, + struct list_head *capture_list); + struct xfs_log_item *(*relog_intent)(struct xfs_trans *tp, + struct xfs_log_item *intent, + struct xfs_log_item *done_item); }; extern const struct xfs_defer_op_type xfs_bmap_update_defer_type; @@ -125,7 +132,25 @@ void xfs_defer_ops_capture_abort(struct xfs_mount *mp, struct xfs_defer_capture *d); void xfs_defer_resources_rele(struct xfs_defer_resources *dres); +void xfs_defer_start_recovery(struct xfs_log_item *lip, + struct list_head *r_dfops, const struct xfs_defer_op_type *ops); +void xfs_defer_cancel_recovery(struct xfs_mount *mp, + struct xfs_defer_pending *dfp); +int xfs_defer_finish_recovery(struct xfs_mount *mp, + struct xfs_defer_pending *dfp, struct list_head *capture_list); + +static inline void +xfs_defer_add_item( + struct xfs_defer_pending *dfp, + struct list_head *work) +{ + list_add_tail(work, &dfp->dfp_work); + dfp->dfp_count++; +} + int __init xfs_defer_init_item_caches(void); void xfs_defer_destroy_item_caches(void); +void xfs_defer_add_barrier(struct xfs_trans *tp); + #endif /* __XFS_DEFER_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index f5462fd582d5..a76673281514 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -196,7 +196,7 @@ xfs_dir_isempty( return 1; if (dp->i_disk_size > xfs_inode_data_fork_size(dp)) return 0; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; return !sfp->count; } diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 00f960a703b2..3c256d4cc40b 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -1089,7 +1089,7 @@ xfs_dir2_sf_to_block( int newoffset; /* offset from current entry */ unsigned int offset = geo->data_entry_offset; xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */ - xfs_dir2_sf_hdr_t *oldsfp; /* old shortform header */ + struct xfs_dir2_sf_hdr *oldsfp = ifp->if_data; xfs_dir2_sf_hdr_t *sfp; /* shortform header */ __be16 *tagp; /* end of data entry */ struct xfs_name name; @@ -1099,10 +1099,8 @@ xfs_dir2_sf_to_block( ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); - oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data; - ASSERT(ifp->if_bytes == dp->i_disk_size); - ASSERT(ifp->if_u1.if_data != 
NULL); + ASSERT(oldsfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); ASSERT(dp->i_df.if_nextents == 0); diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 7404a9ff1a92..1db2e60ba827 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -175,7 +175,8 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); -extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip); +xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *sfp, int64_t size); int xfs_dir2_sf_entsize(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, int len); void xfs_dir2_sf_put_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 8cd37e6e9d38..e1f83fc7b6ad 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -364,25 +364,23 @@ int /* error */ xfs_dir2_sf_addname( xfs_da_args_t *args) /* operation arguments */ { - xfs_inode_t *dp; /* incore directory inode */ + struct xfs_inode *dp = args->dp; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int error; /* error return value */ int incr_isize; /* total change in size */ int new_isize; /* size after adding name */ int objchange; /* changing to 8-byte inodes */ xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */ int pick; /* which algorithm to use */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */ trace_xfs_dir2_sf_addname(args); ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT); - dp = args->dp; ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Compute entry (and change in) size. @@ -462,20 +460,17 @@ xfs_dir2_sf_addname_easy( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; - int byteoff; /* byte offset in sf dir */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; + int byteoff = (int)((char *)sfep - (char *)sfp); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - byteoff = (int)((char *)sfep - (char *)sfp); /* * Grow the in-inode space. */ - xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen), + sfp = xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen), XFS_DATA_FORK); /* * Need to set up again due to realloc of the inode data. */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff); /* * Fill in the new entry. @@ -528,11 +523,10 @@ xfs_dir2_sf_addname_hard( /* * Copy the old directory to the stack buffer. */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; old_isize = (int)dp->i_disk_size; buf = kmem_alloc(old_isize, 0); oldsfp = (xfs_dir2_sf_hdr_t *)buf; - memcpy(oldsfp, sfp, old_isize); + memcpy(oldsfp, dp->i_df.if_data, old_isize); /* * Loop over the old directory finding the place we're going * to insert the new entry. @@ -556,11 +550,8 @@ xfs_dir2_sf_addname_hard( * the data. 
*/ xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK); - xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK); - /* - * Reset the pointer since the buffer was reallocated. - */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK); + /* * Copy the first part of the directory, including the header. */ @@ -610,11 +601,10 @@ xfs_dir2_sf_addname_pick( int i; /* entry number */ xfs_dir2_data_aoff_t offset; /* data block offset */ xfs_dir2_sf_entry_t *sfep; /* shortform entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int size; /* entry's data size */ int used; /* data bytes used */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; size = xfs_dir2_data_entsize(mp, args->namelen); offset = args->geo->data_first_offset; sfep = xfs_dir2_sf_firstentry(sfp); @@ -673,14 +663,13 @@ xfs_dir2_sf_check( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int i; /* entry number */ int i8count; /* number of big inode#s */ xfs_ino_t ino; /* entry inode number */ int offset; /* data offset */ xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; offset = args->geo->data_first_offset; ino = xfs_dir2_sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; @@ -707,11 +696,10 @@ xfs_dir2_sf_check( /* Verify the consistency of an inline directory. */ xfs_failaddr_t xfs_dir2_sf_verify( - struct xfs_inode *ip) + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *sfp, + int64_t size) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); - struct xfs_dir2_sf_hdr *sfp; struct xfs_dir2_sf_entry *sfep; struct xfs_dir2_sf_entry *next_sfep; char *endp; @@ -719,15 +707,9 @@ xfs_dir2_sf_verify( int i; int i8count; int offset; - int64_t size; int error; uint8_t filetype; - ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - - sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data; - size = ifp->if_bytes; - /* * Give up if the directory is way too short. */ @@ -834,15 +816,13 @@ xfs_dir2_sf_create( ASSERT(dp->i_df.if_bytes == 0); i8count = pino > XFS_DIR2_MAX_SHORT_INUM; size = xfs_dir2_sf_hdr_size(i8count); + /* - * Make a buffer for the data. - */ - xfs_idata_realloc(dp, size, XFS_DATA_FORK); - /* - * Fill in the header, + * Make a buffer for the data and fill in the header. */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = xfs_idata_realloc(dp, size, XFS_DATA_FORK); sfp->i8count = i8count; + /* * Now can put in the inode number, since i8count is set. */ @@ -864,9 +844,9 @@ xfs_dir2_sf_lookup( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int i; /* entry index */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ enum xfs_dacmp cmp; /* comparison result */ xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */ @@ -877,8 +857,7 @@ xfs_dir2_sf_lookup( ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Special case for . 
@@ -940,13 +919,13 @@ xfs_dir2_sf_removename( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int byteoff; /* offset of removed entry */ int entsize; /* this entry's size */ int i; /* shortform entry index */ int newsize; /* new inode size */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ trace_xfs_dir2_sf_removename(args); @@ -954,8 +933,7 @@ xfs_dir2_sf_removename( oldsize = (int)dp->i_disk_size; ASSERT(oldsize >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == oldsize); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Loop over the old directory entries. @@ -992,11 +970,12 @@ xfs_dir2_sf_removename( */ sfp->count--; dp->i_disk_size = newsize; + /* * Reallocate, making it smaller. */ - xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); + /* * Are we changing inode number size? */ @@ -1019,13 +998,12 @@ xfs_dir2_sf_replace_needblock( struct xfs_inode *dp, xfs_ino_t inum) { + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int newsize; - struct xfs_dir2_sf_hdr *sfp; if (dp->i_df.if_format != XFS_DINODE_FMT_LOCAL) return false; - sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; return inum > XFS_DIR2_MAX_SHORT_INUM && @@ -1041,19 +1019,18 @@ xfs_dir2_sf_replace( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int i; /* entry index */ xfs_ino_t ino=0; /* entry old inode number */ int i8elevated; /* sf_toino8 set i8count=1 */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ trace_xfs_dir2_sf_replace(args); ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* @@ -1076,7 +1053,7 @@ xfs_dir2_sf_replace( */ xfs_dir2_sf_toino8(args); i8elevated = 1; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; } else i8elevated = 0; @@ -1157,11 +1134,11 @@ xfs_dir2_sf_toino4( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *oldsfp = dp->i_df.if_data; char *buf; /* old dir's buffer */ int i; /* entry index */ int newsize; /* new inode size */ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ - xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ @@ -1175,7 +1152,6 @@ xfs_dir2_sf_toino4( */ oldsize = dp->i_df.if_bytes; buf = kmem_alloc(oldsize, 0); - oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; ASSERT(oldsfp->i8count == 1); memcpy(buf, oldsfp, oldsize); /* @@ -1188,7 +1164,7 @@ xfs_dir2_sf_toino4( * Reset our pointers, the data has moved. 
*/ oldsfp = (xfs_dir2_sf_hdr_t *)buf; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; /* * Fill in the new header. */ @@ -1230,11 +1206,11 @@ xfs_dir2_sf_toino8( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *oldsfp = dp->i_df.if_data; char *buf; /* old dir's buffer */ int i; /* entry index */ int newsize; /* new inode size */ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ - xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ @@ -1248,7 +1224,6 @@ xfs_dir2_sf_toino8( */ oldsize = dp->i_df.if_bytes; buf = kmem_alloc(oldsize, 0); - oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; ASSERT(oldsfp->i8count == 0); memcpy(buf, oldsfp, oldsize); /* @@ -1261,7 +1236,7 @@ xfs_dir2_sf_toino8( * Reset our pointers, the data has moved. */ oldsfp = (xfs_dir2_sf_hdr_t *)buf; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; /* * Fill in the new header. */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 9a88aba1589f..382ab1e71c0b 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1008,7 +1008,7 @@ enum xfs_dinode_fmt { * Return pointers to the data or attribute forks. */ #define XFS_DFORK_DPTR(dip) \ - ((char *)dip + xfs_dinode_size(dip->di_version)) + ((void *)dip + xfs_dinode_size(dip->di_version)) #define XFS_DFORK_APTR(dip) \ (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) #define XFS_DFORK_PTR(dip,w) \ @@ -1156,20 +1156,6 @@ static inline bool xfs_dinode_has_large_extent_counts( #define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */ #define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ -#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize) -#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask) - -/* - * RT bit manipulation macros. - */ -#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b)) -#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b)) - -#define XFS_RTLOBIT(w) xfs_lowbit32(w) -#define XFS_RTHIBIT(w) xfs_highbit32(w) - -#define XFS_RTBLOCKLOG(b) xfs_highbit64(b) - /* * Dquot and dquot block format definitions */ @@ -1272,6 +1258,9 @@ static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds) #define XFS_DQ_GRACE_MIN ((int64_t)0) #define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX) +/* Maximum id value for a quota record */ +#define XFS_DQ_ID_MAX (U32_MAX) + /* * This is the main portion of the on-disk representation of quota information * for a user. We pad this with some more expansion room to construct the on diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 99e796256c5d..6296993ff8f3 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -68,6 +68,11 @@ struct xfs_fsop_geom; #define XFS_SICK_INO_SYMLINK (1 << 6) /* symbolic link remote target */ #define XFS_SICK_INO_PARENT (1 << 7) /* parent pointers */ +#define XFS_SICK_INO_BMBTD_ZAPPED (1 << 8) /* data fork erased */ +#define XFS_SICK_INO_BMBTA_ZAPPED (1 << 9) /* attr fork erased */ +#define XFS_SICK_INO_DIR_ZAPPED (1 << 10) /* directory erased */ +#define XFS_SICK_INO_SYMLINK_ZAPPED (1 << 11) /* symlink erased */ + /* Primary evidence of health problems in a given group. 
*/ #define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \ XFS_SICK_FS_UQUOTA | \ @@ -97,6 +102,11 @@ struct xfs_fsop_geom; XFS_SICK_INO_SYMLINK | \ XFS_SICK_INO_PARENT) +#define XFS_SICK_INO_ZAPPED (XFS_SICK_INO_BMBTD_ZAPPED | \ + XFS_SICK_INO_BMBTA_ZAPPED | \ + XFS_SICK_INO_DIR_ZAPPED | \ + XFS_SICK_INO_SYMLINK_ZAPPED) + /* These functions must be provided by the xfs implementation. */ void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask); diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index b83e54c70906..2361a22035b0 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -95,18 +95,28 @@ xfs_inobt_btrec_to_irec( irec->ir_free = be64_to_cpu(rec->inobt.ir_free); } +/* Compute the freecount of an incore inode record. */ +uint8_t +xfs_inobt_rec_freecount( + const struct xfs_inobt_rec_incore *irec) +{ + uint64_t realfree = irec->ir_free; + + if (xfs_inobt_issparse(irec->ir_holemask)) + realfree &= xfs_inobt_irec_to_allocmask(irec); + return hweight64(realfree); +} + /* Simple checks for inode records. */ xfs_failaddr_t xfs_inobt_check_irec( - struct xfs_btree_cur *cur, + struct xfs_perag *pag, const struct xfs_inobt_rec_incore *irec) { - uint64_t realfree; - /* Record has to be properly aligned within the AG. */ - if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino)) + if (!xfs_verify_agino(pag, irec->ir_startino)) return __this_address; - if (!xfs_verify_agino(cur->bc_ag.pag, + if (!xfs_verify_agino(pag, irec->ir_startino + XFS_INODES_PER_CHUNK - 1)) return __this_address; if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT || @@ -115,12 +125,7 @@ xfs_inobt_check_irec( if (irec->ir_freecount > XFS_INODES_PER_CHUNK) return __this_address; - /* if there are no holes, return the first available offset */ - if (!xfs_inobt_issparse(irec->ir_holemask)) - realfree = irec->ir_free; - else - realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec); - if (hweight64(realfree) != irec->ir_freecount) + if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount) return __this_address; return NULL; @@ -164,7 +169,7 @@ xfs_inobt_get_rec( return error; xfs_inobt_btrec_to_irec(mp, rec, irec); - fa = xfs_inobt_check_irec(cur, irec); + fa = xfs_inobt_check_irec(cur->bc_ag.pag, irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, irec); @@ -1854,7 +1859,7 @@ xfs_difree_inode_chunk( return xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); } /* holemask is only 16-bits (fits in an unsigned long) */ @@ -1900,7 +1905,8 @@ xfs_difree_inode_chunk( ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); error = xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, - &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE); + &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, + false); if (error) return error; @@ -2739,7 +2745,7 @@ xfs_ialloc_count_inodes_rec( xfs_failaddr_t fa; xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec); - fa = xfs_inobt_check_irec(cur, &irec); + fa = xfs_inobt_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index fe824bb04a09..f1412183bb44 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -79,6 +79,7 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, */ int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_inobt_rec_incore_t *rec, int *stat); +uint8_t xfs_inobt_rec_freecount(const struct 
xfs_inobt_rec_incore *irec); /* * Inode chunk initialisation routine @@ -93,7 +94,7 @@ union xfs_btree_rec; void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, const union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec); -xfs_failaddr_t xfs_inobt_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_inobt_check_irec(struct xfs_perag *pag, const struct xfs_inobt_rec_incore *irec); int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 9258f01c0015..42a5e1f227a0 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -161,7 +161,7 @@ __xfs_inobt_free_block( xfs_inobt_mod_blockcount(cur, -1); fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_INOBT, resv); + &XFS_RMAP_OINFO_INOBT, resv, false); } STATIC int diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 773cf4349428..f4e6b200cdf8 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -158,7 +158,7 @@ static void * xfs_iext_find_first_leaf( struct xfs_ifork *ifp) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height; if (!ifp->if_height) @@ -176,7 +176,7 @@ static void * xfs_iext_find_last_leaf( struct xfs_ifork *ifp) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height, i; if (!ifp->if_height) @@ -306,7 +306,7 @@ xfs_iext_find_level( xfs_fileoff_t offset, int level) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height, i; if (!ifp->if_height) @@ -402,12 +402,12 @@ xfs_iext_grow( int i; if (ifp->if_height == 1) { - struct xfs_iext_leaf *prev = ifp->if_u1.if_root; + struct xfs_iext_leaf *prev = ifp->if_data; node->keys[0] = xfs_iext_leaf_key(prev, 0); node->ptrs[0] = prev; } else { - struct xfs_iext_node *prev = ifp->if_u1.if_root; + struct xfs_iext_node *prev = ifp->if_data; ASSERT(ifp->if_height > 1); @@ -418,7 +418,7 @@ xfs_iext_grow( for (i = 1; i < KEYS_PER_NODE; i++) node->keys[i] = XFS_IEXT_KEY_INVALID; - ifp->if_u1.if_root = node; + ifp->if_data = node; ifp->if_height++; } @@ -430,7 +430,7 @@ xfs_iext_update_node( int level, void *ptr) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height, i; for (height = ifp->if_height; height > level; height--) { @@ -583,11 +583,11 @@ xfs_iext_alloc_root( { ASSERT(ifp->if_bytes == 0); - ifp->if_u1.if_root = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS); + ifp->if_data = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS); ifp->if_height = 1; /* now that we have a node step into it */ - cur->leaf = ifp->if_u1.if_root; + cur->leaf = ifp->if_data; cur->pos = 0; } @@ -603,9 +603,9 @@ xfs_iext_realloc_root( if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF) new_size = NODE_SIZE; - new = krealloc(ifp->if_u1.if_root, new_size, GFP_NOFS | __GFP_NOFAIL); + new = krealloc(ifp->if_data, new_size, GFP_NOFS | __GFP_NOFAIL); memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes); - ifp->if_u1.if_root = new; + ifp->if_data = new; cur->leaf = new; } @@ -622,13 +622,11 @@ static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp) } void -xfs_iext_insert( - struct xfs_inode *ip, +xfs_iext_insert_raw( + struct xfs_ifork *ifp, struct xfs_iext_cursor *cur, - struct xfs_bmbt_irec *irec, 
- int state) + struct xfs_bmbt_irec *irec) { - struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); xfs_fileoff_t offset = irec->br_startoff; struct xfs_iext_leaf *new = NULL; int nr_entries, i; @@ -662,12 +660,23 @@ xfs_iext_insert( xfs_iext_set(cur_rec(cur), irec); ifp->if_bytes += sizeof(struct xfs_iext_rec); - trace_xfs_iext_insert(ip, cur, state, _RET_IP_); - if (new) xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2); } +void +xfs_iext_insert( + struct xfs_inode *ip, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *irec, + int state) +{ + struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); + + xfs_iext_insert_raw(ifp, cur, irec); + trace_xfs_iext_insert(ip, cur, state, _RET_IP_); +} + static struct xfs_iext_node * xfs_iext_rebalance_node( struct xfs_iext_node *parent, @@ -777,8 +786,8 @@ again: * If we are at the root and only one entry is left we can just * free this node and update the root pointer. */ - ASSERT(node == ifp->if_u1.if_root); - ifp->if_u1.if_root = node->ptrs[0]; + ASSERT(node == ifp->if_data); + ifp->if_data = node->ptrs[0]; ifp->if_height--; kmem_free(node); } @@ -854,8 +863,8 @@ xfs_iext_free_last_leaf( struct xfs_ifork *ifp) { ifp->if_height--; - kmem_free(ifp->if_u1.if_root); - ifp->if_u1.if_root = NULL; + kmem_free(ifp->if_data); + ifp->if_data = NULL; } void @@ -872,7 +881,7 @@ xfs_iext_remove( trace_xfs_iext_remove(ip, cur, state, _RET_IP_); ASSERT(ifp->if_height > 0); - ASSERT(ifp->if_u1.if_root != NULL); + ASSERT(ifp->if_data != NULL); ASSERT(xfs_iext_valid(ifp, cur)); xfs_iext_inc_seq(ifp); @@ -1042,9 +1051,9 @@ void xfs_iext_destroy( struct xfs_ifork *ifp) { - xfs_iext_destroy_node(ifp->if_u1.if_root, ifp->if_height); + xfs_iext_destroy_node(ifp->if_data, ifp->if_height); ifp->if_bytes = 0; ifp->if_height = 0; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 5a2e7ddfa76d..f4569e18a8d0 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -50,12 +50,15 @@ xfs_init_local_fork( mem_size++; if (size) { - ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS); - memcpy(ifp->if_u1.if_data, data, size); + char *new_data = kmem_alloc(mem_size, KM_NOFS); + + memcpy(new_data, data, size); if (zero_terminate) - ifp->if_u1.if_data[size] = '\0'; + new_data[size] = '\0'; + + ifp->if_data = new_data; } else { - ifp->if_u1.if_data = NULL; + ifp->if_data = NULL; } ifp->if_bytes = size; @@ -125,7 +128,7 @@ xfs_iformat_extents( } ifp->if_bytes = 0; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; if (size) { dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); @@ -212,7 +215,7 @@ xfs_iformat_btree( ifp->if_broot, size); ifp->if_bytes = 0; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; return 0; } @@ -276,10 +279,9 @@ static uint16_t xfs_dfork_attr_shortform_size( struct xfs_dinode *dip) { - struct xfs_attr_shortform *atp = - (struct xfs_attr_shortform *)XFS_DFORK_APTR(dip); + struct xfs_attr_sf_hdr *sf = XFS_DFORK_APTR(dip); - return be16_to_cpu(atp->hdr.totsize); + return be16_to_cpu(sf->totsize); } void @@ -493,7 +495,7 @@ xfs_iroot_realloc( * byte_diff -- the change in the number of bytes, positive or negative, * requested for the if_data array. 
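 *
 * The helper now returns the (possibly moved) if_data pointer, so callers can
 * do, roughly:
 *
 *	sfp = xfs_idata_realloc(dp, size_change, XFS_DATA_FORK);
 *
 * instead of re-reading the fork pointer after every resize; size_change is a
 * placeholder for the signed byte delta.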
*/ -void +void * xfs_idata_realloc( struct xfs_inode *ip, int64_t byte_diff, @@ -505,21 +507,18 @@ xfs_idata_realloc( ASSERT(new_size >= 0); ASSERT(new_size <= xfs_inode_fork_size(ip, whichfork)); - if (byte_diff == 0) - return; - - if (new_size == 0) { - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = NULL; - ifp->if_bytes = 0; - return; + if (byte_diff) { + ifp->if_data = krealloc(ifp->if_data, new_size, + GFP_NOFS | __GFP_NOFAIL); + if (new_size == 0) + ifp->if_data = NULL; + ifp->if_bytes = new_size; } - ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size, - GFP_NOFS | __GFP_NOFAIL); - ifp->if_bytes = new_size; + return ifp->if_data; } +/* Free all memory and reset a fork back to its initial state. */ void xfs_idestroy_fork( struct xfs_ifork *ifp) @@ -531,8 +530,8 @@ xfs_idestroy_fork( switch (ifp->if_format) { case XFS_DINODE_FMT_LOCAL: - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = NULL; + kmem_free(ifp->if_data); + ifp->if_data = NULL; break; case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: @@ -625,9 +624,9 @@ xfs_iflush_fork( case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { - ASSERT(ifp->if_u1.if_data != NULL); + ASSERT(ifp->if_data != NULL); ASSERT(ifp->if_bytes <= xfs_inode_fork_size(ip, whichfork)); - memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); + memcpy(cp, ifp->if_data, ifp->if_bytes); } break; @@ -702,19 +701,27 @@ xfs_ifork_verify_local_data( xfs_failaddr_t fa = NULL; switch (VFS_I(ip)->i_mode & S_IFMT) { - case S_IFDIR: - fa = xfs_dir2_sf_verify(ip); + case S_IFDIR: { + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_dir2_sf_hdr *sfp = ifp->if_data; + + fa = xfs_dir2_sf_verify(mp, sfp, ifp->if_bytes); break; - case S_IFLNK: - fa = xfs_symlink_shortform_verify(ip); + } + case S_IFLNK: { + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + + fa = xfs_symlink_shortform_verify(ifp->if_data, ifp->if_bytes); break; + } default: break; } if (fa) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork", - ip->i_df.if_u1.if_data, ip->i_df.if_bytes, fa); + ip->i_df.if_data, ip->i_df.if_bytes, fa); return -EFSCORRUPTED; } @@ -729,14 +736,17 @@ xfs_ifork_verify_local_attr( struct xfs_ifork *ifp = &ip->i_af; xfs_failaddr_t fa; - if (!xfs_inode_has_attr_fork(ip)) + if (!xfs_inode_has_attr_fork(ip)) { fa = __this_address; - else - fa = xfs_attr_shortform_verify(ip); + } else { + struct xfs_ifork *ifp = &ip->i_af; + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); + fa = xfs_attr_shortform_verify(ifp->if_data, ifp->if_bytes); + } if (fa) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", - ifp->if_u1.if_data, ifp->if_bytes, fa); + ifp->if_data, ifp->if_bytes, fa); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 96d307784c85..96303249d28a 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -13,14 +13,12 @@ struct xfs_dinode; * File incore extent information, present for each of data & attr forks. 
*/ struct xfs_ifork { - int64_t if_bytes; /* bytes in if_u1 */ + int64_t if_bytes; /* bytes in if_data */ struct xfs_btree_block *if_broot; /* file's incore btree root */ unsigned int if_seq; /* fork mod counter */ int if_height; /* height of the extent tree */ - union { - void *if_root; /* extent tree root */ - char *if_data; /* inline file data */ - } if_u1; + void *if_data; /* extent tree root or + inline data */ xfs_extnum_t if_nextents; /* # of extents in this fork */ short if_broot_bytes; /* bytes allocated for root */ int8_t if_format; /* format of this fork */ @@ -170,7 +168,7 @@ int xfs_iformat_attr_fork(struct xfs_inode *, struct xfs_dinode *); void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); void xfs_idestroy_fork(struct xfs_ifork *ifp); -void xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, +void * xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, int whichfork); void xfs_iroot_realloc(struct xfs_inode *, int, int); int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); @@ -180,6 +178,9 @@ void xfs_init_local_fork(struct xfs_inode *ip, int whichfork, const void *data, int64_t size); xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp); +void xfs_iext_insert_raw(struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *irec); void xfs_iext_insert(struct xfs_inode *, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *, int); void xfs_iext_remove(struct xfs_inode *, struct xfs_iext_cursor *, diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index a5100a11faf9..9fe7a9564bca 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -11,6 +11,7 @@ * define how recovery should work for that type of log item. */ struct xlog_recover_item; +struct xfs_defer_op_type; /* Sorting hat for log items as they're read in. 
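With the if_u1 union gone, every fork format stores its payload behind the same if_data pointer and if_format decides how to interpret it. A minimal sketch of the resulting access pattern for a local-format data fork, assuming the usual xfs headers are already included; the helper name is illustrative only and not part of this patch.

/* Illustrative helper: fetch the shortform directory header from if_data. */
static struct xfs_dir2_sf_hdr *
example_dir_sf_hdr(
	struct xfs_inode	*ip)
{
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);

	/*
	 * For XFS_DINODE_FMT_LOCAL forks if_data is the inline payload; for
	 * EXTENTS/BTREE forks the same pointer is the incore extent tree
	 * root, so check if_format before interpreting it.
	 */
	ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
	return ifp->if_data;
}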
*/ enum xlog_recover_reorder { @@ -153,4 +154,11 @@ xlog_recover_resv(const struct xfs_trans_res *r) return ret; } +struct xfs_defer_pending; + +void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip, + xfs_lsn_t lsn, const struct xfs_defer_op_type *ops); +int xlog_recover_finish_intent(struct xfs_trans *tp, + struct xfs_defer_pending *dfp); + #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 21a7e350b4c5..81885a6a028e 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -7,16 +7,16 @@ #define __XFS_ONDISK_H #define XFS_CHECK_STRUCT_SIZE(structname, size) \ - BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \ - #structname ") is wrong, expected " #size) + static_assert(sizeof(structname) == (size), \ + "XFS: sizeof(" #structname ") is wrong, expected " #size) #define XFS_CHECK_OFFSET(structname, member, off) \ - BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \ + static_assert(offsetof(structname, member) == (off), \ "XFS: offsetof(" #structname ", " #member ") is wrong, " \ "expected " #off) #define XFS_CHECK_VALUE(value, expected) \ - BUILD_BUG_ON_MSG((value) != (expected), \ + static_assert((value) == (expected), \ "XFS: value of " #value " is wrong, expected " #expected) static inline void __init @@ -93,13 +93,13 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32); - XFS_CHECK_STRUCT_SIZE(struct xfs_attr_shortform, 4); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].valuelen, 5); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].flags, 6); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].nameval, 7); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_hdr, 4); + XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, totsize, 0); + XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, count, 2); + XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, namelen, 0); + XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, valuelen, 1); + XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, flags, 2); + XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, nameval, 3); XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 646b3fa362ad..6709a7f8bad5 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -123,11 +123,9 @@ xfs_refcount_btrec_to_irec( /* Simple checks for refcount records. 
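The switch from BUILD_BUG_ON_MSG to static_assert writes each condition in its positive form and, because static_assert works at file scope, such layout checks no longer have to live inside a function body. A freestanding userspace sketch of the same idiom, using a made-up structure rather than an XFS one:

#include <assert.h>	/* static_assert */
#include <stddef.h>	/* offsetof */
#include <stdint.h>

struct example_sf_hdr {
	uint16_t	totsize;	/* total bytes used by the shortform area */
	uint8_t		count;		/* number of entries */
	uint8_t		pad;
};

/* File-scope checks: the build fails with the message if the layout drifts. */
static_assert(sizeof(struct example_sf_hdr) == 4,
	      "sizeof(struct example_sf_hdr) is wrong, expected 4");
static_assert(offsetof(struct example_sf_hdr, count) == 2,
	      "offsetof(struct example_sf_hdr, count) is wrong, expected 2");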
*/ xfs_failaddr_t xfs_refcount_check_irec( - struct xfs_btree_cur *cur, + struct xfs_perag *pag, const struct xfs_refcount_irec *irec) { - struct xfs_perag *pag = cur->bc_ag.pag; - if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) return __this_address; @@ -179,7 +177,7 @@ xfs_refcount_get_rec( return error; xfs_refcount_btrec_to_irec(rec, irec); - fa = xfs_refcount_check_irec(cur, irec); + fa = xfs_refcount_check_irec(cur->bc_ag.pag, irec); if (fa) return xfs_refcount_complain_bad_rec(cur, fa, irec); @@ -1153,7 +1151,7 @@ xfs_refcount_adjust_extents( tmp.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, tmp.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_error; } @@ -1215,7 +1213,7 @@ xfs_refcount_adjust_extents( ext.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, ext.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_error; } @@ -1458,7 +1456,7 @@ __xfs_refcount_add( ri->ri_blockcount = blockcount; xfs_refcount_update_get_group(tp->t_mountp, ri); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); + xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type); } /* @@ -1899,7 +1897,7 @@ xfs_refcount_recover_extent( INIT_LIST_HEAD(&rr->rr_list); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - if (xfs_refcount_check_irec(cur, &rr->rr_rrec) != NULL || + if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL || XFS_IS_CORRUPT(cur->bc_mp, rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { kfree(rr); @@ -1985,7 +1983,7 @@ xfs_refcount_recover_cow_leftovers( /* Free the block. */ error = xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_trans; @@ -2033,6 +2031,47 @@ xfs_refcount_has_records( return xfs_btree_has_records(cur, &low, &high, NULL, outcome); } +struct xfs_refcount_query_range_info { + xfs_refcount_query_range_fn fn; + void *priv; +}; + +/* Format btree record and pass to our callback. */ +STATIC int +xfs_refcount_query_range_helper( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_refcount_query_range_info *query = priv; + struct xfs_refcount_irec irec; + xfs_failaddr_t fa; + + xfs_refcount_btrec_to_irec(rec, &irec); + fa = xfs_refcount_check_irec(cur->bc_ag.pag, &irec); + if (fa) + return xfs_refcount_complain_bad_rec(cur, fa, &irec); + + return query->fn(cur, &irec, query->priv); +} + +/* Find all refcount records between two keys. 
*/ +int +xfs_refcount_query_range( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *low_rec, + const struct xfs_refcount_irec *high_rec, + xfs_refcount_query_range_fn fn, + void *priv) +{ + union xfs_btree_irec low_brec = { .rc = *low_rec }; + union xfs_btree_irec high_brec = { .rc = *high_rec }; + struct xfs_refcount_query_range_info query = { .priv = priv, .fn = fn }; + + return xfs_btree_query_range(cur, &low_brec, &high_brec, + xfs_refcount_query_range_helper, &query); +} + int __init xfs_refcount_intent_init_cache(void) { diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 783cd89ca195..9b56768a590c 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -117,7 +117,7 @@ extern int xfs_refcount_has_records(struct xfs_btree_cur *cur, union xfs_btree_rec; extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); -xfs_failaddr_t xfs_refcount_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag, const struct xfs_refcount_irec *irec); extern int xfs_refcount_insert(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); @@ -127,4 +127,14 @@ extern struct kmem_cache *xfs_refcount_intent_cache; int __init xfs_refcount_intent_init_cache(void); void xfs_refcount_intent_destroy_cache(void); +typedef int (*xfs_refcount_query_range_fn)( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv); + +int xfs_refcount_query_range(struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *low_rec, + const struct xfs_refcount_irec *high_rec, + xfs_refcount_query_range_fn fn, void *priv); + #endif /* __XFS_REFCOUNT_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 5c3987d8dc24..0d80bd99147c 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -112,7 +112,7 @@ xfs_refcountbt_free_block( be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, false); } STATIC int @@ -226,7 +226,18 @@ xfs_refcountbt_verify( level = be16_to_cpu(block->bb_level); if (pag && xfs_perag_initialised_agf(pag)) { - if (level >= pag->pagf_refcount_level) + unsigned int maxlevel = pag->pagf_refcount_level; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Online repair could be rewriting the refcount btree, so + * we'll validate against the larger of either tree while this + * is going on. + */ + maxlevel = max_t(unsigned int, maxlevel, + pag->pagf_repair_refcount_level); +#endif + if (level >= maxlevel) return __this_address; } else if (level >= mp->m_refc_maxlevels) return __this_address; diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index fbb0b2637463..76bf7f48cb5a 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2567,7 +2567,7 @@ __xfs_rmap_add( ri->ri_bmap = *bmap; xfs_rmap_update_get_group(tp->t_mountp, ri); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); + xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type); } /* Map an extent into a file. 
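A sketch of how a caller might drive the new xfs_refcount_query_range() helper, counting the CoW staging blocks recorded between two AG block numbers. Both function names are illustrative, the construction of the low and high keys (same domain, differing start blocks) is an assumption about how a real caller would bound the walk, and error handling is abbreviated.

/* Illustrative callback: accumulate the size of each CoW staging record. */
static int
example_count_cow_blocks(
	struct xfs_btree_cur		*cur,
	const struct xfs_refcount_irec	*rec,
	void				*priv)
{
	unsigned long long		*blocks = priv;

	if (rec->rc_domain == XFS_REFC_DOMAIN_COW)
		*blocks += rec->rc_blockcount;
	return 0;
}

/* Illustrative caller: walk the CoW domain between @start and @end. */
static int
example_query_cow(
	struct xfs_btree_cur	*refc_cur,
	xfs_agblock_t		start,
	xfs_agblock_t		end,
	unsigned long long	*blocks)
{
	struct xfs_refcount_irec	low = {
		.rc_domain	= XFS_REFC_DOMAIN_COW,
		.rc_startblock	= start,
	};
	struct xfs_refcount_irec	high = {
		.rc_domain	= XFS_REFC_DOMAIN_COW,
		.rc_startblock	= end,
	};

	*blocks = 0;
	return xfs_refcount_query_range(refc_cur, &low, &high,
			example_count_cow_blocks, blocks);
}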
*/ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index c269d704314d..31100120b2c5 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -184,7 +184,7 @@ xfs_rtfind_back( * Calculate first (leftmost) bit number to look at, * and mask for all the relevant bits in this word. */ - firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0); + firstbit = max_t(xfs_srtblock_t, bit - len + 1, 0); mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) << firstbit; /* @@ -195,7 +195,7 @@ xfs_rtfind_back( /* * Different. Mark where we are and return. */ - i = bit - XFS_RTHIBIT(wdiff); + i = bit - xfs_highbit32(wdiff); *rtx = start - i + 1; return 0; } @@ -233,7 +233,7 @@ xfs_rtfind_back( /* * Different, mark where we are and return. */ - i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + i += XFS_NBWORD - 1 - xfs_highbit32(wdiff); *rtx = start - i + 1; return 0; } @@ -272,7 +272,7 @@ xfs_rtfind_back( /* * Different, mark where we are and return. */ - i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + i += XFS_NBWORD - 1 - xfs_highbit32(wdiff); *rtx = start - i + 1; return 0; } else @@ -338,7 +338,7 @@ xfs_rtfind_forw( * Calculate last (rightmost) bit number to look at, * and mask for all the relevant bits in this word. */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + lastbit = min(bit + len, XFS_NBWORD); mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; /* * Calculate the difference between the value there @@ -348,7 +348,7 @@ xfs_rtfind_forw( /* * Different. Mark where we are and return. */ - i = XFS_RTLOBIT(wdiff) - bit; + i = xfs_lowbit32(wdiff) - bit; *rtx = start + i - 1; return 0; } @@ -386,7 +386,7 @@ xfs_rtfind_forw( /* * Different, mark where we are and return. */ - i += XFS_RTLOBIT(wdiff); + i += xfs_lowbit32(wdiff); *rtx = start + i - 1; return 0; } @@ -423,7 +423,7 @@ xfs_rtfind_forw( /* * Different, mark where we are and return. */ - i += XFS_RTLOBIT(wdiff); + i += xfs_lowbit32(wdiff); *rtx = start + i - 1; return 0; } else @@ -452,71 +452,59 @@ xfs_trans_log_rtsummary( } /* - * Read and/or modify the summary information for a given extent size, - * bitmap block combination. - * Keeps track of a current summary block, so we don't keep reading - * it from the buffer cache. - * - * Summary information is returned in *sum if specified. - * If no delta is specified, returns summary only. + * Modify the summary information for a given extent size, bitmap block + * combination. */ int -xfs_rtmodify_summary_int( +xfs_rtmodify_summary( struct xfs_rtalloc_args *args, int log, /* log2 of extent size */ xfs_fileoff_t bbno, /* bitmap block number */ - int delta, /* change to make to summary info */ - xfs_suminfo_t *sum) /* out: summary info for this block */ + int delta) /* in/out: summary block number */ { struct xfs_mount *mp = args->mp; - int error; - xfs_fileoff_t sb; /* summary fsblock */ - xfs_rtsumoff_t so; /* index into the summary file */ + xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno); unsigned int infoword; + xfs_suminfo_t val; + int error; - /* - * Compute entry number in the summary file. - */ - so = xfs_rtsumoffs(mp, log, bbno); - /* - * Compute the block number in the summary file. - */ - sb = xfs_rtsumoffs_to_block(mp, so); - - error = xfs_rtsummary_read_buf(args, sb); + error = xfs_rtsummary_read_buf(args, xfs_rtsumoffs_to_block(mp, so)); if (error) return error; - /* - * Point to the summary information, modify/log it, and/or copy it out. 
- */ infoword = xfs_rtsumoffs_to_infoword(mp, so); - if (delta) { - xfs_suminfo_t val = xfs_suminfo_add(args, infoword, delta); - - if (mp->m_rsum_cache) { - if (val == 0 && log + 1 == mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log; - if (val != 0 && log >= mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log + 1; - } - xfs_trans_log_rtsummary(args, infoword); - if (sum) - *sum = val; - } else if (sum) { - *sum = xfs_suminfo_get(args, infoword); + val = xfs_suminfo_add(args, infoword, delta); + + if (mp->m_rsum_cache) { + if (val == 0 && log + 1 == mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log; + if (val != 0 && log >= mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log + 1; } + + xfs_trans_log_rtsummary(args, infoword); return 0; } +/* + * Read and return the summary information for a given extent size, bitmap block + * combination. + */ int -xfs_rtmodify_summary( +xfs_rtget_summary( struct xfs_rtalloc_args *args, int log, /* log2 of extent size */ xfs_fileoff_t bbno, /* bitmap block number */ - int delta) /* in/out: summary block number */ + xfs_suminfo_t *sum) /* out: summary info for this block */ { - return xfs_rtmodify_summary_int(args, log, bbno, delta, NULL); + struct xfs_mount *mp = args->mp; + xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno); + int error; + + error = xfs_rtsummary_read_buf(args, xfs_rtsumoffs_to_block(mp, so)); + if (!error) + *sum = xfs_suminfo_get(args, xfs_rtsumoffs_to_infoword(mp, so)); + return error; } /* Log rtbitmap block from the word @from to the byte before @next. */ @@ -585,7 +573,7 @@ xfs_rtmodify_range( /* * Compute first bit not changed and mask of relevant bits. */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + lastbit = min(bit + len, XFS_NBWORD); mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; /* * Set/clear the active bits. @@ -720,7 +708,7 @@ xfs_rtfree_range( */ if (preblock < start) { error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(start - preblock), + xfs_highbit64(start - preblock), xfs_rtx_to_rbmblock(mp, preblock), -1); if (error) { return error; @@ -732,7 +720,7 @@ xfs_rtfree_range( */ if (postblock > end) { error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(postblock - end), + xfs_highbit64(postblock - end), xfs_rtx_to_rbmblock(mp, end + 1), -1); if (error) { return error; @@ -743,7 +731,7 @@ xfs_rtfree_range( * (new) free extent. */ return xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(postblock + 1 - preblock), + xfs_highbit64(postblock + 1 - preblock), xfs_rtx_to_rbmblock(mp, preblock), 1); } @@ -799,7 +787,7 @@ xfs_rtcheck_range( /* * Compute first bit not examined. */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + lastbit = min(bit + len, XFS_NBWORD); /* * Mask of relevant bits. */ @@ -812,7 +800,7 @@ xfs_rtcheck_range( /* * Different, compute first wrong bit and return. */ - i = XFS_RTLOBIT(wdiff) - bit; + i = xfs_lowbit32(wdiff) - bit; *new = start + i; *stat = 0; return 0; @@ -851,7 +839,7 @@ xfs_rtcheck_range( /* * Different, compute first wrong bit and return. */ - i += XFS_RTLOBIT(wdiff); + i += xfs_lowbit32(wdiff); *new = start + i; *stat = 0; return 0; @@ -889,7 +877,7 @@ xfs_rtcheck_range( /* * Different, compute first wrong bit and return. */ - i += XFS_RTLOBIT(wdiff); + i += xfs_lowbit32(wdiff); *new = start + i; *stat = 0; return 0; @@ -1131,6 +1119,20 @@ xfs_rtbitmap_blockcount( } /* + * Compute the maximum level number of the realtime summary file, as defined by + * mkfs. 
The historic use of highbit32 on a 64-bit quantity prohibited correct + * use of rt volumes with more than 2^32 extents. + */ +uint8_t +xfs_compute_rextslog( + xfs_rtbxlen_t rtextents) +{ + if (!rtextents) + return 0; + return xfs_highbit64(rtextents); +} + +/* * Compute the number of rtbitmap words needed to populate every block of a * bitmap that is large enough to track the given number of rt extents. */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index c0637057d69c..274dc7dae1fa 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -321,8 +321,8 @@ int xfs_rtfind_forw(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock); int xfs_rtmodify_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxlen_t len, int val); -int xfs_rtmodify_summary_int(struct xfs_rtalloc_args *args, int log, - xfs_fileoff_t bbno, int delta, xfs_suminfo_t *sum); +int xfs_rtget_summary(struct xfs_rtalloc_args *args, int log, + xfs_fileoff_t bbno, xfs_suminfo_t *sum); int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log, xfs_fileoff_t bbno, int delta); int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, @@ -351,6 +351,20 @@ xfs_rtfree_extent( int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno, xfs_filblks_t rtlen); +uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); + +/* Do we support an rt volume having this number of rtextents? */ +static inline bool +xfs_validate_rtextents( + xfs_rtbxlen_t rtextents) +{ + /* No runt rt volumes */ + if (rtextents == 0) + return false; + + return true; +} + xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents); unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp, @@ -369,6 +383,8 @@ unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp, # define xfs_rtsummary_read_buf(a,b) (-ENOSYS) # define xfs_rtbuf_cache_relse(a) (0) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) +# define xfs_compute_rextslog(rtx) (0) +# define xfs_validate_rtextents(rtx) (false) static inline xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 1f74d0cd1618..4a9e8588f4c9 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -25,6 +25,7 @@ #include "xfs_da_format.h" #include "xfs_health.h" #include "xfs_ag.h" +#include "xfs_rtbitmap.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. 
@@ -508,8 +509,9 @@ xfs_validate_sb_common( rbmblocks = howmany_64(sbp->sb_rextents, NBBY * sbp->sb_blocksize); - if (sbp->sb_rextents != rexts || - sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) || + if (!xfs_validate_rtextents(rexts) || + sbp->sb_rextents != rexts || + sbp->sb_rextslog != xfs_compute_rextslog(rexts) || sbp->sb_rbmblocks != rbmblocks) { xfs_notice(mp, "realtime geometry sanity check failed"); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index c4381388c0c1..4220d3584c1b 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -139,7 +139,7 @@ bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, uint32_t size, struct xfs_buf *bp); void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp); -xfs_failaddr_t xfs_symlink_shortform_verify(struct xfs_inode *ip); +xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size); /* Computed inode geometry for the filesystem. */ struct xfs_ino_geometry { diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index bdc777b9ec4a..160aa20aa441 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -175,7 +175,7 @@ xfs_symlink_local_to_remote( if (!xfs_has_crc(mp)) { bp->b_ops = NULL; - memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + memcpy(bp->b_addr, ifp->if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); return; } @@ -191,7 +191,7 @@ xfs_symlink_local_to_remote( buf = bp->b_addr; buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp); - memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes); + memcpy(buf, ifp->if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) + ifp->if_bytes - 1); } @@ -202,15 +202,11 @@ xfs_symlink_local_to_remote( */ xfs_failaddr_t xfs_symlink_shortform_verify( - struct xfs_inode *ip) + void *sfp, + int64_t size) { - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); - char *sfp = (char *)ifp->if_u1.if_data; - int size = ifp->if_bytes; char *endp = sfp + size; - ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - /* * Zero length symlinks should never occur in memory as they are * never allowed to exist on disk. diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 533200c4ccc2..20b5375f2d9c 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -51,7 +51,6 @@ typedef void * xfs_failaddr_t; #define NULLRFSBLOCK ((xfs_rfsblock_t)-1) #define NULLRTBLOCK ((xfs_rtblock_t)-1) #define NULLFILEOFF ((xfs_fileoff_t)-1) -#define NULLRTEXTNO ((xfs_rtxnum_t)-1) #define NULLAGBLOCK ((xfs_agblock_t)-1) #define NULLAGNUMBER ((xfs_agnumber_t)-1) @@ -208,6 +207,13 @@ enum xfs_ag_resv_type { XFS_AG_RESV_AGFL, XFS_AG_RESV_METADATA, XFS_AG_RESV_RMAPBT, + + /* + * Don't increase fdblocks when freeing extent. This is a pony for + * the bnobt repair functions to re-free the free space without + * altering fdblocks. If you think you need this you're wrong. + */ + XFS_AG_RESV_IGNORE, }; /* Results of scanning a btree keyspace to check occupancy. */ diff --git a/fs/xfs/scrub/agb_bitmap.c b/fs/xfs/scrub/agb_bitmap.c new file mode 100644 index 000000000000..573e4e062754 --- /dev/null +++ b/fs/xfs/scrub/agb_bitmap.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "bitmap.h" +#include "scrub/agb_bitmap.h" + +/* + * Record all btree blocks seen while iterating all records of a btree. + * + * We know that the btree query_all function starts at the left edge and walks + * towards the right edge of the tree. Therefore, we know that we can walk up + * the btree cursor towards the root; if the pointer for a given level points + * to the first record/key in that block, we haven't seen this block before; + * and therefore we need to remember that we saw this block in the btree. + * + * So if our btree is: + * + * 4 + * / | \ + * 1 2 3 + * + * Pretend for this example that each leaf block has 100 btree records. For + * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we + * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so + * we record block 4. The list is [1, 4]. + * + * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit + * the loop. The list remains [1, 4]. + * + * For the 101st btree record, we've moved onto leaf block 2. Now + * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that + * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2]. + * + * For the 102nd record, bc_levels[0].ptr == 2, so we continue. + * + * For the 201st record, we've moved on to leaf block 3. + * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3]. + * + * For the 300th record we just exit, with the list being [1, 4, 2, 3]. + */ + +/* Mark a btree block to the agblock bitmap. */ +STATIC int +xagb_bitmap_visit_btblock( + struct xfs_btree_cur *cur, + int level, + void *priv) +{ + struct xagb_bitmap *bitmap = priv; + struct xfs_buf *bp; + xfs_fsblock_t fsbno; + xfs_agblock_t agbno; + + xfs_btree_get_block(cur, level, &bp); + if (!bp) + return 0; + + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); + agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + + return xagb_bitmap_set(bitmap, agbno, 1); +} + +/* Mark all (per-AG) btree blocks in the agblock bitmap. */ +int +xagb_bitmap_set_btblocks( + struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur) +{ + return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock, + XFS_BTREE_VISIT_ALL, bitmap); +} + +/* + * Record all the buffers pointed to by the btree cursor. Callers already + * engaged in a btree walk should call this function to capture the list of + * blocks going from the leaf towards the root. + */ +int +xagb_bitmap_set_btcur_path( + struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur) +{ + int i; + int error; + + for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) { + error = xagb_bitmap_visit_btblock(cur, i, bitmap); + if (error) + return error; + } + + return 0; +} diff --git a/fs/xfs/scrub/agb_bitmap.h b/fs/xfs/scrub/agb_bitmap.h new file mode 100644 index 000000000000..ed08f76ff4f3 --- /dev/null +++ b/fs/xfs/scrub/agb_bitmap.h @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_AGB_BITMAP_H__ +#define __XFS_SCRUB_AGB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_agblock_t */ + +struct xagb_bitmap { + struct xbitmap32 agbitmap; +}; + +static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap) +{ + xbitmap32_init(&bitmap->agbitmap); +} + +static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap) +{ + xbitmap32_destroy(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap32_clear(&bitmap->agbitmap, start, len); +} +static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap32_set(&bitmap->agbitmap, start, len); +} + +static inline bool xagb_bitmap_test(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t *len) +{ + return xbitmap32_test(&bitmap->agbitmap, start, len); +} + +static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap, + struct xagb_bitmap *sub) +{ + return xbitmap32_disunion(&bitmap->agbitmap, &sub->agbitmap); +} + +static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap) +{ + return xbitmap32_hweight(&bitmap->agbitmap); +} +static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap) +{ + return xbitmap32_empty(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, + xbitmap32_walk_fn fn, void *priv) +{ + return xbitmap32_walk(&bitmap->agbitmap, fn, priv); +} + +int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); +int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); + +#endif /* __XFS_SCRUB_AGB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 876a2f41b063..26bd1ff68f1b 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -26,6 +26,7 @@ #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" #include "scrub/reap.h" /* Superblock */ @@ -72,7 +73,7 @@ xrep_superblock( /* Write this to disk. */ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); - return error; + return 0; } /* AGF */ @@ -341,7 +342,7 @@ xrep_agf_commit_new( pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); - return 0; + return xrep_roll_ag_trans(sc); } /* Repair the AGF. v5 filesystems only. */ @@ -494,12 +495,11 @@ xrep_agfl_walk_rmap( /* Strike out the blocks that are cross-linked according to the rmapbt. */ STATIC int xrep_agfl_check_extent( - uint64_t start, - uint64_t len, + uint32_t agbno, + uint32_t len, void *priv) { struct xrep_agfl *ra = priv; - xfs_agblock_t agbno = start; xfs_agblock_t last_agbno = agbno + len - 1; int error; @@ -647,8 +647,8 @@ struct xrep_agfl_fill { /* Fill the AGFL with whatever blocks are in this extent. */ static int xrep_agfl_fill( - uint64_t start, - uint64_t len, + uint32_t start, + uint32_t len, void *priv) { struct xrep_agfl_fill *af = priv; @@ -789,6 +789,9 @@ xrep_agfl( /* Dump any AGFL overflow. 
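A short sketch of the intended xagb_bitmap life cycle using only the wrappers declared above: initialize, record every block of one per-AG btree, read back the population count, and tear down. The function name is illustrative and error handling is abbreviated.

/* Illustrative helper: count how many blocks a per-AG btree occupies. */
static int
example_count_btblocks(
	struct xfs_btree_cur	*cur,
	uint32_t		*nr_blocks)
{
	struct xagb_bitmap	bitmap;
	int			error;

	xagb_bitmap_init(&bitmap);

	/* Mark every block reachable from @cur in the bitmap. */
	error = xagb_bitmap_set_btblocks(&bitmap, cur);
	if (!error)
		*nr_blocks = xagb_bitmap_hweight(&bitmap);

	xagb_bitmap_destroy(&bitmap);
	return error;
}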
*/ error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, XFS_AG_RESV_AGFL); + if (error) + goto err; + err: xagb_bitmap_destroy(&agfl_extents); return error; @@ -962,7 +965,7 @@ xrep_agi_commit_new( pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); - return 0; + return xrep_roll_ag_trans(sc); } /* Repair the AGI. */ diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 279af72b1671..d1b8a4997dd2 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -9,13 +9,16 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_btree.h" #include "xfs_alloc.h" #include "xfs_rmap.h" +#include "xfs_ag.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" -#include "xfs_ag.h" +#include "scrub/repair.h" /* * Set us up to scrub free space btrees. @@ -24,10 +27,19 @@ int xchk_setup_ag_allocbt( struct xfs_scrub *sc) { + int error; + if (xchk_need_intent_drain(sc)) xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); - return xchk_setup_ag_btree(sc, false); + error = xchk_setup_ag_btree(sc, false); + if (error) + return error; + + if (xchk_could_repair(sc)) + return xrep_setup_ag_allocbt(sc); + + return 0; } /* Free space btree scrubber. */ @@ -127,7 +139,7 @@ xchk_allocbt_rec( struct xchk_alloc *ca = bs->private; xfs_alloc_btrec_to_irec(rec, &irec); - if (xfs_alloc_check_irec(bs->cur, &irec) != NULL) { + if (xfs_alloc_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -138,31 +150,27 @@ xchk_allocbt_rec( return 0; } -/* Scrub the freespace btrees for some AG. */ -STATIC int +/* Scrub one of the freespace btrees for some AG. */ +int xchk_allocbt( - struct xfs_scrub *sc, - xfs_btnum_t which) + struct xfs_scrub *sc) { struct xchk_alloc ca = { }; struct xfs_btree_cur *cur; - cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur; - return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca); -} - -int -xchk_bnobt( - struct xfs_scrub *sc) -{ - return xchk_allocbt(sc, XFS_BTNUM_BNO); -} + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_BNOBT: + cur = sc->sa.bno_cur; + break; + case XFS_SCRUB_TYPE_CNTBT: + cur = sc->sa.cnt_cur; + break; + default: + ASSERT(0); + return -EIO; + } -int -xchk_cntbt( - struct xfs_scrub *sc) -{ - return xchk_allocbt(sc, XFS_BTNUM_CNT); + return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca); } /* xref check that the extent is not free */ diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c new file mode 100644 index 000000000000..45edda096869 --- /dev/null +++ b/fs/xfs/scrub/alloc_repair.c @@ -0,0 +1,934 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_inode.h" +#include "xfs_refcount.h" +#include "xfs_extent_busy.h" +#include "xfs_health.h" +#include "xfs_bmap.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Free Space Btree Repair + * ======================= + * + * The reverse mappings are supposed to record all space usage for the entire + * AG. Therefore, we can recreate the free extent records in an AG by looking + * for gaps in the physical extents recorded in the rmapbt. These records are + * staged in @free_records. Identifying the gaps is more difficult on a + * reflink filesystem because rmap records are allowed to overlap. + * + * Because the final step of building a new index is to free the space used by + * the old index, repair needs to find that space. Unfortunately, all + * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share + * the same rmapbt owner code (OWN_AG), so this is not straightforward. + * + * The scan of the reverse mapping information records the space used by OWN_AG + * in @old_allocbt_blocks, which (at this stage) is somewhat misnamed. While + * walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to + * record all visited rmap btree blocks and all blocks owned by the AGFL. + * + * After that is where the definitions of old_allocbt_blocks shifts. This + * expression identifies possible former bnobt/cntbt blocks: + * + * (OWN_AG blocks) & ~(rmapbt blocks | agfl blocks); + * + * Substituting from above definitions, that becomes: + * + * old_allocbt_blocks & ~not_allocbt_blocks + * + * The OWN_AG bitmap itself isn't needed after this point, so what we really do + * instead is: + * + * old_allocbt_blocks &= ~not_allocbt_blocks; + * + * After this point, @old_allocbt_blocks is a bitmap of alleged former + * bnobt/cntbt blocks. The xagb_bitmap_disunion operation modifies its first + * parameter in place to avoid copying records around. + * + * Next, some of the space described by @free_records are diverted to the newbt + * reservation and used to format new btree blocks. The remaining records are + * written to the new btree indices. We reconstruct both bnobt and cntbt at + * the same time since we've already done all the work. + * + * We use the prefix 'xrep_abt' here because we regenerate both free space + * allocation btrees at the same time. + */ + +struct xrep_abt { + /* Blocks owned by the rmapbt or the agfl. */ + struct xagb_bitmap not_allocbt_blocks; + + /* All OWN_AG blocks. */ + struct xagb_bitmap old_allocbt_blocks; + + /* + * New bnobt information. All btree block reservations are added to + * the reservation list in new_bnobt. + */ + struct xrep_newbt new_bnobt; + + /* new cntbt information */ + struct xrep_newbt new_cntbt; + + /* Free space extents. 
*/ + struct xfarray *free_records; + + struct xfs_scrub *sc; + + /* Number of non-null records in @free_records. */ + uint64_t nr_real_records; + + /* get_records()'s position in the free space record array. */ + xfarray_idx_t array_cur; + + /* + * Next block we anticipate seeing in the rmap records. If the next + * rmap record is greater than next_agbno, we have found unused space. + */ + xfs_agblock_t next_agbno; + + /* Number of free blocks in this AG. */ + xfs_agblock_t nr_blocks; + + /* Longest free extent we found in the AG. */ + xfs_agblock_t longest; +}; + +/* Set up to repair AG free space btrees. */ +int +xrep_setup_ag_allocbt( + struct xfs_scrub *sc) +{ + unsigned int busy_gen; + + /* + * Make sure the busy extent list is clear because we can't put extents + * on there twice. + */ + busy_gen = READ_ONCE(sc->sa.pag->pagb_gen); + if (xfs_extent_busy_list_empty(sc->sa.pag)) + return 0; + + return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0); +} + +/* Check for any obvious conflicts in the free extent. */ +STATIC int +xrep_abt_check_free_ext( + struct xfs_scrub *sc, + const struct xfs_alloc_rec_incore *rec) +{ + enum xbtree_recpacking outcome; + int error; + + if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->ar_startblock, rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be shared or CoW staging. */ + if (sc->sa.refc_cur) { + error = xfs_refcount_has_records(sc->sa.refc_cur, + XFS_REFC_DOMAIN_SHARED, rec->ar_startblock, + rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + error = xfs_refcount_has_records(sc->sa.refc_cur, + XFS_REFC_DOMAIN_COW, rec->ar_startblock, + rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + } + + return 0; +} + +/* + * Stash a free space record for all the space since the last bno we found + * all the way up to @end. + */ +static int +xrep_abt_stash( + struct xrep_abt *ra, + xfs_agblock_t end) +{ + struct xfs_alloc_rec_incore arec = { + .ar_startblock = ra->next_agbno, + .ar_blockcount = end - ra->next_agbno, + }; + struct xfs_scrub *sc = ra->sc; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + error = xrep_abt_check_free_ext(ra->sc, &arec); + if (error) + return error; + + trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec); + + error = xfarray_append(ra->free_records, &arec); + if (error) + return error; + + ra->nr_blocks += arec.ar_blockcount; + return 0; +} + +/* Record extents that aren't in use from gaps in the rmap records. */ +STATIC int +xrep_abt_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_abt *ra = priv; + int error; + + /* Record all the OWN_AG blocks... */ + if (rec->rm_owner == XFS_RMAP_OWN_AG) { + error = xagb_bitmap_set(&ra->old_allocbt_blocks, + rec->rm_startblock, rec->rm_blockcount); + if (error) + return error; + } + + /* ...and all the rmapbt blocks... */ + error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur); + if (error) + return error; + + /* ...and all the free space. 
*/ + if (rec->rm_startblock > ra->next_agbno) { + error = xrep_abt_stash(ra, rec->rm_startblock); + if (error) + return error; + } + + /* + * rmap records can overlap on reflink filesystems, so project + * next_agbno as far out into the AG space as we currently know about. + */ + ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno, + rec->rm_startblock + rec->rm_blockcount); + return 0; +} + +/* Collect an AGFL block for the not-to-release list. */ +static int +xrep_abt_walk_agfl( + struct xfs_mount *mp, + xfs_agblock_t agbno, + void *priv) +{ + struct xrep_abt *ra = priv; + + return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1); +} + +/* + * Compare two free space extents by block number. We want to sort in order of + * increasing block number. + */ +static int +xrep_bnobt_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_alloc_rec_incore *ap = a; + const struct xfs_alloc_rec_incore *bp = b; + + if (ap->ar_startblock > bp->ar_startblock) + return 1; + else if (ap->ar_startblock < bp->ar_startblock) + return -1; + return 0; +} + +/* + * Re-sort the free extents by block number so that we can put the records into + * the bnobt in the correct order. Make sure the records do not overlap in + * physical space. + */ +STATIC int +xrep_bnobt_sort_records( + struct xrep_abt *ra) +{ + struct xfs_alloc_rec_incore arec; + xfarray_idx_t cur = XFARRAY_CURSOR_INIT; + xfs_agblock_t next_agbno = 0; + int error; + + error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0); + if (error) + return error; + + while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) { + if (arec.ar_startblock < next_agbno) + return -EFSCORRUPTED; + + next_agbno = arec.ar_startblock + arec.ar_blockcount; + } + + return error; +} + +/* + * Compare two free space extents by length and then block number. We want + * to sort first in order of increasing length and then in order of increasing + * block number. + */ +static int +xrep_cntbt_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_alloc_rec_incore *ap = a; + const struct xfs_alloc_rec_incore *bp = b; + + if (ap->ar_blockcount > bp->ar_blockcount) + return 1; + else if (ap->ar_blockcount < bp->ar_blockcount) + return -1; + return xrep_bnobt_extent_cmp(a, b); +} + +/* + * Sort the free extents by length so so that we can put the records into the + * cntbt in the correct order. Don't let userspace kill us if we're resorting + * after allocating btree blocks. + */ +STATIC int +xrep_cntbt_sort_records( + struct xrep_abt *ra, + bool is_resort) +{ + return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp, + is_resort ? 0 : XFARRAY_SORT_KILLABLE); +} + +/* + * Iterate all reverse mappings to find (1) the gaps between rmap records (all + * unowned space), (2) the OWN_AG extents (which encompass the free space + * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL + * blocks. The free space is (1) + (2) - (3) - (4). + */ +STATIC int +xrep_abt_find_freespace( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_buf *agfl_bp; + xfs_agblock_t agend; + int error; + + xagb_bitmap_init(&ra->not_allocbt_blocks); + + xrep_ag_btcur_init(sc, &sc->sa); + + /* + * Iterate all the reverse mappings to find gaps in the physical + * mappings, all the OWN_AG blocks, and all the rmapbt extents. 
+ */ + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra); + if (error) + goto err; + + /* Insert a record for space between the last rmap and EOAG. */ + agend = be32_to_cpu(agf->agf_length); + if (ra->next_agbno < agend) { + error = xrep_abt_stash(ra, agend); + if (error) + goto err; + } + + /* Collect all the AGFL blocks. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + goto err; + + error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra); + if (error) + goto err_agfl; + + /* Compute the old bnobt/cntbt blocks. */ + error = xagb_bitmap_disunion(&ra->old_allocbt_blocks, + &ra->not_allocbt_blocks); + if (error) + goto err_agfl; + + ra->nr_real_records = xfarray_length(ra->free_records); +err_agfl: + xfs_trans_brelse(sc->tp, agfl_bp); +err: + xchk_ag_btcur_free(&sc->sa); + xagb_bitmap_destroy(&ra->not_allocbt_blocks); + return error; +} + +/* + * We're going to use the observed free space records to reserve blocks for the + * new free space btrees, so we play an iterative game where we try to converge + * on the number of blocks we need: + * + * 1. Estimate how many blocks we'll need to store the records. + * 2. If the first free record has more blocks than we need, we're done. + * We will have to re-sort the records prior to building the cntbt. + * 3. If that record has exactly the number of blocks we need, null out the + * record. We're done. + * 4. Otherwise, we still need more blocks. Null out the record, subtract its + * length from the number of blocks we need, and go back to step 1. + * + * Fortunately, we don't have to do any transaction work to play this game, so + * we don't have to tear down the staging cursors. + */ +STATIC int +xrep_abt_reserve_space( + struct xrep_abt *ra, + struct xfs_btree_cur *bno_cur, + struct xfs_btree_cur *cnt_cur, + bool *needs_resort) +{ + struct xfs_scrub *sc = ra->sc; + xfarray_idx_t record_nr; + unsigned int allocated = 0; + int error = 0; + + record_nr = xfarray_length(ra->free_records) - 1; + do { + struct xfs_alloc_rec_incore arec; + uint64_t required; + unsigned int desired; + unsigned int len; + + /* Compute how many blocks we'll need. */ + error = xfs_btree_bload_compute_geometry(cnt_cur, + &ra->new_cntbt.bload, ra->nr_real_records); + if (error) + break; + + error = xfs_btree_bload_compute_geometry(bno_cur, + &ra->new_bnobt.bload, ra->nr_real_records); + if (error) + break; + + /* How many btree blocks do we need to store all records? */ + required = ra->new_bnobt.bload.nr_blocks + + ra->new_cntbt.bload.nr_blocks; + ASSERT(required < INT_MAX); + + /* If we've reserved enough blocks, we're done. */ + if (allocated >= required) + break; + + desired = required - allocated; + + /* We need space but there's none left; bye! */ + if (ra->nr_real_records == 0) { + error = -ENOSPC; + break; + } + + /* Grab the first record from the list. */ + error = xfarray_load(ra->free_records, record_nr, &arec); + if (error) + break; + + ASSERT(arec.ar_blockcount <= UINT_MAX); + len = min_t(unsigned int, arec.ar_blockcount, desired); + + trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno, + arec.ar_startblock, len, XFS_RMAP_OWN_AG); + + error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag, + arec.ar_startblock, len); + if (error) + break; + allocated += len; + ra->nr_blocks -= len; + + if (arec.ar_blockcount > desired) { + /* + * Record has more space than we need. 
The number of + * free records doesn't change, so shrink the free + * record, inform the caller that the records are no + * longer sorted by length, and exit. + */ + arec.ar_startblock += desired; + arec.ar_blockcount -= desired; + error = xfarray_store(ra->free_records, record_nr, + &arec); + if (error) + break; + + *needs_resort = true; + return 0; + } + + /* + * We're going to use up the entire record, so unset it and + * move on to the next one. This changes the number of free + * records (but doesn't break the sorting order), so we must + * go around the loop once more to re-run _bload_init. + */ + error = xfarray_unset(ra->free_records, record_nr); + if (error) + break; + ra->nr_real_records--; + record_nr--; + } while (1); + + return error; +} + +STATIC int +xrep_abt_dispose_one( + struct xrep_abt *ra, + struct xrep_newbt_resv *resv) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_perag *pag = sc->sa.pag; + xfs_agblock_t free_agbno = resv->agbno + resv->used; + xfs_extlen_t free_aglen = resv->len - resv->used; + int error; + + ASSERT(pag == resv->pag); + + /* Add a deferred rmap for each extent we used. */ + if (resv->used > 0) + xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno, + resv->used, XFS_RMAP_OWN_AG); + + /* + * For each reserved btree block we didn't use, add it to the free + * space btree. We didn't touch fdblocks when we reserved them, so + * we don't touch it now. + */ + if (free_aglen == 0) + return 0; + + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, + free_aglen, ra->new_bnobt.oinfo.oi_owner); + + error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen, + &ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true); + if (error) + return error; + + return xrep_defer_finish(sc); +} + +/* + * Deal with all the space we reserved. Blocks that were allocated for the + * free space btrees need to have a (deferred) rmap added for the OWN_AG + * allocation, and blocks that didn't get used can be freed via the usual + * (deferred) means. + */ +STATIC void +xrep_abt_dispose_reservations( + struct xrep_abt *ra, + int error) +{ + struct xrep_newbt_resv *resv, *n; + + if (error) + goto junkit; + + list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) { + error = xrep_abt_dispose_one(ra, resv); + if (error) + goto junkit; + } + +junkit: + list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) { + xfs_perag_put(resv->pag); + list_del(&resv->list); + kfree(resv); + } + + xrep_newbt_cancel(&ra->new_bnobt); + xrep_newbt_cancel(&ra->new_cntbt); +} + +/* Retrieve free space data for bulk load. */ +STATIC int +xrep_abt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_alloc_rec_incore *arec = &cur->bc_rec.a; + struct xrep_abt *ra = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load_next(ra->free_records, &ra->array_cur, + arec); + if (error) + return error; + + ra->longest = max(ra->longest, arec->ar_blockcount); + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. 
*/ +STATIC int +xrep_abt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_abt *ra = priv; + + return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr); +} + +/* + * Reset the AGF counters to reflect the free space btrees that we just + * rebuilt, then reinitialize the per-AG data. + */ +STATIC int +xrep_abt_reset_counters( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + unsigned int freesp_btreeblks = 0; + + /* + * Compute the contribution to agf_btreeblks for the new free space + * btrees. This is the computed btree size minus anything we didn't + * use. + */ + freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1; + freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1; + + freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt); + freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt); + + /* + * The AGF header contains extra information related to the free space + * btrees, so we must update those fields here. + */ + agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks + + (be32_to_cpu(agf->agf_rmap_blocks) - 1)); + agf->agf_freeblks = cpu_to_be32(ra->nr_blocks); + agf->agf_longest = cpu_to_be32(ra->longest); + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS | + XFS_AGF_LONGEST | + XFS_AGF_FREEBLKS); + + /* + * After we commit the new btree to disk, it is possible that the + * process to reap the old btree blocks will race with the AIL trying + * to checkpoint the old btree blocks into the filesystem. If the new + * tree is shorter than the old one, the allocbt write verifier will + * fail and the AIL will shut down the filesystem. + * + * To avoid this, save the old incore btree height values as the alt + * height values before re-initializing the perag info from the updated + * AGF to capture all the new values. + */ + pag->pagf_repair_levels[XFS_BTNUM_BNOi] = pag->pagf_levels[XFS_BTNUM_BNOi]; + pag->pagf_repair_levels[XFS_BTNUM_CNTi] = pag->pagf_levels[XFS_BTNUM_CNTi]; + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagf(sc); +} + +/* + * Use the collected free space information to stage new free space btrees. + * If this is successful we'll return with the new btree root + * information logged to the repair transaction but not yet committed. + */ +STATIC int +xrep_abt_build_new_trees( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_btree_cur *bno_cur; + struct xfs_btree_cur *cnt_cur; + struct xfs_perag *pag = sc->sa.pag; + bool needs_resort = false; + int error; + + /* + * Sort the free extents by length so that we can set up the free space + * btrees in as few extents as possible. This reduces the amount of + * deferred rmap / free work we have to do at the end. + */ + error = xrep_cntbt_sort_records(ra, false); + if (error) + return error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the AG header. + */ + xrep_newbt_init_bare(&ra->new_bnobt, sc); + xrep_newbt_init_bare(&ra->new_cntbt, sc); + + ra->new_bnobt.bload.get_records = xrep_abt_get_records; + ra->new_cntbt.bload.get_records = xrep_abt_get_records; + + ra->new_bnobt.bload.claim_block = xrep_abt_claim_block; + ra->new_cntbt.bload.claim_block = xrep_abt_claim_block; + + /* Allocate cursors for the staged btrees. 
*/ + bno_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_bnobt.afake, + pag, XFS_BTNUM_BNO); + cnt_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_cntbt.afake, + pag, XFS_BTNUM_CNT); + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* Reserve the space we'll need for the new btrees. */ + error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort); + if (error) + goto err_cur; + + /* + * If we need to re-sort the free extents by length, do so so that we + * can put the records into the cntbt in the correct order. + */ + if (needs_resort) { + error = xrep_cntbt_sort_records(ra, needs_resort); + if (error) + goto err_cur; + } + + /* + * Due to btree slack factors, it's possible for a new btree to be one + * level taller than the old btree. Update the alternate incore btree + * height so that we don't trip the verifiers when writing the new + * btree blocks to disk. + */ + pag->pagf_repair_levels[XFS_BTNUM_BNOi] = + ra->new_bnobt.bload.btree_height; + pag->pagf_repair_levels[XFS_BTNUM_CNTi] = + ra->new_cntbt.bload.btree_height; + + /* Load the free space by length tree. */ + ra->array_cur = XFARRAY_CURSOR_INIT; + ra->longest = 0; + error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra); + if (error) + goto err_levels; + + error = xrep_bnobt_sort_records(ra); + if (error) + return error; + + /* Load the free space by block number tree. */ + ra->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra); + if (error) + goto err_levels; + + /* + * Install the new btrees in the AG header. After this point the old + * btrees are no longer accessible and the new trees are live. + */ + xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(bno_cur, 0); + xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(cnt_cur, 0); + + /* Reset the AGF counters now that we've changed the btree shape. */ + error = xrep_abt_reset_counters(ra); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + xrep_abt_dispose_reservations(ra, error); + + return xrep_roll_ag_trans(sc); + +err_levels: + pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0; + pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0; +err_cur: + xfs_btree_del_cursor(cnt_cur, error); + xfs_btree_del_cursor(bno_cur, error); +err_newbt: + xrep_abt_dispose_reservations(ra, error); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_abt_remove_old_trees( + struct xrep_abt *ra) +{ + struct xfs_perag *pag = ra->sc->sa.pag; + int error; + + /* Free the old btree blocks if they're not in use. */ + error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks, + &XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE); + if (error) + return error; + + /* + * Now that we've zapped all the old allocbt blocks we can turn off + * the alternate height mechanism. + */ + pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0; + pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0; + return 0; +} + +/* Repair the freespace btrees for some AG. */ +int +xrep_allocbt( + struct xfs_scrub *sc) +{ + struct xrep_abt *ra; + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* We require the rmapbt to rebuild anything. 
*/ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS); + if (!ra) + return -ENOMEM; + ra->sc = sc; + + /* We rebuild both data structures. */ + sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT; + + /* + * Make sure the busy extent list is clear because we can't put extents + * on there twice. In theory we cleared this before we started, but + * let's not risk the filesystem. + */ + if (!xfs_extent_busy_list_empty(sc->sa.pag)) { + error = -EDEADLOCK; + goto out_ra; + } + + /* Set up enough storage to handle maximally fragmented free space. */ + descr = xchk_xfile_ag_descr(sc, "free space records"); + error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2, + sizeof(struct xfs_alloc_rec_incore), + &ra->free_records); + kfree(descr); + if (error) + goto out_ra; + + /* Collect the free space data and find the old btree blocks. */ + xagb_bitmap_init(&ra->old_allocbt_blocks); + error = xrep_abt_find_freespace(ra); + if (error) + goto out_bitmap; + + /* Rebuild the free space information. */ + error = xrep_abt_build_new_trees(ra); + if (error) + goto out_bitmap; + + /* Kill the old trees. */ + error = xrep_abt_remove_old_trees(ra); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&ra->old_allocbt_blocks); + xfarray_destroy(ra->free_records); +out_ra: + kfree(ra); + return error; +} + +/* Make sure both btrees are ok after we've rebuilt them. */ +int +xrep_revalidate_allocbt( + struct xfs_scrub *sc) +{ + __u32 old_type = sc->sm->sm_type; + int error; + + /* + * We must update sm_type temporarily so that the tree-to-tree cross + * reference checks will work in the correct direction, and also so + * that tracing will report correctly if there are more errors. + */ + sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT; + error = xchk_allocbt(sc); + if (error) + goto out; + + sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT; + error = xchk_allocbt(sc); +out: + sc->sm->sm_type = old_type; + return error; +} diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 6c16d9530cca..83c7feb38714 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -527,28 +527,23 @@ xchk_xattr_check_sf( struct xfs_scrub *sc) { struct xchk_xattr_buf *ab = sc->buf; - struct xfs_attr_shortform *sf; - struct xfs_attr_sf_entry *sfe; + struct xfs_ifork *ifp = &sc->ip->i_af; + struct xfs_attr_sf_hdr *sf = ifp->if_data; + struct xfs_attr_sf_entry *sfe = xfs_attr_sf_firstentry(sf); struct xfs_attr_sf_entry *next; - struct xfs_ifork *ifp; - unsigned char *end; + unsigned char *end = ifp->if_data + ifp->if_bytes; int i; int error = 0; - ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); - bitmap_zero(ab->usedmap, ifp->if_bytes); - sf = (struct xfs_attr_shortform *)sc->ip->i_af.if_u1.if_data; - end = (unsigned char *)ifp->if_u1.if_data + ifp->if_bytes; - xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(sf->hdr)); + xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(*sf)); - sfe = &sf->list[0]; if ((unsigned char *)sfe > end) { xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); return 0; } - for (i = 0; i < sf->hdr.count; i++) { + for (i = 0; i < sf->count; i++) { unsigned char *name = sfe->nameval; unsigned char *value = &sfe->nameval[sfe->namelen]; diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index e0c89a9a0ca0..1449bb5262d9 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -16,7 +16,9 @@ #include <linux/interval_tree_generic.h> -struct xbitmap_node { +/* u64 bitmap */ + +struct xbitmap64_node { struct rb_node bn_rbnode; /* First set bit of this interval 
and subtree. */ @@ -39,72 +41,72 @@ struct xbitmap_node { * forward-declare them anyway for clarity. */ static inline void -xbitmap_tree_insert(struct xbitmap_node *node, struct rb_root_cached *root); +xbitmap64_tree_insert(struct xbitmap64_node *node, struct rb_root_cached *root); static inline void -xbitmap_tree_remove(struct xbitmap_node *node, struct rb_root_cached *root); +xbitmap64_tree_remove(struct xbitmap64_node *node, struct rb_root_cached *root); -static inline struct xbitmap_node * -xbitmap_tree_iter_first(struct rb_root_cached *root, uint64_t start, +static inline struct xbitmap64_node * +xbitmap64_tree_iter_first(struct rb_root_cached *root, uint64_t start, uint64_t last); -static inline struct xbitmap_node * -xbitmap_tree_iter_next(struct xbitmap_node *node, uint64_t start, +static inline struct xbitmap64_node * +xbitmap64_tree_iter_next(struct xbitmap64_node *node, uint64_t start, uint64_t last); -INTERVAL_TREE_DEFINE(struct xbitmap_node, bn_rbnode, uint64_t, - __bn_subtree_last, START, LAST, static inline, xbitmap_tree) +INTERVAL_TREE_DEFINE(struct xbitmap64_node, bn_rbnode, uint64_t, + __bn_subtree_last, START, LAST, static inline, xbitmap64_tree) /* Iterate each interval of a bitmap. Do not change the bitmap. */ -#define for_each_xbitmap_extent(bn, bitmap) \ +#define for_each_xbitmap64_extent(bn, bitmap) \ for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \ - struct xbitmap_node, bn_rbnode); \ + struct xbitmap64_node, bn_rbnode); \ (bn) != NULL; \ (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \ - struct xbitmap_node, bn_rbnode)) + struct xbitmap64_node, bn_rbnode)) /* Clear a range of this bitmap. */ int -xbitmap_clear( - struct xbitmap *bitmap, +xbitmap64_clear( + struct xbitmap64 *bitmap, uint64_t start, uint64_t len) { - struct xbitmap_node *bn; - struct xbitmap_node *new_bn; + struct xbitmap64_node *bn; + struct xbitmap64_node *new_bn; uint64_t last = start + len - 1; - while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last))) { + while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last))) { if (bn->bn_start < start && bn->bn_last > last) { uint64_t old_last = bn->bn_last; /* overlaps with the entire clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_last = start - 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); /* add an extent */ - new_bn = kmalloc(sizeof(struct xbitmap_node), + new_bn = kmalloc(sizeof(struct xbitmap64_node), XCHK_GFP_FLAGS); if (!new_bn) return -ENOMEM; new_bn->bn_start = last + 1; new_bn->bn_last = old_last; - xbitmap_tree_insert(new_bn, &bitmap->xb_root); + xbitmap64_tree_insert(new_bn, &bitmap->xb_root); } else if (bn->bn_start < start) { /* overlaps with the left side of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_last = start - 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); } else if (bn->bn_last > last) { /* overlaps with the right side of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_start = last + 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); break; } else { /* in the middle of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); kfree(bn); } } @@ -114,59 +116,59 @@ xbitmap_clear( /* 
Set a range of this bitmap. */ int -xbitmap_set( - struct xbitmap *bitmap, +xbitmap64_set( + struct xbitmap64 *bitmap, uint64_t start, uint64_t len) { - struct xbitmap_node *left; - struct xbitmap_node *right; + struct xbitmap64_node *left; + struct xbitmap64_node *right; uint64_t last = start + len - 1; int error; /* Is this whole range already set? */ - left = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + left = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last); if (left && left->bn_start <= start && left->bn_last >= last) return 0; /* Clear out everything in the range we want to set. */ - error = xbitmap_clear(bitmap, start, len); + error = xbitmap64_clear(bitmap, start, len); if (error) return error; /* Do we have a left-adjacent extent? */ - left = xbitmap_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); + left = xbitmap64_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); ASSERT(!left || left->bn_last + 1 == start); /* Do we have a right-adjacent extent? */ - right = xbitmap_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); + right = xbitmap64_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); ASSERT(!right || right->bn_start == last + 1); if (left && right) { /* combine left and right adjacent extent */ - xbitmap_tree_remove(left, &bitmap->xb_root); - xbitmap_tree_remove(right, &bitmap->xb_root); + xbitmap64_tree_remove(left, &bitmap->xb_root); + xbitmap64_tree_remove(right, &bitmap->xb_root); left->bn_last = right->bn_last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); kfree(right); } else if (left) { /* combine with left extent */ - xbitmap_tree_remove(left, &bitmap->xb_root); + xbitmap64_tree_remove(left, &bitmap->xb_root); left->bn_last = last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); } else if (right) { /* combine with right extent */ - xbitmap_tree_remove(right, &bitmap->xb_root); + xbitmap64_tree_remove(right, &bitmap->xb_root); right->bn_start = start; - xbitmap_tree_insert(right, &bitmap->xb_root); + xbitmap64_tree_insert(right, &bitmap->xb_root); } else { /* add an extent */ - left = kmalloc(sizeof(struct xbitmap_node), XCHK_GFP_FLAGS); + left = kmalloc(sizeof(struct xbitmap64_node), XCHK_GFP_FLAGS); if (!left) return -ENOMEM; left->bn_start = start; left->bn_last = last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); } return 0; @@ -174,21 +176,21 @@ xbitmap_set( /* Free everything related to this bitmap. */ void -xbitmap_destroy( - struct xbitmap *bitmap) +xbitmap64_destroy( + struct xbitmap64 *bitmap) { - struct xbitmap_node *bn; + struct xbitmap64_node *bn; - while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) { - xbitmap_tree_remove(bn, &bitmap->xb_root); + while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) { + xbitmap64_tree_remove(bn, &bitmap->xb_root); kfree(bn); } } /* Set up a per-AG block bitmap. */ void -xbitmap_init( - struct xbitmap *bitmap) +xbitmap64_init( + struct xbitmap64 *bitmap) { bitmap->xb_root = RB_ROOT_CACHED; } @@ -208,18 +210,18 @@ xbitmap_init( * This is the logical equivalent of bitmap &= ~sub. 
*/ int -xbitmap_disunion( - struct xbitmap *bitmap, - struct xbitmap *sub) +xbitmap64_disunion( + struct xbitmap64 *bitmap, + struct xbitmap64 *sub) { - struct xbitmap_node *bn; + struct xbitmap64_node *bn; int error; - if (xbitmap_empty(bitmap) || xbitmap_empty(sub)) + if (xbitmap64_empty(bitmap) || xbitmap64_empty(sub)) return 0; - for_each_xbitmap_extent(bn, sub) { - error = xbitmap_clear(bitmap, bn->bn_start, + for_each_xbitmap64_extent(bn, sub) { + error = xbitmap64_clear(bitmap, bn->bn_start, bn->bn_last - bn->bn_start + 1); if (error) return error; @@ -228,88 +230,273 @@ xbitmap_disunion( return 0; } +/* How many bits are set in this bitmap? */ +uint64_t +xbitmap64_hweight( + struct xbitmap64 *bitmap) +{ + struct xbitmap64_node *bn; + uint64_t ret = 0; + + for_each_xbitmap64_extent(bn, bitmap) + ret += bn->bn_last - bn->bn_start + 1; + + return ret; +} + +/* Call a function for every run of set bits in this bitmap. */ +int +xbitmap64_walk( + struct xbitmap64 *bitmap, + xbitmap64_walk_fn fn, + void *priv) +{ + struct xbitmap64_node *bn; + int error = 0; + + for_each_xbitmap64_extent(bn, bitmap) { + error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); + if (error) + break; + } + + return error; +} + +/* Does this bitmap have no bits set at all? */ +bool +xbitmap64_empty( + struct xbitmap64 *bitmap) +{ + return bitmap->xb_root.rb_root.rb_node == NULL; +} + +/* Is the start of the range set or clear? And for how long? */ +bool +xbitmap64_test( + struct xbitmap64 *bitmap, + uint64_t start, + uint64_t *len) +{ + struct xbitmap64_node *bn; + uint64_t last = start + *len - 1; + + bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last); + if (!bn) + return false; + if (bn->bn_start <= start) { + if (bn->bn_last < last) + *len = bn->bn_last - start + 1; + return true; + } + *len = bn->bn_start - start; + return false; +} + +/* u32 bitmap */ + +struct xbitmap32_node { + struct rb_node bn_rbnode; + + /* First set bit of this interval and subtree. */ + uint32_t bn_start; + + /* Last set bit of this interval. */ + uint32_t bn_last; + + /* Last set bit of this subtree. Do not touch this. */ + uint32_t __bn_subtree_last; +}; + +/* Define our own interval tree type with uint32_t parameters. */ + /* - * Record all btree blocks seen while iterating all records of a btree. - * - * We know that the btree query_all function starts at the left edge and walks - * towards the right edge of the tree. Therefore, we know that we can walk up - * the btree cursor towards the root; if the pointer for a given level points - * to the first record/key in that block, we haven't seen this block before; - * and therefore we need to remember that we saw this block in the btree. - * - * So if our btree is: - * - * 4 - * / | \ - * 1 2 3 - * - * Pretend for this example that each leaf block has 100 btree records. For - * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we - * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so - * we record block 4. The list is [1, 4]. - * - * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit - * the loop. The list remains [1, 4]. - * - * For the 101st btree record, we've moved onto leaf block 2. Now - * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that - * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2]. - * - * For the 102nd record, bc_levels[0].ptr == 2, so we continue. - * - * For the 201st record, we've moved on to leaf block 3. 
- * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3]. - * - * For the 300th record we just exit, with the list being [1, 4, 2, 3]. + * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll + * forward-declare them anyway for clarity. */ +static inline void +xbitmap32_tree_insert(struct xbitmap32_node *node, struct rb_root_cached *root); -/* Mark a btree block to the agblock bitmap. */ -STATIC int -xagb_bitmap_visit_btblock( - struct xfs_btree_cur *cur, - int level, - void *priv) +static inline void +xbitmap32_tree_remove(struct xbitmap32_node *node, struct rb_root_cached *root); + +static inline struct xbitmap32_node * +xbitmap32_tree_iter_first(struct rb_root_cached *root, uint32_t start, + uint32_t last); + +static inline struct xbitmap32_node * +xbitmap32_tree_iter_next(struct xbitmap32_node *node, uint32_t start, + uint32_t last); + +INTERVAL_TREE_DEFINE(struct xbitmap32_node, bn_rbnode, uint32_t, + __bn_subtree_last, START, LAST, static inline, xbitmap32_tree) + +/* Iterate each interval of a bitmap. Do not change the bitmap. */ +#define for_each_xbitmap32_extent(bn, bitmap) \ + for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \ + struct xbitmap32_node, bn_rbnode); \ + (bn) != NULL; \ + (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \ + struct xbitmap32_node, bn_rbnode)) + +/* Clear a range of this bitmap. */ +int +xbitmap32_clear( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t len) { - struct xagb_bitmap *bitmap = priv; - struct xfs_buf *bp; - xfs_fsblock_t fsbno; - xfs_agblock_t agbno; + struct xbitmap32_node *bn; + struct xbitmap32_node *new_bn; + uint32_t last = start + len - 1; - xfs_btree_get_block(cur, level, &bp); - if (!bp) - return 0; + while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last))) { + if (bn->bn_start < start && bn->bn_last > last) { + uint32_t old_last = bn->bn_last; - fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); - agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + /* overlaps with the entire clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); - return xagb_bitmap_set(bitmap, agbno, 1); + /* add an extent */ + new_bn = kmalloc(sizeof(struct xbitmap32_node), + XCHK_GFP_FLAGS); + if (!new_bn) + return -ENOMEM; + new_bn->bn_start = last + 1; + new_bn->bn_last = old_last; + xbitmap32_tree_insert(new_bn, &bitmap->xb_root); + } else if (bn->bn_start < start) { + /* overlaps with the left side of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); + } else if (bn->bn_last > last) { + /* overlaps with the right side of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_start = last + 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); + break; + } else { + /* in the middle of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + kfree(bn); + } + } + + return 0; } -/* Mark all (per-AG) btree blocks in the agblock bitmap. */ +/* Set a range of this bitmap. */ int -xagb_bitmap_set_btblocks( - struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur) +xbitmap32_set( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t len) { - return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock, - XFS_BTREE_VISIT_ALL, bitmap); + struct xbitmap32_node *left; + struct xbitmap32_node *right; + uint32_t last = start + len - 1; + int error; + + /* Is this whole range already set? 
*/ + left = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last); + if (left && left->bn_start <= start && left->bn_last >= last) + return 0; + + /* Clear out everything in the range we want to set. */ + error = xbitmap32_clear(bitmap, start, len); + if (error) + return error; + + /* Do we have a left-adjacent extent? */ + left = xbitmap32_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); + ASSERT(!left || left->bn_last + 1 == start); + + /* Do we have a right-adjacent extent? */ + right = xbitmap32_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); + ASSERT(!right || right->bn_start == last + 1); + + if (left && right) { + /* combine left and right adjacent extent */ + xbitmap32_tree_remove(left, &bitmap->xb_root); + xbitmap32_tree_remove(right, &bitmap->xb_root); + left->bn_last = right->bn_last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + kfree(right); + } else if (left) { + /* combine with left extent */ + xbitmap32_tree_remove(left, &bitmap->xb_root); + left->bn_last = last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + } else if (right) { + /* combine with right extent */ + xbitmap32_tree_remove(right, &bitmap->xb_root); + right->bn_start = start; + xbitmap32_tree_insert(right, &bitmap->xb_root); + } else { + /* add an extent */ + left = kmalloc(sizeof(struct xbitmap32_node), XCHK_GFP_FLAGS); + if (!left) + return -ENOMEM; + left->bn_start = start; + left->bn_last = last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + } + + return 0; +} + +/* Free everything related to this bitmap. */ +void +xbitmap32_destroy( + struct xbitmap32 *bitmap) +{ + struct xbitmap32_node *bn; + + while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, 0, -1U))) { + xbitmap32_tree_remove(bn, &bitmap->xb_root); + kfree(bn); + } +} + +/* Set up a per-AG block bitmap. */ +void +xbitmap32_init( + struct xbitmap32 *bitmap) +{ + bitmap->xb_root = RB_ROOT_CACHED; } /* - * Record all the buffers pointed to by the btree cursor. Callers already - * engaged in a btree walk should call this function to capture the list of - * blocks going from the leaf towards the root. + * Remove all the blocks mentioned in @sub from the extents in @bitmap. + * + * The intent is that callers will iterate the rmapbt for all of its records + * for a given owner to generate @bitmap; and iterate all the blocks of the + * metadata structures that are not being rebuilt and have the same rmapbt + * owner to generate @sub. This routine subtracts all the extents + * mentioned in sub from all the extents linked in @bitmap, which leaves + * @bitmap as the list of blocks that are not accounted for, which we assume + * are the dead blocks of the old metadata structure. The blocks mentioned in + * @bitmap can be reaped. + * + * This is the logical equivalent of bitmap &= ~sub. */ int -xagb_bitmap_set_btcur_path( - struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur) +xbitmap32_disunion( + struct xbitmap32 *bitmap, + struct xbitmap32 *sub) { - int i; + struct xbitmap32_node *bn; int error; - for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) { - error = xagb_bitmap_visit_btblock(cur, i, bitmap); + if (xbitmap32_empty(bitmap) || xbitmap32_empty(sub)) + return 0; + + for_each_xbitmap32_extent(bn, sub) { + error = xbitmap32_clear(bitmap, bn->bn_start, + bn->bn_last - bn->bn_start + 1); if (error) return error; } @@ -318,14 +505,14 @@ xagb_bitmap_set_btcur_path( } /* How many bits are set in this bitmap? 
*/ -uint64_t -xbitmap_hweight( - struct xbitmap *bitmap) +uint32_t +xbitmap32_hweight( + struct xbitmap32 *bitmap) { - struct xbitmap_node *bn; - uint64_t ret = 0; + struct xbitmap32_node *bn; + uint32_t ret = 0; - for_each_xbitmap_extent(bn, bitmap) + for_each_xbitmap32_extent(bn, bitmap) ret += bn->bn_last - bn->bn_start + 1; return ret; @@ -333,15 +520,15 @@ xbitmap_hweight( /* Call a function for every run of set bits in this bitmap. */ int -xbitmap_walk( - struct xbitmap *bitmap, - xbitmap_walk_fn fn, +xbitmap32_walk( + struct xbitmap32 *bitmap, + xbitmap32_walk_fn fn, void *priv) { - struct xbitmap_node *bn; + struct xbitmap32_node *bn; int error = 0; - for_each_xbitmap_extent(bn, bitmap) { + for_each_xbitmap32_extent(bn, bitmap) { error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); if (error) break; @@ -352,23 +539,23 @@ xbitmap_walk( /* Does this bitmap have no bits set at all? */ bool -xbitmap_empty( - struct xbitmap *bitmap) +xbitmap32_empty( + struct xbitmap32 *bitmap) { return bitmap->xb_root.rb_root.rb_node == NULL; } /* Is the start of the range set or clear? And for how long? */ bool -xbitmap_test( - struct xbitmap *bitmap, - uint64_t start, - uint64_t *len) +xbitmap32_test( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t *len) { - struct xbitmap_node *bn; - uint64_t last = start + *len - 1; + struct xbitmap32_node *bn; + uint32_t last = start + *len - 1; - bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last); if (!bn) return false; if (bn->bn_start <= start) { diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index 4fe58bad6734..2df8911606d6 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -6,17 +6,19 @@ #ifndef __XFS_SCRUB_BITMAP_H__ #define __XFS_SCRUB_BITMAP_H__ -struct xbitmap { +/* u64 bitmap */ + +struct xbitmap64 { struct rb_root_cached xb_root; }; -void xbitmap_init(struct xbitmap *bitmap); -void xbitmap_destroy(struct xbitmap *bitmap); +void xbitmap64_init(struct xbitmap64 *bitmap); +void xbitmap64_destroy(struct xbitmap64 *bitmap); -int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len); -int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len); -int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub); -uint64_t xbitmap_hweight(struct xbitmap *bitmap); +int xbitmap64_clear(struct xbitmap64 *bitmap, uint64_t start, uint64_t len); +int xbitmap64_set(struct xbitmap64 *bitmap, uint64_t start, uint64_t len); +int xbitmap64_disunion(struct xbitmap64 *bitmap, struct xbitmap64 *sub); +uint64_t xbitmap64_hweight(struct xbitmap64 *bitmap); /* * Return codes for the bitmap iterator functions are 0 to continue iterating, @@ -25,84 +27,39 @@ uint64_t xbitmap_hweight(struct xbitmap *bitmap); * iteration, because neither bitmap iterator ever generates that error code on * its own. Callers must not modify the bitmap while walking it. 
*/ -typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv); -int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn, +typedef int (*xbitmap64_walk_fn)(uint64_t start, uint64_t len, void *priv); +int xbitmap64_walk(struct xbitmap64 *bitmap, xbitmap64_walk_fn fn, void *priv); -bool xbitmap_empty(struct xbitmap *bitmap); -bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len); +bool xbitmap64_empty(struct xbitmap64 *bitmap); +bool xbitmap64_test(struct xbitmap64 *bitmap, uint64_t start, uint64_t *len); -/* Bitmaps, but for type-checked for xfs_agblock_t */ +/* u32 bitmap */ -struct xagb_bitmap { - struct xbitmap agbitmap; +struct xbitmap32 { + struct rb_root_cached xb_root; }; -static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap) -{ - xbitmap_init(&bitmap->agbitmap); -} - -static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap) -{ - xbitmap_destroy(&bitmap->agbitmap); -} - -static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap, - xfs_agblock_t start, xfs_extlen_t len) -{ - return xbitmap_clear(&bitmap->agbitmap, start, len); -} -static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap, - xfs_agblock_t start, xfs_extlen_t len) -{ - return xbitmap_set(&bitmap->agbitmap, start, len); -} - -static inline bool -xagb_bitmap_test( - struct xagb_bitmap *bitmap, - xfs_agblock_t start, - xfs_extlen_t *len) -{ - uint64_t biglen = *len; - bool ret; - - ret = xbitmap_test(&bitmap->agbitmap, start, &biglen); - - if (start + biglen >= UINT_MAX) { - ASSERT(0); - biglen = UINT_MAX - start; - } - - *len = biglen; - return ret; -} - -static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap, - struct xagb_bitmap *sub) -{ - return xbitmap_disunion(&bitmap->agbitmap, &sub->agbitmap); -} +void xbitmap32_init(struct xbitmap32 *bitmap); +void xbitmap32_destroy(struct xbitmap32 *bitmap); -static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap) -{ - return xbitmap_hweight(&bitmap->agbitmap); -} -static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap) -{ - return xbitmap_empty(&bitmap->agbitmap); -} +int xbitmap32_clear(struct xbitmap32 *bitmap, uint32_t start, uint32_t len); +int xbitmap32_set(struct xbitmap32 *bitmap, uint32_t start, uint32_t len); +int xbitmap32_disunion(struct xbitmap32 *bitmap, struct xbitmap32 *sub); +uint32_t xbitmap32_hweight(struct xbitmap32 *bitmap); -static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, - xbitmap_walk_fn fn, void *priv) -{ - return xbitmap_walk(&bitmap->agbitmap, fn, priv); -} +/* + * Return codes for the bitmap iterator functions are 0 to continue iterating, + * and non-zero to stop iterating. Any non-zero value will be passed up to the + * iteration caller. The special value -ECANCELED can be used to stop + * iteration, because neither bitmap iterator ever generates that error code on + * its own. Callers must not modify the bitmap while walking it. 
+ */ +typedef int (*xbitmap32_walk_fn)(uint32_t start, uint32_t len, void *priv); +int xbitmap32_walk(struct xbitmap32 *bitmap, xbitmap32_walk_fn fn, + void *priv); -int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur); -int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur); +bool xbitmap32_empty(struct xbitmap32 *bitmap); +bool xbitmap32_test(struct xbitmap32 *bitmap, uint32_t start, uint32_t *len); #endif /* __XFS_SCRUB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 06d8c1996a33..b169cddde6da 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -19,9 +19,11 @@ #include "xfs_bmap_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "scrub/health.h" #include "xfs_ag.h" /* Set us up with an inode's bmap. */ @@ -48,9 +50,18 @@ xchk_setup_inode_bmap( if (S_ISREG(VFS_I(sc->ip)->i_mode) && sc->sm->sm_type != XFS_SCRUB_TYPE_BMBTA) { struct address_space *mapping = VFS_I(sc->ip)->i_mapping; + bool is_repair = xchk_could_repair(sc); xchk_ilock(sc, XFS_MMAPLOCK_EXCL); + /* Break all our leases, we're going to mess with things. */ + if (is_repair) { + error = xfs_break_layouts(VFS_I(sc->ip), + &sc->ilock_flags, BREAK_WRITE); + if (error) + goto out; + } + inode_dio_wait(VFS_I(sc->ip)); /* @@ -71,6 +82,15 @@ xchk_setup_inode_bmap( error = filemap_fdatawait_keep_errors(mapping); if (error && (error != -ENOSPC && error != -EIO)) goto out; + + /* Drop the page cache if we're repairing block mappings. */ + if (is_repair) { + error = invalidate_inode_pages2( + VFS_I(sc->ip)->i_mapping); + if (error) + goto out; + } + } /* Got the inode, lock it and we're ready to go. */ @@ -78,6 +98,10 @@ xchk_setup_inode_bmap( if (error) goto out; + error = xchk_ino_dqattach(sc); + if (error) + goto out; + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode */ @@ -633,6 +657,82 @@ xchk_bmap_check_ag_rmaps( } /* + * Decide if we want to scan the reverse mappings to determine if the attr + * fork /really/ has zero space mappings. + */ +STATIC bool +xchk_bmap_check_empty_attrfork( + struct xfs_inode *ip) +{ + struct xfs_ifork *ifp = &ip->i_af; + + /* + * If the dinode repair found a bad attr fork, it will reset the fork + * to extents format with zero records and wait for this scrubber + * to reconstruct the block mappings. If the fork is not in this + * state, then the fork cannot have been zapped. + */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0) + return false; + + /* + * Files can have an attr fork in EXTENTS format with zero records for + * several reasons: + * + * a) an attr set created a fork but ran out of space + * b) attr replace deleted an old attr but failed during the set step + * c) the data fork was in btree format when all attrs were deleted, so + * the fork was left in place + * d) the inode repair code zapped the fork + * + * Only in case (d) do we want to scan the rmapbt to see if we need to + * rebuild the attr fork. The fork zap code clears all DAC permission + * bits and zeroes the uid and gid, so avoid the scan if any of those + * three conditions are not met. 
+ */ + if ((VFS_I(ip)->i_mode & 0777) != 0) + return false; + if (!uid_eq(VFS_I(ip)->i_uid, GLOBAL_ROOT_UID)) + return false; + if (!gid_eq(VFS_I(ip)->i_gid, GLOBAL_ROOT_GID)) + return false; + + return true; +} + +/* + * Decide if we want to scan the reverse mappings to determine if the data + * fork /really/ has zero space mappings. + */ +STATIC bool +xchk_bmap_check_empty_datafork( + struct xfs_inode *ip) +{ + struct xfs_ifork *ifp = &ip->i_df; + + /* Don't support realtime rmap checks yet. */ + if (XFS_IS_REALTIME_INODE(ip)) + return false; + + /* + * If the dinode repair found a bad data fork, it will reset the fork + * to extents format with zero records and wait for this scrubber + * to reconstruct the block mappings. If the fork is not in this + * state, then the fork cannot have been zapped. + */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0) + return false; + + /* + * If we encounter an empty data fork along with evidence that the fork + * might not really be empty, we need to scan the reverse mappings to + * decide if we're going to rebuild the fork. Data forks with nonzero + * file size are scanned. + */ + return i_size_read(VFS_I(ip)) != 0; +} + +/* * Decide if we want to walk every rmap btree in the fs to make sure that each * rmap for this file fork has corresponding bmbt entries. */ @@ -641,7 +741,6 @@ xchk_bmap_want_check_rmaps( struct xchk_bmap_info *info) { struct xfs_scrub *sc = info->sc; - struct xfs_ifork *ifp; if (!xfs_has_rmapbt(sc->mp)) return false; @@ -650,28 +749,10 @@ xchk_bmap_want_check_rmaps( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return false; - /* Don't support realtime rmap checks yet. */ - if (info->is_rt) - return false; - - /* - * The inode repair code zaps broken inode forks by resetting them back - * to EXTENTS format and zero extent records. If we encounter a fork - * in this state along with evidence that the fork isn't supposed to be - * empty, we need to scan the reverse mappings to decide if we're going - * to rebuild the fork. Data forks with nonzero file size are scanned. - * xattr forks are never empty of content, so they are always scanned. - */ - ifp = xfs_ifork_ptr(sc->ip, info->whichfork); - if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents == 0) { - if (info->whichfork == XFS_DATA_FORK && - i_size_read(VFS_I(sc->ip)) == 0) - return false; - - return true; - } + if (info->whichfork == XFS_ATTR_FORK) + return xchk_bmap_check_empty_attrfork(sc->ip); - return false + return xchk_bmap_check_empty_datafork(sc->ip); } /* Make sure each rmap has a corresponding bmbt entry. */ @@ -939,7 +1020,20 @@ int xchk_bmap_data( struct xfs_scrub *sc) { - return xchk_bmap(sc, XFS_DATA_FORK); + int error; + + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTD_ZAPPED)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_bmap(sc, XFS_DATA_FORK); + if (error) + return error; + + /* If the data fork is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTD_ZAPPED); + return 0; } /* Scrub an inode's attr fork. */ @@ -947,7 +1041,27 @@ int xchk_bmap_attr( struct xfs_scrub *sc) { - return xchk_bmap(sc, XFS_ATTR_FORK); + int error; + + /* + * If the attr fork has been zapped, it's possible that forkoff was + * reset to zero and hence sc->ip->i_afp is NULL. We don't want the + * NULL ifp check in xchk_bmap to conclude that the attr fork is ok, + * so short circuit that logic by setting the corruption flag and + * returning immediately. 
+ */ + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTA_ZAPPED)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_bmap(sc, XFS_ATTR_FORK); + if (error) + return error; + + /* If the attr fork is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTA_ZAPPED); + return 0; } /* Scrub an inode's CoW fork. */ diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c new file mode 100644 index 000000000000..a4bb89fdd510 --- /dev/null +++ b/fs/xfs/scrub/bmap_repair.c @@ -0,0 +1,867 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_quota.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "xfs_reflink.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Inode Fork Block Mapping (BMBT) Repair + * ====================================== + * + * Gather all the rmap records for the inode and fork we're fixing, reset the + * incore fork, then recreate the btree. + */ + +enum reflink_scan_state { + RLS_IRRELEVANT = -1, /* not applicable to this file */ + RLS_UNKNOWN, /* shared extent scans required */ + RLS_SET_IFLAG, /* iflag must be set */ +}; + +struct xrep_bmap { + /* Old bmbt blocks */ + struct xfsb_bitmap old_bmbt_blocks; + + /* New fork. */ + struct xrep_newbt new_bmapbt; + + /* List of new bmap records. */ + struct xfarray *bmap_records; + + struct xfs_scrub *sc; + + /* How many blocks did we find allocated to this file? */ + xfs_rfsblock_t nblocks; + + /* How many bmbt blocks did we find for this fork? */ + xfs_rfsblock_t old_bmbt_block_count; + + /* get_records()'s position in the bmap record array. */ + xfarray_idx_t array_cur; + + /* How many real (non-hole, non-delalloc) mappings do we have? */ + uint64_t real_mappings; + + /* Which fork are we fixing? */ + int whichfork; + + /* Should the REFLINK flag be set when the repair is over? */ + enum reflink_scan_state reflink_scan; + + /* Do we allow unwritten extents? */ + bool allow_unwritten; +}; + +/* Is this space extent shared? Flag the inode if it is. 
*/ +STATIC int +xrep_bmap_discover_shared( + struct xrep_bmap *rb, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount) +{ + struct xfs_scrub *sc = rb->sc; + xfs_agblock_t agbno; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int error; + + agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock); + error = xfs_refcount_find_shared(sc->sa.refc_cur, agbno, blockcount, + &fbno, &flen, false); + if (error) + return error; + + if (fbno != NULLAGBLOCK) + rb->reflink_scan = RLS_SET_IFLAG; + + return 0; +} + +/* Remember this reverse-mapping as a series of bmap records. */ +STATIC int +xrep_bmap_from_rmap( + struct xrep_bmap *rb, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + bool unwritten) +{ + struct xfs_bmbt_irec irec = { + .br_startoff = startoff, + .br_startblock = startblock, + .br_state = unwritten ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM, + }; + struct xfs_bmbt_rec rbe; + struct xfs_scrub *sc = rb->sc; + int error = 0; + + /* + * If we're repairing the data fork of a non-reflinked regular file on + * a reflink filesystem, we need to figure out if this space extent is + * shared. + */ + if (rb->reflink_scan == RLS_UNKNOWN && !unwritten) { + error = xrep_bmap_discover_shared(rb, startblock, blockcount); + if (error) + return error; + } + + do { + xfs_failaddr_t fa; + + irec.br_blockcount = min_t(xfs_filblks_t, blockcount, + XFS_MAX_BMBT_EXTLEN); + + fa = xfs_bmap_validate_extent(sc->ip, rb->whichfork, &irec); + if (fa) + return -EFSCORRUPTED; + + xfs_bmbt_disk_set_all(&rbe, &irec); + + trace_xrep_bmap_found(sc->ip, rb->whichfork, &irec); + + if (xchk_should_terminate(sc, &error)) + return error; + + error = xfarray_append(rb->bmap_records, &rbe); + if (error) + return error; + + rb->real_mappings++; + + irec.br_startblock += irec.br_blockcount; + irec.br_startoff += irec.br_blockcount; + blockcount -= irec.br_blockcount; + } while (blockcount > 0); + + return 0; +} + +/* Check for any obvious errors or conflicts in the file mapping. */ +STATIC int +xrep_bmap_check_fork_rmap( + struct xrep_bmap *rb, + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec) +{ + struct xfs_scrub *sc = rb->sc; + enum xbtree_recpacking outcome; + int error; + + /* + * Data extents for rt files are never stored on the data device, but + * everything else (xattrs, bmbt blocks) can be. + */ + if (XFS_IS_REALTIME_INODE(sc->ip) && + !(rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) + return -EFSCORRUPTED; + + /* Check that this is within the AG. */ + if (!xfs_verify_agbext(cur->bc_ag.pag, rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* Check the file offset range. */ + if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && + !xfs_verify_fileext(sc->mp, rec->rm_offset, rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* No contradictory flags. */ + if ((rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)) && + (rec->rm_flags & XFS_RMAP_UNWRITTEN)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock, + rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->rm_startblock, rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Record extents that belong to this inode's fork. 
*/ +STATIC int +xrep_bmap_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_bmap *rb = priv; + struct xfs_mount *mp = cur->bc_mp; + xfs_fsblock_t fsbno; + int error = 0; + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + if (rec->rm_owner != rb->sc->ip->i_ino) + return 0; + + error = xrep_bmap_check_fork_rmap(rb, cur, rec); + if (error) + return error; + + /* + * Record all blocks allocated to this file even if the extent isn't + * for the fork we're rebuilding so that we can reset di_nblocks later. + */ + rb->nblocks += rec->rm_blockcount; + + /* If this rmap isn't for the fork we want, we're done. */ + if (rb->whichfork == XFS_DATA_FORK && + (rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + if (rb->whichfork == XFS_ATTR_FORK && + !(rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + + /* Reject unwritten extents if we don't allow those. */ + if ((rec->rm_flags & XFS_RMAP_UNWRITTEN) && !rb->allow_unwritten) + return -EFSCORRUPTED; + + fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, + rec->rm_startblock); + + if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) { + rb->old_bmbt_block_count += rec->rm_blockcount; + return xfsb_bitmap_set(&rb->old_bmbt_blocks, fsbno, + rec->rm_blockcount); + } + + return xrep_bmap_from_rmap(rb, rec->rm_offset, fsbno, + rec->rm_blockcount, + rec->rm_flags & XFS_RMAP_UNWRITTEN); +} + +/* + * Compare two block mapping records. We want to sort in order of increasing + * file offset. + */ +static int +xrep_bmap_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_bmbt_rec *ba = a; + const struct xfs_bmbt_rec *bb = b; + xfs_fileoff_t ao = xfs_bmbt_disk_get_startoff(ba); + xfs_fileoff_t bo = xfs_bmbt_disk_get_startoff(bb); + + if (ao > bo) + return 1; + else if (ao < bo) + return -1; + return 0; +} + +/* + * Sort the bmap extents by fork offset or else the records will be in the + * wrong order. Ensure there are no overlaps in the file offset ranges. + */ +STATIC int +xrep_bmap_sort_records( + struct xrep_bmap *rb) +{ + struct xfs_bmbt_irec irec; + xfs_fileoff_t next_off = 0; + xfarray_idx_t array_cur; + int error; + + error = xfarray_sort(rb->bmap_records, xrep_bmap_extent_cmp, + XFARRAY_SORT_KILLABLE); + if (error) + return error; + + foreach_xfarray_idx(rb->bmap_records, array_cur) { + struct xfs_bmbt_rec rec; + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + error = xfarray_load(rb->bmap_records, array_cur, &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, &irec); + + if (irec.br_startoff < next_off) + return -EFSCORRUPTED; + + next_off = irec.br_startoff + irec.br_blockcount; + } + + return 0; +} + +/* Scan one AG for reverse mappings that we can turn into extent maps. */ +STATIC int +xrep_bmap_scan_ag( + struct xrep_bmap *rb, + struct xfs_perag *pag) +{ + struct xfs_scrub *sc = rb->sc; + int error; + + error = xrep_ag_init(sc, pag, &sc->sa); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_bmap_walk_rmap, rb); + xchk_ag_free(sc, &sc->sa); + return error; +} + +/* Find the delalloc extents from the old incore extent tree. */ +STATIC int +xrep_bmap_find_delalloc( + struct xrep_bmap *rb) +{ + struct xfs_bmbt_irec irec; + struct xfs_iext_cursor icur; + struct xfs_bmbt_rec rbe; + struct xfs_inode *ip = rb->sc->ip; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, rb->whichfork); + int error = 0; + + /* + * Skip this scan if we don't expect to find delayed allocation + * reservations in this fork. 
+ */ + if (rb->whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0) + return 0; + + for_each_xfs_iext(ifp, &icur, &irec) { + if (!isnullstartblock(irec.br_startblock)) + continue; + + xfs_bmbt_disk_set_all(&rbe, &irec); + + trace_xrep_bmap_found(ip, rb->whichfork, &irec); + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + error = xfarray_append(rb->bmap_records, &rbe); + if (error) + return error; + } + + return 0; +} + +/* + * Collect block mappings for this fork of this inode and decide if we have + * enough space to rebuild. Caller is responsible for cleaning up the list if + * anything goes wrong. + */ +STATIC int +xrep_bmap_find_mappings( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xfs_perag *pag; + xfs_agnumber_t agno; + int error = 0; + + /* Iterate the rmaps for extents. */ + for_each_perag(sc->mp, agno, pag) { + error = xrep_bmap_scan_ag(rb, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + return xrep_bmap_find_delalloc(rb); +} + +/* Retrieve real extent mappings for bulk loading the bmap btree. */ +STATIC int +xrep_bmap_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_bmbt_rec rec; + struct xfs_bmbt_irec *irec = &cur->bc_rec.b; + struct xrep_bmap *rb = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + do { + error = xfarray_load(rb->bmap_records, rb->array_cur++, + &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, irec); + } while (isnullstartblock(irec->br_startblock)); + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_bmap_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_bmap *rb = priv; + + return xrep_newbt_claim_block(cur, &rb->new_bmapbt, ptr); +} + +/* Figure out how much space we need to create the incore btree root block. */ +STATIC size_t +xrep_bmap_iroot_size( + struct xfs_btree_cur *cur, + unsigned int level, + unsigned int nr_this_level, + void *priv) +{ + ASSERT(level > 0); + + return XFS_BMAP_BROOT_SPACE_CALC(cur->bc_mp, nr_this_level); +} + +/* Update the inode counters. */ +STATIC int +xrep_bmap_reset_counters( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake; + int64_t delta; + + if (rb->reflink_scan == RLS_SET_IFLAG) + sc->ip->i_diflags2 |= XFS_DIFLAG2_REFLINK; + + /* + * Update the inode block counts to reflect the extents we found in the + * rmapbt. + */ + delta = ifake->if_blocks - rb->old_bmbt_block_count; + sc->ip->i_nblocks = rb->nblocks + delta; + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + + /* + * Adjust the quota counts by the difference in size between the old + * and new bmbt. + */ + xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT, delta); + return 0; +} + +/* + * Create a new iext tree and load it with block mappings. If the inode is + * in extents format, that's all we need to do to commit the new mappings. + * If it is in btree format, this takes care of preloading the incore tree. 
+ */ +STATIC int +xrep_bmap_extents_load( + struct xrep_bmap *rb) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec irec; + struct xfs_ifork *ifp = rb->new_bmapbt.ifake.if_fork; + xfarray_idx_t array_cur; + int error; + + ASSERT(ifp->if_bytes == 0); + + /* Add all the mappings (incl. delalloc) to the incore extent tree. */ + xfs_iext_first(ifp, &icur); + foreach_xfarray_idx(rb->bmap_records, array_cur) { + struct xfs_bmbt_rec rec; + + error = xfarray_load(rb->bmap_records, array_cur, &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, &irec); + + xfs_iext_insert_raw(ifp, &icur, &irec); + if (!isnullstartblock(irec.br_startblock)) + ifp->if_nextents++; + + xfs_iext_next(ifp, &icur); + } + + return xrep_ino_ensure_extent_count(rb->sc, rb->whichfork, + ifp->if_nextents); +} + +/* + * Reserve new btree blocks, bulk load the bmap records into the ondisk btree, + * and load the incore extent tree. + */ +STATIC int +xrep_bmap_btree_load( + struct xrep_bmap *rb, + struct xfs_btree_cur *bmap_cur) +{ + struct xfs_scrub *sc = rb->sc; + int error; + + /* Compute how many blocks we'll need. */ + error = xfs_btree_bload_compute_geometry(bmap_cur, + &rb->new_bmapbt.bload, rb->real_mappings); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * Guess how many blocks we're going to need to rebuild an entire bmap + * from the number of extents we found, and pump up our transaction to + * have sufficient block reservation. We're allowed to exceed file + * quota to repair inconsistent metadata. + */ + error = xfs_trans_reserve_more_inode(sc->tp, sc->ip, + rb->new_bmapbt.bload.nr_blocks, 0, true); + if (error) + return error; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rb->new_bmapbt, + rb->new_bmapbt.bload.nr_blocks); + if (error) + return error; + + /* Add all observed bmap records. */ + rb->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(bmap_cur, &rb->new_bmapbt.bload, rb); + if (error) + return error; + + /* + * Load the new bmap records into the new incore extent tree to + * preserve delalloc reservations for regular files. The directory + * code loads the extent tree during xfs_dir_open and assumes + * thereafter that it remains loaded, so we must not violate that + * assumption. + */ + return xrep_bmap_extents_load(rb); +} + +/* + * Use the collected bmap information to stage a new bmap fork. If this is + * successful we'll return with the new fork information logged to the repair + * transaction but not yet committed. The caller must ensure that the inode + * is joined to the transaction; the inode will be joined to a clean + * transaction when the function returns. + */ +STATIC int +xrep_bmap_build_new_fork( + struct xrep_bmap *rb) +{ + struct xfs_owner_info oinfo; + struct xfs_scrub *sc = rb->sc; + struct xfs_btree_cur *bmap_cur; + struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake; + int error; + + error = xrep_bmap_sort_records(rb); + if (error) + return error; + + /* + * Prepare to construct the new fork by initializing the new btree + * structure and creating a fake ifork in the ifakeroot structure. 
+ */ + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork); + error = xrep_newbt_init_inode(&rb->new_bmapbt, sc, rb->whichfork, + &oinfo); + if (error) + return error; + + rb->new_bmapbt.bload.get_records = xrep_bmap_get_records; + rb->new_bmapbt.bload.claim_block = xrep_bmap_claim_block; + rb->new_bmapbt.bload.iroot_size = xrep_bmap_iroot_size; + bmap_cur = xfs_bmbt_stage_cursor(sc->mp, sc->ip, ifake); + + /* + * Figure out the size and format of the new fork, then fill it with + * all the bmap records we've found. Join the inode to the transaction + * so that we can roll the transaction while holding the inode locked. + */ + if (rb->real_mappings <= XFS_IFORK_MAXEXT(sc->ip, rb->whichfork)) { + ifake->if_fork->if_format = XFS_DINODE_FMT_EXTENTS; + error = xrep_bmap_extents_load(rb); + } else { + ifake->if_fork->if_format = XFS_DINODE_FMT_BTREE; + error = xrep_bmap_btree_load(rb, bmap_cur); + } + if (error) + goto err_cur; + + /* + * Install the new fork in the inode. After this point the old mapping + * data are no longer accessible and the new tree is live. We delete + * the cursor immediately after committing the staged root because the + * staged fork might be in extents format. + */ + xfs_bmbt_commit_staged_btree(bmap_cur, sc->tp, rb->whichfork); + xfs_btree_del_cursor(bmap_cur, 0); + + /* Reset the inode counters now that we've changed the fork. */ + error = xrep_bmap_reset_counters(rb); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rb->new_bmapbt); + if (error) + return error; + + return xrep_roll_trans(sc); + +err_cur: + if (bmap_cur) + xfs_btree_del_cursor(bmap_cur, error); +err_newbt: + xrep_newbt_cancel(&rb->new_bmapbt); + return error; +} + +/* + * Now that we've logged the new inode btree, invalidate all of the old blocks + * and free them, if there were any. + */ +STATIC int +xrep_bmap_remove_old_tree( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xfs_owner_info oinfo; + + /* Free the old bmbt blocks if they're not in use. */ + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork); + return xrep_reap_fsblocks(sc, &rb->old_bmbt_blocks, &oinfo); +} + +/* Check for garbage inputs. Returns -ECANCELED if there's nothing to do. */ +STATIC int +xrep_bmap_check_inputs( + struct xfs_scrub *sc, + int whichfork) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); + + ASSERT(whichfork == XFS_DATA_FORK || whichfork == XFS_ATTR_FORK); + + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + + /* No fork means nothing to rebuild. */ + if (!ifp) + return -ECANCELED; + + /* + * We only know how to repair extent mappings, which is to say that we + * only support extents and btree fork format. Repairs to a local + * format fork require a higher level repair function, so we do not + * have any work to do here. + */ + switch (ifp->if_format) { + case XFS_DINODE_FMT_DEV: + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_UUID: + return -ECANCELED; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + break; + default: + return -EFSCORRUPTED; + } + + if (whichfork == XFS_ATTR_FORK) + return 0; + + /* Only files, symlinks, and directories get to have data forks. */ + switch (VFS_I(sc->ip)->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + /* ok */ + break; + default: + return -EINVAL; + } + + /* Don't know how to rebuild realtime data forks. 
*/ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return -EOPNOTSUPP; + + return 0; +} + +/* Set up the initial state of the reflink scan. */ +static inline enum reflink_scan_state +xrep_bmap_init_reflink_scan( + struct xfs_scrub *sc, + int whichfork) +{ + /* cannot share on non-reflink filesystem */ + if (!xfs_has_reflink(sc->mp)) + return RLS_IRRELEVANT; + + /* preserve flag if it's already set */ + if (xfs_is_reflink_inode(sc->ip)) + return RLS_SET_IFLAG; + + /* can only share regular files */ + if (!S_ISREG(VFS_I(sc->ip)->i_mode)) + return RLS_IRRELEVANT; + + /* cannot share attr fork extents */ + if (whichfork != XFS_DATA_FORK) + return RLS_IRRELEVANT; + + /* cannot share realtime extents */ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return RLS_IRRELEVANT; + + return RLS_UNKNOWN; +} + +/* Repair an inode fork. */ +int +xrep_bmap( + struct xfs_scrub *sc, + int whichfork, + bool allow_unwritten) +{ + struct xrep_bmap *rb; + char *descr; + unsigned int max_bmbt_recs; + bool large_extcount; + int error = 0; + + error = xrep_bmap_check_inputs(sc, whichfork); + if (error == -ECANCELED) + return 0; + if (error) + return error; + + rb = kzalloc(sizeof(struct xrep_bmap), XCHK_GFP_FLAGS); + if (!rb) + return -ENOMEM; + rb->sc = sc; + rb->whichfork = whichfork; + rb->reflink_scan = xrep_bmap_init_reflink_scan(sc, whichfork); + rb->allow_unwritten = allow_unwritten; + + /* Set up enough storage to handle the max records for this fork. */ + large_extcount = xfs_has_large_extent_counts(sc->mp); + max_bmbt_recs = xfs_iext_max_nextents(large_extcount, whichfork); + descr = xchk_xfile_ino_descr(sc, "%s fork mapping records", + whichfork == XFS_DATA_FORK ? "data" : "attr"); + error = xfarray_create(descr, max_bmbt_recs, + sizeof(struct xfs_bmbt_rec), &rb->bmap_records); + kfree(descr); + if (error) + goto out_rb; + + /* Collect all reverse mappings for this fork's extents. */ + xfsb_bitmap_init(&rb->old_bmbt_blocks); + error = xrep_bmap_find_mappings(rb); + if (error) + goto out_bitmap; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Rebuild the bmap information. */ + error = xrep_bmap_build_new_fork(rb); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_bmap_remove_old_tree(rb); + if (error) + goto out_bitmap; + +out_bitmap: + xfsb_bitmap_destroy(&rb->old_bmbt_blocks); + xfarray_destroy(rb->bmap_records); +out_rb: + kfree(rb); + return error; +} + +/* Repair an inode's data fork. */ +int +xrep_bmap_data( + struct xfs_scrub *sc) +{ + return xrep_bmap(sc, XFS_DATA_FORK, true); +} + +/* Repair an inode's attr fork. */ +int +xrep_bmap_attr( + struct xfs_scrub *sc) +{ + return xrep_bmap(sc, XFS_ATTR_FORK, false); +} diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index de24532fe083..81f2b96bb5a7 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -25,6 +25,7 @@ #include "xfs_trans_priv.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" +#include "xfs_dir2_priv.h" #include "xfs_attr.h" #include "xfs_reflink.h" #include "xfs_ag.h" @@ -604,6 +605,7 @@ xchk_ag_free( struct xchk_ag *sa) { xchk_ag_btcur_free(sa); + xrep_reset_perag_resv(sc); if (sa->agf_bp) { xfs_trans_brelse(sc->tp, sa->agf_bp); sa->agf_bp = NULL; @@ -733,6 +735,8 @@ xchk_iget( xfs_ino_t inum, struct xfs_inode **ipp) { + ASSERT(sc->tp != NULL); + return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp); } @@ -816,6 +820,26 @@ again: return 0; } +#ifdef CONFIG_XFS_QUOTA +/* + * Try to attach dquots to this inode if we think we might want to repair it. + * Callers must not hold any ILOCKs. 
If the dquots are broken and cannot be + * attached, a quotacheck will be scheduled. + */ +int +xchk_ino_dqattach( + struct xfs_scrub *sc) +{ + ASSERT(sc->tp != NULL); + ASSERT(sc->ip != NULL); + + if (!xchk_could_repair(sc)) + return 0; + + return xrep_ino_dqattach(sc); +} +#endif + /* Install an inode that we opened by handle for scrubbing. */ int xchk_install_handle_inode( @@ -882,8 +906,8 @@ xchk_iget_for_scrubbing( if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) return -ENOENT; - /* Try a regular untrusted iget. */ - error = xchk_iget(sc, sc->sm->sm_ino, &ip); + /* Try a safe untrusted iget. */ + error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip); if (!error) return xchk_install_handle_inode(sc, ip); if (error == -ENOENT) @@ -1027,6 +1051,11 @@ xchk_setup_inode_contents( error = xchk_trans_alloc(sc, resblks); if (error) goto out; + + error = xchk_ino_dqattach(sc); + if (error) + goto out; + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode for us */ @@ -1132,6 +1161,7 @@ xchk_metadata_inode_subtype( unsigned int scrub_type) { __u32 smtype = sc->sm->sm_type; + unsigned int sick_mask = sc->sick_mask; int error; sc->sm->sm_type = scrub_type; @@ -1149,6 +1179,7 @@ xchk_metadata_inode_subtype( break; } + sc->sick_mask = sick_mask; sc->sm->sm_type = smtype; return error; } diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index cabdc0e16838..da09580b454a 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -103,9 +103,15 @@ xchk_setup_rtsummary(struct xfs_scrub *sc) } #endif #ifdef CONFIG_XFS_QUOTA +int xchk_ino_dqattach(struct xfs_scrub *sc); int xchk_setup_quota(struct xfs_scrub *sc); #else static inline int +xchk_ino_dqattach(struct xfs_scrub *sc) +{ + return 0; +} +static inline int xchk_setup_quota(struct xfs_scrub *sc) { return -ENOENT; @@ -151,6 +157,11 @@ void xchk_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags); void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp); +/* + * Grab the inode at @inum. The caller must have created a scrub transaction + * so that we can confirm the inumber by walking the inobt and not deadlock on + * a loop in the inobt. + */ int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp); int xchk_iget_agi(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_buf **agi_bpp, struct xfs_inode **ipp); @@ -158,6 +169,26 @@ void xchk_irele(struct xfs_scrub *sc, struct xfs_inode *ip); int xchk_install_handle_inode(struct xfs_scrub *sc, struct xfs_inode *ip); /* + * Safe version of (untrusted) xchk_iget that uses an empty transaction to + * avoid deadlocking on loops in the inobt. This should only be used in a + * scrub or repair setup routine, and only prior to grabbing a transaction. + */ +static inline int +xchk_iget_safe(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp) +{ + int error; + + ASSERT(sc->tp == NULL); + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + error = xchk_iget(sc, inum, ipp); + xchk_trans_cancel(sc); + return error; +} + +/* * Don't bother cross-referencing if we already found corruption or cross * referencing discrepancies. */ @@ -167,6 +198,8 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) XFS_SCRUB_OFLAG_XCORRUPT); } +bool xchk_dir_looks_zapped(struct xfs_inode *dp); + #ifdef CONFIG_XFS_ONLINE_REPAIR /* Decide if a repair is required. 
*/ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) @@ -175,8 +208,21 @@ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) XFS_SCRUB_OFLAG_XCORRUPT | XFS_SCRUB_OFLAG_PREEN); } + +/* + * "Should we prepare for a repair?" + * + * Return true if the caller permits us to repair metadata and we're not + * setting up for a post-repair evaluation. + */ +static inline bool xchk_could_repair(const struct xfs_scrub *sc) +{ + return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && + !(sc->flags & XREP_ALREADY_FIXED); +} #else # define xchk_needs_repair(sc) (false) +# define xchk_could_repair(sc) (false) #endif /* CONFIG_XFS_ONLINE_REPAIR */ int xchk_metadata_inode_forks(struct xfs_scrub *sc); @@ -188,6 +234,16 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc); #define xchk_xfile_descr(sc, fmt, ...) \ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): " fmt, \ (sc)->mp->m_super->s_id, ##__VA_ARGS__) +#define xchk_xfile_ag_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): AG 0x%x " fmt, \ + (sc)->mp->m_super->s_id, \ + (sc)->sa.pag ? (sc)->sa.pag->pag_agno : (sc)->sm->sm_agno, \ + ##__VA_ARGS__) +#define xchk_xfile_ino_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): inode 0x%llx " fmt, \ + (sc)->mp->m_super->s_id, \ + (sc)->ip ? (sc)->ip->i_ino : (sc)->sm->sm_ino, \ + ##__VA_ARGS__) /* * Setting up a hook to wait for intents to drain is costly -- we have to take diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c new file mode 100644 index 000000000000..1e82c727af8e --- /dev/null +++ b/fs/xfs/scrub/cow_repair.c @@ -0,0 +1,614 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_rmap.h" +#include "xfs_refcount.h" +#include "xfs_quota.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "xfs_error.h" +#include "xfs_errortag.h" +#include "xfs_icache.h" +#include "xfs_refcount_btree.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/off_bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/reap.h" + +/* + * CoW Fork Mapping Repair + * ======================= + * + * Although CoW staging extents are owned by incore CoW inode forks, on disk + * they are owned by the refcount btree. The ondisk metadata does not record + * any ownership information, which limits what we can do to repair the + * mappings in the CoW fork. At most, we can replace ifork mappings that lack + * an entry in the refcount btree or are described by a reverse mapping record + * whose owner is not OWN_COW. + * + * Replacing extents is also tricky -- we can't touch written CoW fork extents + * since they are undergoing writeback, and delalloc extents do not require + * repair since they only exist incore. Hence the most we can do is find the + * bad parts of unwritten mappings, allocate a replacement set of blocks, and + * replace the incore mapping. We use the regular reaping process to unmap + * or free the discarded blocks, as appropriate. 
+ */ +struct xrep_cow { + struct xfs_scrub *sc; + + /* Bitmap of file offset ranges that need replacing. */ + struct xoff_bitmap bad_fileoffs; + + /* Bitmap of fsblocks that were removed from the CoW fork. */ + struct xfsb_bitmap old_cowfork_fsblocks; + + /* CoW fork mappings used to scan for bad CoW staging extents. */ + struct xfs_bmbt_irec irec; + + /* refcount btree block number of irec.br_startblock */ + unsigned int irec_startbno; + + /* refcount btree block number of the next refcount record we expect */ + unsigned int next_bno; +}; + +/* CoW staging extent. */ +struct xrep_cow_extent { + xfs_fsblock_t fsbno; + xfs_extlen_t len; +}; + +/* + * Mark the part of the file range that corresponds to the given physical + * space. Caller must ensure that the physical range is within xc->irec. + */ +STATIC int +xrep_cow_mark_file_range( + struct xrep_cow *xc, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount) +{ + xfs_fileoff_t startoff; + + startoff = xc->irec.br_startoff + + (startblock - xc->irec.br_startblock); + + trace_xrep_cow_mark_file_range(xc->sc->ip, startblock, startoff, + blockcount); + + return xoff_bitmap_set(&xc->bad_fileoffs, startoff, blockcount); +} + +/* + * Trim @src to fit within the CoW fork mapping being examined, and put the + * result in @dst. + */ +static inline void +xrep_cow_trim_refcount( + struct xrep_cow *xc, + struct xfs_refcount_irec *dst, + const struct xfs_refcount_irec *src) +{ + unsigned int adj; + + memcpy(dst, src, sizeof(*dst)); + + if (dst->rc_startblock < xc->irec_startbno) { + adj = xc->irec_startbno - dst->rc_startblock; + dst->rc_blockcount -= adj; + dst->rc_startblock += adj; + } + + if (dst->rc_startblock + dst->rc_blockcount > + xc->irec_startbno + xc->irec.br_blockcount) { + adj = (dst->rc_startblock + dst->rc_blockcount) - + (xc->irec_startbno + xc->irec.br_blockcount); + dst->rc_blockcount -= adj; + } +} + +/* Mark any shared CoW staging extents. */ +STATIC int +xrep_cow_mark_shared_staging( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv) +{ + struct xrep_cow *xc = priv; + struct xfs_refcount_irec rrec; + xfs_fsblock_t fsbno; + + if (!xfs_refcount_check_domain(rec) || + rec->rc_domain != XFS_REFC_DOMAIN_SHARED) + return -EFSCORRUPTED; + + xrep_cow_trim_refcount(xc, &rrec, rec); + + fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, + rrec.rc_startblock); + return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount); +} + +/* + * Mark any portion of the CoW fork file offset range where there is not a CoW + * staging extent record in the refcountbt, and keep a record of where we did + * find correct refcountbt records. Staging records are always cleaned out at + * mount time, so any two inodes trying to map the same staging area would have + * already taken the fs down due to refcount btree verifier errors. Hence this + * inode should be the sole creator of the staging extent records ondisk. 
+ */ +STATIC int +xrep_cow_mark_missing_staging( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv) +{ + struct xrep_cow *xc = priv; + struct xfs_refcount_irec rrec; + int error; + + if (!xfs_refcount_check_domain(rec) || + rec->rc_domain != XFS_REFC_DOMAIN_COW) + return -EFSCORRUPTED; + + xrep_cow_trim_refcount(xc, &rrec, rec); + + if (xc->next_bno >= rrec.rc_startblock) + goto next; + + error = xrep_cow_mark_file_range(xc, + XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, + xc->next_bno), + rrec.rc_startblock - xc->next_bno); + if (error) + return error; + +next: + xc->next_bno = rrec.rc_startblock + rrec.rc_blockcount; + return 0; +} + +/* + * Mark any area that does not correspond to a CoW staging rmap. These are + * cross-linked areas that must be avoided. + */ +STATIC int +xrep_cow_mark_missing_staging_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_cow *xc = priv; + xfs_fsblock_t fsbno; + xfs_agblock_t rec_bno; + xfs_extlen_t rec_len; + unsigned int adj; + + if (rec->rm_owner == XFS_RMAP_OWN_COW) + return 0; + + rec_bno = rec->rm_startblock; + rec_len = rec->rm_blockcount; + if (rec_bno < xc->irec_startbno) { + adj = xc->irec_startbno - rec_bno; + rec_len -= adj; + rec_bno += adj; + } + + if (rec_bno + rec_len > xc->irec_startbno + xc->irec.br_blockcount) { + adj = (rec_bno + rec_len) - + (xc->irec_startbno + xc->irec.br_blockcount); + rec_len -= adj; + } + + fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno); + return xrep_cow_mark_file_range(xc, fsbno, rec_len); +} + +/* + * Find any part of the CoW fork mapping that isn't a single-owner CoW staging + * extent and mark the corresponding part of the file range in the bitmap. + */ +STATIC int +xrep_cow_find_bad( + struct xrep_cow *xc) +{ + struct xfs_refcount_irec rc_low = { 0 }; + struct xfs_refcount_irec rc_high = { 0 }; + struct xfs_rmap_irec rm_low = { 0 }; + struct xfs_rmap_irec rm_high = { 0 }; + struct xfs_perag *pag; + struct xfs_scrub *sc = xc->sc; + xfs_agnumber_t agno; + int error; + + agno = XFS_FSB_TO_AGNO(sc->mp, xc->irec.br_startblock); + xc->irec_startbno = XFS_FSB_TO_AGBNO(sc->mp, xc->irec.br_startblock); + + pag = xfs_perag_get(sc->mp, agno); + if (!pag) + return -EFSCORRUPTED; + + error = xrep_ag_init(sc, pag, &sc->sa); + if (error) + goto out_pag; + + /* Mark any CoW fork extents that are shared. */ + rc_low.rc_startblock = xc->irec_startbno; + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED; + error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high, + xrep_cow_mark_shared_staging, xc); + if (error) + goto out_sa; + + /* Make sure there are CoW staging extents for the whole mapping. */ + rc_low.rc_startblock = xc->irec_startbno; + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW; + xc->next_bno = xc->irec_startbno; + error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high, + xrep_cow_mark_missing_staging, xc); + if (error) + goto out_sa; + + if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) { + error = xrep_cow_mark_file_range(xc, + XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, + xc->next_bno), + xc->irec_startbno + xc->irec.br_blockcount - + xc->next_bno); + if (error) + goto out_sa; + } + + /* Mark any area that has an rmap that isn't a COW staging extent.
*/ + rm_low.rm_startblock = xc->irec_startbno; + memset(&rm_high, 0xFF, sizeof(rm_high)); + rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + error = xfs_rmap_query_range(sc->sa.rmap_cur, &rm_low, &rm_high, + xrep_cow_mark_missing_staging_rmap, xc); + if (error) + goto out_sa; + + /* + * If userspace is forcing us to rebuild the CoW fork or someone turned + * on the debugging knob, replace everything in the CoW fork. + */ + if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || + XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, + xc->irec.br_blockcount); + if (error) + goto out_sa; + } + +out_sa: + xchk_ag_free(sc, &sc->sa); +out_pag: + xfs_perag_put(pag); + return error; +} + +/* + * Allocate a replacement CoW staging extent of up to the given number of + * blocks, and fill out the mapping. + */ +STATIC int +xrep_cow_alloc( + struct xfs_scrub *sc, + xfs_extlen_t maxlen, + struct xrep_cow_extent *repl) +{ + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = sc->mp, + .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE, + .minlen = 1, + .maxlen = maxlen, + .prod = 1, + .resv = XFS_AG_RESV_NONE, + .datatype = XFS_ALLOC_USERDATA, + }; + int error; + + error = xfs_trans_reserve_more(sc->tp, maxlen, 0); + if (error) + return error; + + error = xfs_alloc_vextent_start_ag(&args, + XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino)); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len); + + repl->fsbno = args.fsbno; + repl->len = args.len; + return 0; +} + +/* + * Look up the current CoW fork mapping so that we only allocate enough to + * replace a single mapping. If we don't find a mapping that covers the start + * of the file range, or we find a delalloc or written extent, something is + * seriously wrong, since we didn't drop the ILOCK. + */ +static inline int +xrep_cow_find_mapping( + struct xrep_cow *xc, + struct xfs_iext_cursor *icur, + xfs_fileoff_t startoff, + struct xfs_bmbt_irec *got) +{ + struct xfs_inode *ip = xc->sc->ip; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); + + if (!xfs_iext_lookup_extent(ip, ifp, startoff, icur, got)) + goto bad; + + if (got->br_startoff > startoff) + goto bad; + + if (got->br_blockcount == 0) + goto bad; + + if (isnullstartblock(got->br_startblock)) + goto bad; + + if (xfs_bmap_is_written_extent(got)) + goto bad; + + return 0; +bad: + ASSERT(0); + return -EFSCORRUPTED; +} + +#define REPLACE_LEFT_SIDE (1U << 0) +#define REPLACE_RIGHT_SIDE (1U << 1) + +/* + * Given a CoW fork mapping @got and a replacement mapping @repl, remap the + * beginning of @got with the space described by @repl. + */ +static inline void +xrep_cow_replace_mapping( + struct xfs_inode *ip, + struct xfs_iext_cursor *icur, + const struct xfs_bmbt_irec *got, + const struct xrep_cow_extent *repl) +{ + struct xfs_bmbt_irec new = *got; /* struct copy */ + + ASSERT(repl->len > 0); + ASSERT(!isnullstartblock(got->br_startblock)); + + trace_xrep_cow_replace_mapping(ip, got, repl->fsbno, repl->len); + + if (got->br_blockcount == repl->len) { + /* + * The new extent is a complete replacement for the existing + * extent. Update the COW fork record. + */ + new.br_startblock = repl->fsbno; + xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new); + return; + } + + /* + * The new extent can replace the beginning of the COW fork record. + * Move the left side of @got upwards, then insert the new record.
+ */ + new.br_startoff += repl->len; + new.br_startblock += repl->len; + new.br_blockcount -= repl->len; + xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new); + + new.br_startoff = got->br_startoff; + new.br_startblock = repl->fsbno; + new.br_blockcount = repl->len; + xfs_iext_insert(ip, icur, &new, BMAP_COWFORK); +} + +/* + * Replace the unwritten CoW staging extent backing the given file range with a + * new space extent that isn't as problematic. + */ +STATIC int +xrep_cow_replace_range( + struct xrep_cow *xc, + xfs_fileoff_t startoff, + xfs_extlen_t *blockcount) +{ + struct xfs_iext_cursor icur; + struct xrep_cow_extent repl; + struct xfs_bmbt_irec got; + struct xfs_scrub *sc = xc->sc; + xfs_fileoff_t nextoff; + xfs_extlen_t alloc_len; + int error; + + /* + * Put the existing CoW fork mapping in @got. If @got ends before + * @rep, truncate @rep so we only replace one extent mapping at a time. + */ + error = xrep_cow_find_mapping(xc, &icur, startoff, &got); + if (error) + return error; + nextoff = min(startoff + *blockcount, + got.br_startoff + got.br_blockcount); + + /* + * Allocate a replacement extent. If we don't fill all the blocks, + * shorten the quantity that will be deleted in this step. + */ + alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN, + nextoff - startoff); + error = xrep_cow_alloc(sc, alloc_len, &repl); + if (error) + return error; + + /* + * Replace the old mapping with the new one, and commit the metadata + * changes made so far. + */ + xrep_cow_replace_mapping(sc->ip, &icur, &got, &repl); + + xfs_inode_set_cowblocks_tag(sc->ip); + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + + /* Note the old CoW staging extents; we'll reap them all later. */ + error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock, + repl.len); + if (error) + return error; + + *blockcount = repl.len; + return 0; +} + +/* + * Replace a bad part of an unwritten CoW staging extent with a fresh delalloc + * reservation. + */ +STATIC int +xrep_cow_replace( + uint64_t startoff, + uint64_t blockcount, + void *priv) +{ + struct xrep_cow *xc = priv; + int error = 0; + + while (blockcount > 0) { + xfs_extlen_t len = min_t(xfs_filblks_t, blockcount, + XFS_MAX_BMBT_EXTLEN); + + error = xrep_cow_replace_range(xc, startoff, &len); + if (error) + break; + + blockcount -= len; + startoff += len; + } + + return error; +} + +/* + * Repair an inode's CoW fork. The CoW fork is an in-core structure, so + * there's no btree to rebuild. Instead, we replace any mappings that are + * cross-linked or lack ondisk CoW fork records in the refcount btree. + */ +int +xrep_bmap_cow( + struct xfs_scrub *sc) +{ + struct xrep_cow *xc; + struct xfs_iext_cursor icur; + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_COW_FORK); + int error; + + if (!xfs_has_rmapbt(sc->mp) || !xfs_has_reflink(sc->mp)) + return -EOPNOTSUPP; + + if (!ifp) + return 0; + + /* realtime files aren't supported yet */ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return -EOPNOTSUPP; + + /* + * If we're somehow not in extents format, then reinitialize it to + * an empty extent mapping fork and exit.
+ */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) { + ifp->if_format = XFS_DINODE_FMT_EXTENTS; + ifp->if_nextents = 0; + return 0; + } + + xc = kzalloc(sizeof(struct xrep_cow), XCHK_GFP_FLAGS); + if (!xc) + return -ENOMEM; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + xc->sc = sc; + xoff_bitmap_init(&xc->bad_fileoffs); + xfsb_bitmap_init(&xc->old_cowfork_fsblocks); + + for_each_xfs_iext(ifp, &icur, &xc->irec) { + if (xchk_should_terminate(sc, &error)) + goto out_bitmap; + + /* + * delalloc reservations only exist incore, so there is no + * ondisk metadata that we can examine. Hence we leave them + * alone. + */ + if (isnullstartblock(xc->irec.br_startblock)) + continue; + + /* + * COW fork extents are only in the written state if writeback + * is actively writing to disk. We cannot restart the write + * at a different disk address since we've already issued the + * IO, so we leave these alone and hope for the best. + */ + if (xfs_bmap_is_written_extent(&xc->irec)) + continue; + + error = xrep_cow_find_bad(xc); + if (error) + goto out_bitmap; + } + + /* Replace any bad unwritten mappings with fresh reservations. */ + error = xoff_bitmap_walk(&xc->bad_fileoffs, xrep_cow_replace, xc); + if (error) + goto out_bitmap; + + /* + * Reap as many of the old CoW blocks as we can. They are owned ondisk + * by the refcount btree, not the inode, so it is correct to treat them + * like inode metadata. + */ + error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks, + &XFS_RMAP_OINFO_COW); + if (error) + goto out_bitmap; + +out_bitmap: + xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); + xoff_bitmap_destroy(&xc->bad_fileoffs); + kmem_free(xc); + return error; +} diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 0b491784b759..d86ab51af928 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -15,10 +15,12 @@ #include "xfs_icache.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" +#include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/dabtree.h" #include "scrub/readdir.h" +#include "scrub/health.h" /* Set us up to scrub directories. */ int @@ -760,6 +762,11 @@ xchk_directory( if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) return -ENOENT; + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_DIR_ZAPPED)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + /* Plausible size? */ if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); @@ -784,7 +791,36 @@ xchk_directory( /* Look up every name in this directory by hash. */ error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL); - if (error == -ECANCELED) - error = 0; - return error; + if (error && error != -ECANCELED) + return error; + + /* If the dir is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_DIR_ZAPPED); + return 0; +} + +/* + * Decide if this directory has been zapped to satisfy the inode and ifork + * verifiers. Checking and repairing should be postponed until the directory + * is fixed. + */ +bool +xchk_dir_looks_zapped( + struct xfs_inode *dp) +{ + /* Repair zapped this dir's data fork a short time ago */ + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return true; + + /* + * If the dinode repair found a bad data fork, it will reset the fork + * to extents format with zero records and wait for the bmapbtd + * scrubber to reconstruct the block mappings. Directories always + * contain some content, so this is a clear sign of a zapped directory. 
+ * The state checked by xfs_ifork_zapped is not persisted, so this is + * the secondary strategy if repairs are interrupted by a crash or an + * unmount. + */ + return dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS && + dp->i_df.if_nextents == 0; } diff --git a/fs/xfs/scrub/dqiterate.c b/fs/xfs/scrub/dqiterate.c new file mode 100644 index 000000000000..20c4daedd48d --- /dev/null +++ b/fs/xfs/scrub/dqiterate.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_bmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/quota.h" +#include "scrub/trace.h" + +/* Initialize a dquot iteration cursor. */ +void +xchk_dqiter_init( + struct xchk_dqiter *cursor, + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + cursor->sc = sc; + cursor->bmap.br_startoff = NULLFILEOFF; + cursor->dqtype = dqtype & XFS_DQTYPE_REC_MASK; + cursor->quota_ip = xfs_quota_inode(sc->mp, cursor->dqtype); + cursor->id = 0; +} + +/* + * Ensure that the cached data fork mapping for the dqiter cursor is fresh and + * covers the dquot pointed to by the scan cursor. + */ +STATIC int +xchk_dquot_iter_revalidate_bmap( + struct xchk_dqiter *cursor) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip, + XFS_DATA_FORK); + xfs_fileoff_t fileoff; + xfs_dqid_t this_id = cursor->id; + int nmaps = 1; + int error; + + fileoff = this_id / qi->qi_dqperchunk; + + /* + * If we have a mapping for cursor->id and it's still fresh, there's + * no need to reread the bmbt. + */ + if (cursor->bmap.br_startoff != NULLFILEOFF && + cursor->if_seq == ifp->if_seq && + cursor->bmap.br_startoff + cursor->bmap.br_blockcount > fileoff) + return 0; + + /* Look up the data fork mapping for the dquot id of interest. */ + error = xfs_bmapi_read(cursor->quota_ip, fileoff, + XFS_MAX_FILEOFF - fileoff, &cursor->bmap, &nmaps, 0); + if (error) + return error; + if (!nmaps) { + ASSERT(nmaps > 0); + return -EFSCORRUPTED; + } + if (cursor->bmap.br_startoff > fileoff) { + ASSERT(cursor->bmap.br_startoff == fileoff); + return -EFSCORRUPTED; + } + + cursor->if_seq = ifp->if_seq; + trace_xchk_dquot_iter_revalidate_bmap(cursor, cursor->id); + return 0; +} + +/* Advance the dqiter cursor to the next non-sparse region of the quota file. */ +STATIC int +xchk_dquot_iter_advance_bmap( + struct xchk_dqiter *cursor, + uint64_t *next_ondisk_id) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip, + XFS_DATA_FORK); + xfs_fileoff_t fileoff; + uint64_t next_id; + int nmaps = 1; + int error; + + /* Find the dquot id for the next non-hole mapping. */ + do { + fileoff = cursor->bmap.br_startoff + cursor->bmap.br_blockcount; + if (fileoff > XFS_DQ_ID_MAX / qi->qi_dqperchunk) { + /* The hole goes beyond the max dquot id, we're done */ + *next_ondisk_id = -1ULL; + return 0; + } + + error = xfs_bmapi_read(cursor->quota_ip, fileoff, + XFS_MAX_FILEOFF - fileoff, &cursor->bmap, + &nmaps, 0); + if (error) + return error; + if (!nmaps) { + /* Must have reached the end of the mappings. 
*/ + *next_ondisk_id = -1ULL; + return 0; + } + if (cursor->bmap.br_startoff > fileoff) { + ASSERT(cursor->bmap.br_startoff == fileoff); + return -EFSCORRUPTED; + } + } while (!xfs_bmap_is_real_extent(&cursor->bmap)); + + next_id = cursor->bmap.br_startoff * qi->qi_dqperchunk; + if (next_id > XFS_DQ_ID_MAX) { + /* The hole goes beyond the max dquot id, we're done */ + *next_ondisk_id = -1ULL; + return 0; + } + + /* Propose jumping forward to the dquot in the next allocated block. */ + *next_ondisk_id = next_id; + cursor->if_seq = ifp->if_seq; + trace_xchk_dquot_iter_advance_bmap(cursor, *next_ondisk_id); + return 0; +} + +/* + * Find the id of the next highest incore dquot. Normally this will correspond + * exactly with the quota file block mappings, but repair might have erased a + * mapping because it was crosslinked; in that case, we need to re-allocate the + * space so that we can reset q_blkno. + */ +STATIC void +xchk_dquot_iter_advance_incore( + struct xchk_dqiter *cursor, + uint64_t *next_incore_id) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, cursor->dqtype); + struct xfs_dquot *dq; + unsigned int nr_found; + + *next_incore_id = -1ULL; + + mutex_lock(&qi->qi_tree_lock); + nr_found = radix_tree_gang_lookup(tree, (void **)&dq, cursor->id, 1); + if (nr_found) + *next_incore_id = dq->q_id; + mutex_unlock(&qi->qi_tree_lock); + + trace_xchk_dquot_iter_advance_incore(cursor, *next_incore_id); +} + +/* + * Walk all incore dquots of this filesystem. Caller must set *@cursorp to + * zero before the first call, and must not hold the quota file ILOCK. + * Returns 1 and a valid *@dqpp; 0 and *@dqpp == NULL when there are no more + * dquots to iterate; or a negative errno. + */ +int +xchk_dquot_iter( + struct xchk_dqiter *cursor, + struct xfs_dquot **dqpp) +{ + struct xfs_mount *mp = cursor->sc->mp; + struct xfs_dquot *dq = NULL; + uint64_t next_ondisk, next_incore = -1ULL; + unsigned int lock_mode; + int error = 0; + + if (cursor->id > XFS_DQ_ID_MAX) + return 0; + next_ondisk = cursor->id; + + /* Revalidate and/or advance the cursor. */ + lock_mode = xfs_ilock_data_map_shared(cursor->quota_ip); + error = xchk_dquot_iter_revalidate_bmap(cursor); + if (!error && !xfs_bmap_is_real_extent(&cursor->bmap)) + error = xchk_dquot_iter_advance_bmap(cursor, &next_ondisk); + xfs_iunlock(cursor->quota_ip, lock_mode); + if (error) + return error; + + if (next_ondisk > cursor->id) + xchk_dquot_iter_advance_incore(cursor, &next_incore); + + /* Pick the next dquot in the sequence and return it. */ + cursor->id = min(next_ondisk, next_incore); + if (cursor->id > XFS_DQ_ID_MAX) + return 0; + + trace_xchk_dquot_iter(cursor, cursor->id); + + error = xfs_qm_dqget(mp, cursor->id, cursor->dqtype, false, &dq); + if (error) + return error; + + cursor->id = dq->q_id + 1; + *dqpp = dq; + return 1; +} diff --git a/fs/xfs/scrub/fsb_bitmap.h b/fs/xfs/scrub/fsb_bitmap.h new file mode 100644 index 000000000000..40b462c1dd0d --- /dev/null +++ b/fs/xfs/scrub/fsb_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_FSB_BITMAP_H__ +#define __XFS_SCRUB_FSB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_fsblock_t */ + +struct xfsb_bitmap { + struct xbitmap64 fsbitmap; +}; + +static inline void xfsb_bitmap_init(struct xfsb_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->fsbitmap); +} + +static inline void xfsb_bitmap_destroy(struct xfsb_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->fsbitmap); +} + +static inline int xfsb_bitmap_set(struct xfsb_bitmap *bitmap, + xfs_fsblock_t start, xfs_filblks_t len) +{ + return xbitmap64_set(&bitmap->fsbitmap, start, len); +} + +static inline int xfsb_bitmap_walk(struct xfsb_bitmap *bitmap, + xbitmap64_walk_fn fn, void *priv) +{ + return xbitmap64_walk(&bitmap->fsbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_FSB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 5e2b09ed6e29..531006910ca9 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -10,8 +10,6 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_btree.h" -#include "xfs_trans_resv.h" -#include "xfs_mount.h" #include "xfs_ag.h" #include "xfs_health.h" #include "scrub/scrub.h" @@ -118,6 +116,38 @@ xchk_health_mask_for_scrub_type( } /* + * If the scrub state is clean, add @mask to the scrub sick mask to clear + * additional sick flags from the metadata object's sick state. + */ +void +xchk_mark_healthy_if_clean( + struct xfs_scrub *sc, + unsigned int mask) +{ + if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT))) + sc->sick_mask |= mask; +} + +/* + * If we're scrubbing a piece of file metadata for the first time, does it look + * like it has been zapped? Skip the check if we just repaired the metadata + * and are revalidating it. + */ +bool +xchk_file_looks_zapped( + struct xfs_scrub *sc, + unsigned int mask) +{ + ASSERT((mask & ~XFS_SICK_INO_ZAPPED) == 0); + + if (sc->flags & XREP_ALREADY_FIXED) + return false; + + return xfs_inode_has_sickness(sc->ip, mask); +} + +/* * Update filesystem health assessments based on what we found and did. * * If the scrubber finds errors, we mark sick whatever's mentioned in diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h index 66a273f8585b..a731b2467399 100644 --- a/fs/xfs/scrub/health.h +++ b/fs/xfs/scrub/health.h @@ -10,5 +10,7 @@ unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type); void xchk_update_health(struct xfs_scrub *sc); bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag, xfs_btnum_t btnum); +void xchk_mark_healthy_if_clean(struct xfs_scrub *sc, unsigned int mask); +bool xchk_file_looks_zapped(struct xfs_scrub *sc, unsigned int mask); #endif /* __XFS_SCRUB_HEALTH_H__ */ diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index fb7bbf47ae5d..a720fc62262a 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -585,7 +585,7 @@ xchk_iallocbt_rec( uint16_t holemask; xfs_inobt_btrec_to_irec(mp, rec, &irec); - if (xfs_inobt_check_irec(bs->cur, &irec) != NULL) { + if (xfs_inobt_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -708,11 +708,10 @@ xchk_iallocbt_xref_rmap_inodes( xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); } -/* Scrub the inode btrees for some AG. */ -STATIC int +/* Scrub one of the inode btrees for some AG. 
*/ +int xchk_iallocbt( - struct xfs_scrub *sc, - xfs_btnum_t which) + struct xfs_scrub *sc) { struct xfs_btree_cur *cur; struct xchk_iallocbt iabt = { @@ -720,9 +719,23 @@ xchk_iallocbt( .next_startino = NULLAGINO, .next_cluster_ino = NULLAGINO, }; + xfs_btnum_t which; int error; - cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur; + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_INOBT: + cur = sc->sa.ino_cur; + which = XFS_BTNUM_INO; + break; + case XFS_SCRUB_TYPE_FINOBT: + cur = sc->sa.fino_cur; + which = XFS_BTNUM_FINO; + break; + default: + ASSERT(0); + return -EIO; + } + error = xchk_btree(sc, cur, xchk_iallocbt_rec, &XFS_RMAP_OINFO_INOBT, &iabt); if (error) @@ -743,20 +756,6 @@ xchk_iallocbt( return error; } -int -xchk_inobt( - struct xfs_scrub *sc) -{ - return xchk_iallocbt(sc, XFS_BTNUM_INO); -} - -int -xchk_finobt( - struct xfs_scrub *sc) -{ - return xchk_iallocbt(sc, XFS_BTNUM_FINO); -} - /* See if an inode btree has (or doesn't have) an inode chunk record. */ static inline void xchk_xref_inode_check( diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c new file mode 100644 index 000000000000..b3f7182dd2f5 --- /dev/null +++ b/fs/xfs/scrub/ialloc_repair.c @@ -0,0 +1,884 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_log.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_health.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Inode Btree Repair + * ================== + * + * A quick refresher of inode btrees on a v5 filesystem: + * + * - Inode records are read into memory in units of 'inode clusters'. However + * many inodes fit in a cluster buffer is the smallest number of inodes that + * can be allocated or freed. Clusters are never smaller than one fs block + * though they can span multiple blocks. The size (in fs blocks) is + * computed with xfs_icluster_size_fsb(). The fs block alignment of a + * cluster is computed with xfs_ialloc_cluster_alignment(). + * + * - Each inode btree record can describe a single 'inode chunk'. The chunk + * size is defined to be 64 inodes. If sparse inodes are enabled, every + * inobt record must be aligned to the chunk size; if not, every record must + * be aligned to the start of a cluster. It is possible to construct an XFS + * geometry where one inobt record maps to multiple inode clusters; it is + * also possible to construct a geometry where multiple inobt records map to + * different parts of one inode cluster. + * + * - If sparse inodes are not enabled, the smallest unit of allocation for + * inode records is enough to contain one inode chunk's worth of inodes. 
+ * + * - If sparse inodes are enabled, the holemask field will be active. Each + * bit of the holemask represents 4 potential inodes; if set, the + * corresponding space does *not* contain inodes and must be left alone. + * Clusters cannot be smaller than 4 inodes. The smallest unit of allocation + * of inode records is one inode cluster. + * + * So what's the rebuild algorithm? + * + * Iterate the reverse mapping records looking for OWN_INODES and OWN_INOBT + * records. The OWN_INOBT records are the old inode btree blocks and will be + * cleared out after we've rebuilt the tree. Each possible inode cluster + * within an OWN_INODES record will be read in; for each possible inobt record + * associated with that cluster, compute the freemask calculated from the + * i_mode data in the inode chunk. For sparse inodes the holemask will be + * calculated by creating the properly aligned inobt record and punching out + * any chunk that's missing. Inode allocations and frees grab the AGI first, + * so repair protects itself from concurrent access by locking the AGI. + * + * Once we've reconstructed all the inode records, we can create new inode + * btree roots and reload the btrees. We rebuild both inode trees at the same + * time because they have the same rmap owner and it would be more complex to + * figure out if the other tree isn't in need of a rebuild and which OWN_INOBT + * blocks it owns. We have all the data we need to build both, so dump + * everything and start over. + * + * We use the prefix 'xrep_ibt' because we rebuild both inode btrees at once. + */ + +struct xrep_ibt { + /* Record under construction. */ + struct xfs_inobt_rec_incore rie; + + /* new inobt information */ + struct xrep_newbt new_inobt; + + /* new finobt information */ + struct xrep_newbt new_finobt; + + /* Old inode btree blocks we found in the rmap. */ + struct xagb_bitmap old_iallocbt_blocks; + + /* Reconstructed inode records. */ + struct xfarray *inode_records; + + struct xfs_scrub *sc; + + /* Number of inodes assigned disk space. */ + unsigned int icount; + + /* Number of inodes in use. */ + unsigned int iused; + + /* Number of finobt records needed. */ + unsigned int finobt_recs; + + /* get_records()'s position in the inode record array. */ + xfarray_idx_t array_cur; +}; + +/* + * Is this inode in use? If the inode is in memory we can tell from i_mode, + * otherwise we have to check di_mode in the on-disk buffer. We only care + * that the high (i.e. non-permission) bits of _mode are zero. This should be + * safe because repair keeps all AG headers locked until the end, and process + * trying to perform an inode allocation/free must lock the AGI. + * + * @cluster_ag_base is the inode offset of the cluster within the AG. + * @cluster_bp is the cluster buffer. + * @cluster_index is the inode offset within the inode cluster. 
+ */ +STATIC int +xrep_ibt_check_ifree( + struct xrep_ibt *ri, + xfs_agino_t cluster_ag_base, + struct xfs_buf *cluster_bp, + unsigned int cluster_index, + bool *inuse) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_dinode *dip; + xfs_ino_t fsino; + xfs_agino_t agino; + xfs_agnumber_t agno = ri->sc->sa.pag->pag_agno; + unsigned int cluster_buf_base; + unsigned int offset; + int error; + + agino = cluster_ag_base + cluster_index; + fsino = XFS_AGINO_TO_INO(mp, agno, agino); + + /* Inode uncached or half assembled, read disk buffer */ + cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base); + offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize; + if (offset >= BBTOB(cluster_bp->b_length)) + return -EFSCORRUPTED; + dip = xfs_buf_offset(cluster_bp, offset); + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) + return -EFSCORRUPTED; + + if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino) + return -EFSCORRUPTED; + + /* Will the in-core inode tell us if it's in use? */ + error = xchk_inode_is_allocated(sc, agino, inuse); + if (!error) + return 0; + + *inuse = dip->di_mode != 0; + return 0; +} + +/* Stash the accumulated inobt record for rebuilding. */ +STATIC int +xrep_ibt_stash( + struct xrep_ibt *ri) +{ + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + ri->rie.ir_freecount = xfs_inobt_rec_freecount(&ri->rie); + if (xfs_inobt_check_irec(ri->sc->sa.pag, &ri->rie) != NULL) + return -EFSCORRUPTED; + + if (ri->rie.ir_freecount > 0) + ri->finobt_recs++; + + trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie); + + error = xfarray_append(ri->inode_records, &ri->rie); + if (error) + return error; + + ri->rie.ir_startino = NULLAGINO; + return 0; +} + +/* + * Given an extent of inodes and an inode cluster buffer, calculate the + * location of the corresponding inobt record (creating it if necessary), + * then update the parts of the holemask and freemask of that record that + * correspond to the inode extent we were given. + * + * @cluster_ir_startino is the AG inode number of an inobt record that we're + * proposing to create for this inode cluster. If sparse inodes are enabled, + * we must round down to a chunk boundary to find the actual sparse record. + * @cluster_bp is the buffer of the inode cluster. + * @nr_inodes is the number of inodes to check from the cluster. + */ +STATIC int +xrep_ibt_cluster_record( + struct xrep_ibt *ri, + xfs_agino_t cluster_ir_startino, + struct xfs_buf *cluster_bp, + unsigned int nr_inodes) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + xfs_agino_t ir_startino; + unsigned int cluster_base; + unsigned int cluster_index; + int error = 0; + + ir_startino = cluster_ir_startino; + if (xfs_has_sparseinodes(mp)) + ir_startino = rounddown(ir_startino, XFS_INODES_PER_CHUNK); + cluster_base = cluster_ir_startino - ir_startino; + + /* + * If the accumulated inobt record doesn't map this cluster, add it to + * the list and reset it. + */ + if (ri->rie.ir_startino != NULLAGINO && + ri->rie.ir_startino + XFS_INODES_PER_CHUNK <= ir_startino) { + error = xrep_ibt_stash(ri); + if (error) + return error; + } + + if (ri->rie.ir_startino == NULLAGINO) { + ri->rie.ir_startino = ir_startino; + ri->rie.ir_free = XFS_INOBT_ALL_FREE; + ri->rie.ir_holemask = 0xFFFF; + ri->rie.ir_count = 0; + } + + /* Record the whole cluster. 
*/ + ri->icount += nr_inodes; + ri->rie.ir_count += nr_inodes; + ri->rie.ir_holemask &= ~xfs_inobt_maskn( + cluster_base / XFS_INODES_PER_HOLEMASK_BIT, + nr_inodes / XFS_INODES_PER_HOLEMASK_BIT); + + /* Which inodes within this cluster are free? */ + for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) { + bool inuse = false; + + error = xrep_ibt_check_ifree(ri, cluster_ir_startino, + cluster_bp, cluster_index, &inuse); + if (error) + return error; + if (!inuse) + continue; + ri->iused++; + ri->rie.ir_free &= ~XFS_INOBT_MASK(cluster_base + + cluster_index); + } + return 0; +} + +/* + * For each inode cluster covering the physical extent recorded by the rmapbt, + * we must calculate the properly aligned startino of that cluster, then + * iterate each cluster to fill in used and filled masks appropriately. We + * then use the (startino, used, filled) information to construct the + * appropriate inode records. + */ +STATIC int +xrep_ibt_process_cluster( + struct xrep_ibt *ri, + xfs_agblock_t cluster_bno) +{ + struct xfs_imap imap; + struct xfs_buf *cluster_bp; + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agino_t cluster_ag_base; + xfs_agino_t irec_index; + unsigned int nr_inodes; + int error; + + nr_inodes = min_t(unsigned int, igeo->inodes_per_cluster, + XFS_INODES_PER_CHUNK); + + /* + * Grab the inode cluster buffer. This is safe to do with a broken + * inobt because imap_to_bp directly maps the buffer without touching + * either inode btree. + */ + imap.im_blkno = XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno, cluster_bno); + imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster); + imap.im_boffset = 0; + error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp); + if (error) + return error; + + /* + * Record the contents of each possible inobt record mapping this + * cluster. + */ + cluster_ag_base = XFS_AGB_TO_AGINO(mp, cluster_bno); + for (irec_index = 0; + irec_index < igeo->inodes_per_cluster; + irec_index += XFS_INODES_PER_CHUNK) { + error = xrep_ibt_cluster_record(ri, + cluster_ag_base + irec_index, cluster_bp, + nr_inodes); + if (error) + break; + + } + + xfs_trans_brelse(sc->tp, cluster_bp); + return error; +} + +/* Check for any obvious conflicts in the inode chunk extent. */ +STATIC int +xrep_ibt_check_inode_ext( + struct xfs_scrub *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agino_t agino; + enum xbtree_recpacking outcome; + int error; + + /* Inode records must be within the AG. */ + if (!xfs_verify_agbext(sc->sa.pag, agbno, len)) + return -EFSCORRUPTED; + + /* The entire record must align to the inode cluster size. */ + if (!IS_ALIGNED(agbno, igeo->blocks_per_cluster) || + !IS_ALIGNED(agbno + len, igeo->blocks_per_cluster)) + return -EFSCORRUPTED; + + /* + * The entire record must also adhere to the inode cluster alignment + * size if sparse inodes are not enabled. + */ + if (!xfs_has_sparseinodes(mp) && + (!IS_ALIGNED(agbno, igeo->cluster_align) || + !IS_ALIGNED(agbno + len, igeo->cluster_align))) + return -EFSCORRUPTED; + + /* + * On a sparse inode fs, this cluster could be part of a sparse chunk. + * Sparse clusters must be aligned to sparse chunk alignment. + */ + if (xfs_has_sparseinodes(mp) && + (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) || + !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align))) + return -EFSCORRUPTED; + + /* Make sure the entire range of blocks are valid AG inodes. 
*/ + agino = XFS_AGB_TO_AGINO(mp, agbno); + if (!xfs_verify_agino(sc->sa.pag, agino)) + return -EFSCORRUPTED; + + agino = XFS_AGB_TO_AGINO(mp, agbno + len) - 1; + if (!xfs_verify_agino(sc->sa.pag, agino)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Found a fragment of the old inode btrees; dispose of them later. */ +STATIC int +xrep_ibt_record_old_btree_blocks( + struct xrep_ibt *ri, + const struct xfs_rmap_irec *rec) +{ + if (!xfs_verify_agbext(ri->sc->sa.pag, rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + return xagb_bitmap_set(&ri->old_iallocbt_blocks, rec->rm_startblock, + rec->rm_blockcount); +} + +/* Record extents that belong to inode cluster blocks. */ +STATIC int +xrep_ibt_record_inode_blocks( + struct xrep_ibt *ri, + const struct xfs_rmap_irec *rec) +{ + struct xfs_mount *mp = ri->sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agblock_t cluster_base; + int error; + + error = xrep_ibt_check_inode_ext(ri->sc, rec->rm_startblock, + rec->rm_blockcount); + if (error) + return error; + + trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno, + rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, + rec->rm_offset, rec->rm_flags); + + /* + * Record the free/hole masks for each inode cluster that could be + * mapped by this rmap record. + */ + for (cluster_base = 0; + cluster_base < rec->rm_blockcount; + cluster_base += igeo->blocks_per_cluster) { + error = xrep_ibt_process_cluster(ri, + rec->rm_startblock + cluster_base); + if (error) + return error; + } + + return 0; +} + +STATIC int +xrep_ibt_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_ibt *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + switch (rec->rm_owner) { + case XFS_RMAP_OWN_INOBT: + return xrep_ibt_record_old_btree_blocks(ri, rec); + case XFS_RMAP_OWN_INODES: + return xrep_ibt_record_inode_blocks(ri, rec); + } + return 0; +} + +/* + * Iterate all reverse mappings to find the inodes (OWN_INODES) and the inode + * btrees (OWN_INOBT). Figure out if we have enough free space to reconstruct + * the inode btrees. The caller must clean up the lists if anything goes + * wrong. + */ +STATIC int +xrep_ibt_find_inodes( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + ri->rie.ir_startino = NULLAGINO; + + /* Collect all reverse mappings for inode blocks. */ + xrep_ag_btcur_init(sc, &sc->sa); + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_ibt_walk_rmap, ri); + xchk_ag_btcur_free(&sc->sa); + if (error) + return error; + + /* If we have a record ready to go, add it to the array. */ + if (ri->rie.ir_startino != NULLAGINO) + return xrep_ibt_stash(ri); + + return 0; +} + +/* Update the AGI counters. */ +STATIC int +xrep_ibt_reset_counters( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + unsigned int freecount = ri->icount - ri->iused; + + /* Trigger inode count recalculation */ + xfs_force_summary_recalc(sc->mp); + + /* + * The AGI header contains extra information related to the inode + * btrees, so we must update those fields here. 
+ */ + agi->agi_count = cpu_to_be32(ri->icount); + agi->agi_freecount = cpu_to_be32(freecount); + xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, + XFS_AGI_COUNT | XFS_AGI_FREECOUNT); + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagi(sc); +} + +/* Retrieve finobt data for bulk load. */ +STATIC int +xrep_fibt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i; + struct xrep_ibt *ri = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + do { + error = xfarray_load(ri->inode_records, + ri->array_cur++, irec); + } while (error == 0 && xfs_inobt_rec_freecount(irec) == 0); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Retrieve inobt data for bulk load. */ +STATIC int +xrep_ibt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i; + struct xrep_ibt *ri = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load(ri->inode_records, ri->array_cur++, irec); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new inobt blocks to the bulk loader. */ +STATIC int +xrep_ibt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_ibt *ri = priv; + + return xrep_newbt_claim_block(cur, &ri->new_inobt, ptr); +} + +/* Feed one of the new finobt blocks to the bulk loader. */ +STATIC int +xrep_fibt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_ibt *ri = priv; + + return xrep_newbt_claim_block(cur, &ri->new_finobt, ptr); +} + +/* Make sure the records do not overlap in inumber address space. */ +STATIC int +xrep_ibt_check_overlap( + struct xrep_ibt *ri) +{ + struct xfs_inobt_rec_incore irec; + xfarray_idx_t cur; + xfs_agino_t next_agino = 0; + int error = 0; + + foreach_xfarray_idx(ri->inode_records, cur) { + if (xchk_should_terminate(ri->sc, &error)) + return error; + + error = xfarray_load(ri->inode_records, cur, &irec); + if (error) + return error; + + if (irec.ir_startino < next_agino) + return -EFSCORRUPTED; + + next_agino = irec.ir_startino + XFS_INODES_PER_CHUNK; + } + + return error; +} + +/* Build new inode btrees and dispose of the old one. */ +STATIC int +xrep_ibt_build_new_trees( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_btree_cur *ino_cur; + struct xfs_btree_cur *fino_cur = NULL; + xfs_fsblock_t fsbno; + bool need_finobt; + int error; + + need_finobt = xfs_has_finobt(sc->mp); + + /* + * Create new btrees for staging all the inobt records we collected + * earlier. The records were collected in order of increasing agino, + * so we do not have to sort them. Ensure there are no overlapping + * records. + */ + error = xrep_ibt_check_overlap(ri); + if (error) + return error; + + /* + * The new inode btrees will not be rooted in the AGI until we've + * successfully rebuilt the tree. + * + * Start by setting up the inobt staging cursor. 
+ */ + fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_IBT_BLOCK(sc->mp)), + xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno, + XFS_AG_RESV_NONE); + ri->new_inobt.bload.claim_block = xrep_ibt_claim_block; + ri->new_inobt.bload.get_records = xrep_ibt_get_records; + + ino_cur = xfs_inobt_stage_cursor(sc->sa.pag, &ri->new_inobt.afake, + XFS_BTNUM_INO); + error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload, + xfarray_length(ri->inode_records)); + if (error) + goto err_inocur; + + /* Set up finobt staging cursor. */ + if (need_finobt) { + enum xfs_ag_resv_type resv = XFS_AG_RESV_METADATA; + + if (sc->mp->m_finobt_nores) + resv = XFS_AG_RESV_NONE; + + fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_FIBT_BLOCK(sc->mp)), + xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT, + fsbno, resv); + ri->new_finobt.bload.claim_block = xrep_fibt_claim_block; + ri->new_finobt.bload.get_records = xrep_fibt_get_records; + + fino_cur = xfs_inobt_stage_cursor(sc->sa.pag, + &ri->new_finobt.afake, XFS_BTNUM_FINO); + error = xfs_btree_bload_compute_geometry(fino_cur, + &ri->new_finobt.bload, ri->finobt_recs); + if (error) + goto err_finocur; + } + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_finocur; + + /* Reserve all the space we need to build the new btrees. */ + error = xrep_newbt_alloc_blocks(&ri->new_inobt, + ri->new_inobt.bload.nr_blocks); + if (error) + goto err_finocur; + + if (need_finobt) { + error = xrep_newbt_alloc_blocks(&ri->new_finobt, + ri->new_finobt.bload.nr_blocks); + if (error) + goto err_finocur; + } + + /* Add all inobt records. */ + ri->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(ino_cur, &ri->new_inobt.bload, ri); + if (error) + goto err_finocur; + + /* Add all finobt records. */ + if (need_finobt) { + ri->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(fino_cur, &ri->new_finobt.bload, ri); + if (error) + goto err_finocur; + } + + /* + * Install the new btrees in the AG header. After this point the old + * btrees are no longer accessible and the new trees are live. + */ + xfs_inobt_commit_staged_btree(ino_cur, sc->tp, sc->sa.agi_bp); + xfs_btree_del_cursor(ino_cur, 0); + + if (fino_cur) { + xfs_inobt_commit_staged_btree(fino_cur, sc->tp, sc->sa.agi_bp); + xfs_btree_del_cursor(fino_cur, 0); + } + + /* Reset the AGI counters now that we've changed the inode roots. */ + error = xrep_ibt_reset_counters(ri); + if (error) + goto err_finobt; + + /* Free unused blocks and bitmap. */ + if (need_finobt) { + error = xrep_newbt_commit(&ri->new_finobt); + if (error) + goto err_inobt; + } + error = xrep_newbt_commit(&ri->new_inobt); + if (error) + return error; + + return xrep_roll_ag_trans(sc); + +err_finocur: + if (need_finobt) + xfs_btree_del_cursor(fino_cur, error); +err_inocur: + xfs_btree_del_cursor(ino_cur, error); +err_finobt: + if (need_finobt) + xrep_newbt_cancel(&ri->new_finobt); +err_inobt: + xrep_newbt_cancel(&ri->new_inobt); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_ibt_remove_old_trees( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + /* + * Free the old inode btree blocks if they're not in use. It's ok to + * reap with XFS_AG_RESV_NONE even if the finobt had a per-AG + * reservation because we reset the reservation before releasing the + * AGI and AGF header buffer locks. 
+ */ + error = xrep_reap_agblocks(sc, &ri->old_iallocbt_blocks, + &XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE); + if (error) + return error; + + /* + * If the finobt is enabled and has a per-AG reservation, make sure we + * reinitialize the per-AG reservations. + */ + if (xfs_has_finobt(sc->mp) && !sc->mp->m_finobt_nores) + sc->flags |= XREP_RESET_PERAG_RESV; + + return 0; +} + +/* Repair both inode btrees. */ +int +xrep_iallocbt( + struct xfs_scrub *sc) +{ + struct xrep_ibt *ri; + struct xfs_mount *mp = sc->mp; + char *descr; + xfs_agino_t first_agino, last_agino; + int error = 0; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + ri = kzalloc(sizeof(struct xrep_ibt), XCHK_GFP_FLAGS); + if (!ri) + return -ENOMEM; + ri->sc = sc; + + /* We rebuild both inode btrees. */ + sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT; + + /* Set up enough storage to handle an AG with nothing but inodes. */ + xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino); + last_agino /= XFS_INODES_PER_CHUNK; + descr = xchk_xfile_ag_descr(sc, "inode index records"); + error = xfarray_create(descr, last_agino, + sizeof(struct xfs_inobt_rec_incore), + &ri->inode_records); + kfree(descr); + if (error) + goto out_ri; + + /* Collect the inode data and find the old btree blocks. */ + xagb_bitmap_init(&ri->old_iallocbt_blocks); + error = xrep_ibt_find_inodes(ri); + if (error) + goto out_bitmap; + + /* Rebuild the inode indexes. */ + error = xrep_ibt_build_new_trees(ri); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_ibt_remove_old_trees(ri); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&ri->old_iallocbt_blocks); + xfarray_destroy(ri->inode_records); +out_ri: + kfree(ri); + return error; +} + +/* Make sure both btrees are ok after we've rebuilt them. */ +int +xrep_revalidate_iallocbt( + struct xfs_scrub *sc) +{ + __u32 old_type = sc->sm->sm_type; + int error; + + /* + * We must update sm_type temporarily so that the tree-to-tree cross + * reference checks will work in the correct direction, and also so + * that tracing will report correctly if there are more errors. + */ + sc->sm->sm_type = XFS_SCRUB_TYPE_INOBT; + error = xchk_iallocbt(sc); + if (error) + goto out; + + if (xfs_has_finobt(sc->mp)) { + sc->sm->sm_type = XFS_SCRUB_TYPE_FINOBT; + error = xchk_iallocbt(sc); + } + +out: + sc->sm->sm_type = old_type; + return error; +} diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 889f556bc98f..6e2fe2d6250b 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -25,6 +25,7 @@ #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/trace.h" +#include "scrub/repair.h" /* Prepare the attached inode for scrubbing. */ static inline int @@ -39,6 +40,10 @@ xchk_prepare_iscrub( if (error) return error; + error = xchk_ino_dqattach(sc); + if (error) + return error; + xchk_ilock(sc, XFS_ILOCK_EXCL); return 0; } @@ -95,8 +100,8 @@ xchk_setup_inode( if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) return -ENOENT; - /* Try a regular untrusted iget. */ - error = xchk_iget(sc, sc->sm->sm_ino, &ip); + /* Try a safe untrusted iget. */ + error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip); if (!error) return xchk_install_handle_iscrub(sc, ip); if (error == -ENOENT) @@ -181,8 +186,11 @@ xchk_setup_inode( * saying the inode is allocated and the icache being unable to load * the inode until we can flag the corruption in xchk_inode. 
The * scrub function has to note the corruption, since we're not really - * supposed to do that from the setup function. + * supposed to do that from the setup function. Save the mapping to + * make repairs to the ondisk inode buffer. */ + if (xchk_could_repair(sc)) + xrep_setup_inode(sc, &imap); return 0; out_cancel: @@ -338,6 +346,10 @@ xchk_inode_flags2( if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp)) goto bad; + /* no large extent counts without the filesystem feature */ + if ((flags2 & XFS_DIFLAG2_NREXT64) && !xfs_has_large_extent_counts(mp)) + goto bad; + return; bad: xchk_ino_set_corrupt(sc, ino); @@ -548,7 +560,7 @@ xchk_dinode( } /* di_forkoff */ - if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) + if (XFS_DFORK_BOFF(dip) >= mp->m_sb.sb_inodesize) xchk_ino_set_corrupt(sc, ino); if (naextents != 0 && dip->di_forkoff == 0) xchk_ino_set_corrupt(sc, ino); diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c new file mode 100644 index 000000000000..0ca62d59f84a --- /dev/null +++ b/fs/xfs/scrub/inode_repair.c @@ -0,0 +1,1525 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_inode_buf.h" +#include "xfs_inode_fork.h" +#include "xfs_ialloc.h" +#include "xfs_da_format.h" +#include "xfs_reflink.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_bmap_util.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_quota_defs.h" +#include "xfs_quota.h" +#include "xfs_ag.h" +#include "xfs_rtbitmap.h" +#include "xfs_attr_leaf.h" +#include "xfs_log_priv.h" +#include "xfs_health.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" + +/* + * Inode Record Repair + * =================== + * + * Roughly speaking, inode problems can be classified based on whether or not + * they trip the dinode verifiers. If those trip, then we won't be able to + * xfs_iget ourselves the inode. + * + * Therefore, the xrep_dinode_* functions fix anything that will cause the + * inode buffer verifier or the dinode verifier. The xrep_inode_* functions + * fix things on live incore inodes. The inode repair functions make decisions + * with security and usability implications when reviving a file: + * + * - Files with zero di_mode or a garbage di_mode are converted to regular file + * that only root can read. This file may not actually contain user data, + * if the file was not previously a regular file. Setuid and setgid bits + * are cleared. + * + * - Zero-size directories can be truncated to look empty. It is necessary to + * run the bmapbtd and directory repair functions to fully rebuild the + * directory. + * + * - Zero-size symbolic link targets can be truncated to '?'. It is necessary + * to run the bmapbtd and symlink repair functions to salvage the symlink. + * + * - Invalid extent size hints will be removed. 
+ * + * - Quotacheck will be scheduled if we repaired an inode that was so badly + * damaged that the ondisk inode had to be rebuilt. + * + * - Invalid user, group, or project IDs (aka -1U) will be reset to zero. + * Setuid and setgid bits are cleared. + * + * - Data and attr forks are reset to extents format with zero extents if the + * fork data is inconsistent. It is necessary to run the bmapbtd or bmapbta + * repair functions to recover the space mapping. + * + * - ACLs will not be recovered if the attr fork is zapped or the extended + * attribute structure itself requires salvaging. + * + * - If the attr fork is zapped, the user and group ids are reset to root and + * the setuid and setgid bits are removed. + */ + +/* + * All the information we need to repair the ondisk inode if we can't iget the + * incore inode. We don't allocate this buffer unless we're going to perform + * a repair to the ondisk inode cluster buffer. + */ +struct xrep_inode { + /* Inode mapping that we saved from the initial lookup attempt. */ + struct xfs_imap imap; + + struct xfs_scrub *sc; + + /* Blocks in use on the data device by data extents or bmbt blocks. */ + xfs_rfsblock_t data_blocks; + + /* Blocks in use on the rt device. */ + xfs_rfsblock_t rt_blocks; + + /* Blocks in use by the attr fork. */ + xfs_rfsblock_t attr_blocks; + + /* Number of data device extents for the data fork. */ + xfs_extnum_t data_extents; + + /* + * Number of realtime device extents for the data fork. If + * data_extents and rt_extents indicate that the data fork has extents + * on both devices, we'll just back away slowly. + */ + xfs_extnum_t rt_extents; + + /* Number of (data device) extents for the attr fork. */ + xfs_aextnum_t attr_extents; + + /* Sick state to set after zapping parts of the inode. */ + unsigned int ino_sick_mask; + + /* Must we remove all access from this file? */ + bool zap_acls; +}; + +/* + * Setup function for inode repair. @imap contains the ondisk inode mapping + * information so that we can correct the ondisk inode cluster buffer if + * necessary to make iget work. + */ +int +xrep_setup_inode( + struct xfs_scrub *sc, + const struct xfs_imap *imap) +{ + struct xrep_inode *ri; + + sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + + ri = sc->buf; + memcpy(&ri->imap, imap, sizeof(struct xfs_imap)); + ri->sc = sc; + return 0; +} + +/* + * Make sure this ondisk inode can pass the inode buffer verifier. This is + * not the same as the dinode verifier. 
+ */ +STATIC void +xrep_dinode_buf_core( + struct xfs_scrub *sc, + struct xfs_buf *bp, + unsigned int ioffset) +{ + struct xfs_dinode *dip = xfs_buf_offset(bp, ioffset); + struct xfs_trans *tp = sc->tp; + struct xfs_mount *mp = sc->mp; + xfs_agino_t agino; + bool crc_ok = false; + bool magic_ok = false; + bool unlinked_ok = false; + + agino = be32_to_cpu(dip->di_next_unlinked); + + if (xfs_verify_agino_or_null(bp->b_pag, agino)) + unlinked_ok = true; + + if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + xfs_dinode_good_version(mp, dip->di_version)) + magic_ok = true; + + if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, + XFS_DINODE_CRC_OFF)) + crc_ok = true; + + if (magic_ok && unlinked_ok && crc_ok) + return; + + if (!magic_ok) { + dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + dip->di_version = 3; + } + if (!unlinked_ok) + dip->di_next_unlinked = cpu_to_be32(NULLAGINO); + xfs_dinode_calc_crc(mp, dip); + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); + xfs_trans_log_buf(tp, bp, ioffset, + ioffset + sizeof(struct xfs_dinode) - 1); +} + +/* Make sure this inode cluster buffer can pass the inode buffer verifier. */ +STATIC void +xrep_dinode_buf( + struct xfs_scrub *sc, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = sc->mp; + int i; + int ni; + + ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; + for (i = 0; i < ni; i++) + xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog); +} + +/* Reinitialize things that never change in an inode. */ +STATIC void +xrep_dinode_header( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + trace_xrep_dinode_header(sc, dip); + + dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + if (!xfs_dinode_good_version(sc->mp, dip->di_version)) + dip->di_version = 3; + dip->di_ino = cpu_to_be64(sc->sm->sm_ino); + uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid); + dip->di_gen = cpu_to_be32(sc->sm->sm_gen); +} + +/* Turn di_mode into /something/ recognizable. */ +STATIC void +xrep_dinode_mode( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + uint16_t mode = be16_to_cpu(dip->di_mode); + + trace_xrep_dinode_mode(sc, dip); + + if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN) + return; + + /* bad mode, so we set it to a file that only root can read */ + mode = S_IFREG; + dip->di_mode = cpu_to_be16(mode); + dip->di_uid = 0; + dip->di_gid = 0; + ri->zap_acls = true; +} + +/* Fix any conflicting flags that the verifiers complain about. */ +STATIC void +xrep_dinode_flags( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + bool isrt) +{ + struct xfs_mount *mp = sc->mp; + uint64_t flags2 = be64_to_cpu(dip->di_flags2); + uint16_t flags = be16_to_cpu(dip->di_flags); + uint16_t mode = be16_to_cpu(dip->di_mode); + + trace_xrep_dinode_flags(sc, dip); + + if (isrt) + flags |= XFS_DIFLAG_REALTIME; + else + flags &= ~XFS_DIFLAG_REALTIME; + + /* + * For regular files on a reflink filesystem, set the REFLINK flag to + * protect shared extents. A later stage will actually check those + * extents and clear the flag if possible. 
+ */ + if (xfs_has_reflink(mp) && S_ISREG(mode)) + flags2 |= XFS_DIFLAG2_REFLINK; + else + flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE); + if (flags & XFS_DIFLAG_REALTIME) + flags2 &= ~XFS_DIFLAG2_REFLINK; + if (!xfs_has_bigtime(mp)) + flags2 &= ~XFS_DIFLAG2_BIGTIME; + if (!xfs_has_large_extent_counts(mp)) + flags2 &= ~XFS_DIFLAG2_NREXT64; + if (flags2 & XFS_DIFLAG2_NREXT64) + dip->di_nrext64_pad = 0; + else if (dip->di_version >= 3) + dip->di_v3_pad = 0; + dip->di_flags = cpu_to_be16(flags); + dip->di_flags2 = cpu_to_be64(flags2); +} + +/* + * Blow out symlink; now it points nowhere. We don't have to worry about + * incore state because this inode is failing the verifiers. + */ +STATIC void +xrep_dinode_zap_symlink( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + char *p; + + trace_xrep_dinode_zap_symlink(sc, dip); + + dip->di_format = XFS_DINODE_FMT_LOCAL; + dip->di_size = cpu_to_be64(1); + p = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + *p = '?'; + ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; +} + +/* + * Blow out dir, make the parent point to the root. In the future repair will + * reconstruct this directory for us. Note that there's no in-core directory + * inode because the sf verifier tripped, so we don't have to worry about the + * dentry cache. + */ +STATIC void +xrep_dinode_zap_dir( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_dir2_sf_hdr *sfp; + int i8count; + + trace_xrep_dinode_zap_dir(sc, dip); + + dip->di_format = XFS_DINODE_FMT_LOCAL; + i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM; + sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + sfp->count = 0; + sfp->i8count = i8count; + xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino); + dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count)); + ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED; +} + +/* Make sure we don't have a garbage file size. */ +STATIC void +xrep_dinode_size( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + uint64_t size = be64_to_cpu(dip->di_size); + uint16_t mode = be16_to_cpu(dip->di_mode); + + trace_xrep_dinode_size(sc, dip); + + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + /* di_size can't be nonzero for special files */ + dip->di_size = 0; + break; + case S_IFREG: + /* Regular files can't be larger than 2^63-1 bytes. */ + dip->di_size = cpu_to_be64(size & ~(1ULL << 63)); + break; + case S_IFLNK: + /* + * Truncate ridiculously oversized symlinks. If the size is + * zero, reset it to point to the current directory. Both of + * these conditions trigger dinode verifier errors, so there + * is no in-core state to reset. + */ + if (size > XFS_SYMLINK_MAXLEN) + dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN); + else if (size == 0) + xrep_dinode_zap_symlink(ri, dip); + break; + case S_IFDIR: + /* + * Directories can't have a size larger than 32G. If the size + * is zero, reset it to an empty directory. Both of these + * conditions trigger dinode verifier errors, so there is no + * in-core state to reset. + */ + if (size > XFS_DIR2_SPACE_SIZE) + dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE); + else if (size == 0) + xrep_dinode_zap_dir(ri, dip); + break; + } +} + +/* Fix extent size hints. 
*/ +STATIC void +xrep_dinode_extsize_hints( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + struct xfs_mount *mp = sc->mp; + uint64_t flags2 = be64_to_cpu(dip->di_flags2); + uint16_t flags = be16_to_cpu(dip->di_flags); + uint16_t mode = be16_to_cpu(dip->di_mode); + + xfs_failaddr_t fa; + + trace_xrep_dinode_extsize_hints(sc, dip); + + fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize), + mode, flags); + if (fa) { + dip->di_extsize = 0; + dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + } + + if (dip->di_version < 3) + return; + + fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), + mode, flags, flags2); + if (fa) { + dip->di_cowextsize = 0; + dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE); + } +} + +/* Count extents and blocks for an inode given an rmap. */ +STATIC int +xrep_dinode_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_inode *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + /* We only care about this inode. */ + if (rec->rm_owner != ri->sc->sm->sm_ino) + return 0; + + if (rec->rm_flags & XFS_RMAP_ATTR_FORK) { + ri->attr_blocks += rec->rm_blockcount; + if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) + ri->attr_extents++; + + return 0; + } + + ri->data_blocks += rec->rm_blockcount; + if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) + ri->data_extents++; + + return 0; +} + +/* Count extents and blocks for an inode from all AG rmap data. */ +STATIC int +xrep_dinode_count_ag_rmaps( + struct xrep_inode *ri, + struct xfs_perag *pag) +{ + struct xfs_btree_cur *cur; + struct xfs_buf *agf; + int error; + + error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf); + if (error) + return error; + + cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag); + error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri); + xfs_btree_del_cursor(cur, error); + xfs_trans_brelse(ri->sc->tp, agf); + return error; +} + +/* Count extents and blocks for a given inode from all rmap data. */ +STATIC int +xrep_dinode_count_rmaps( + struct xrep_inode *ri) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + int error; + + if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp)) + return -EOPNOTSUPP; + + for_each_perag(ri->sc->mp, agno, pag) { + error = xrep_dinode_count_ag_rmaps(ri, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + /* Can't have extents on both the rt and the data device. */ + if (ri->data_extents && ri->rt_extents) + return -EFSCORRUPTED; + + trace_xrep_dinode_count_rmaps(ri->sc, + ri->data_blocks, ri->rt_blocks, ri->attr_blocks, + ri->data_extents, ri->rt_extents, ri->attr_extents); + return 0; +} + +/* Return true if this extents-format ifork looks like garbage. 
*/ +STATIC bool +xrep_dinode_bad_extents_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size, + int whichfork) +{ + struct xfs_bmbt_irec new; + struct xfs_bmbt_rec *dp; + xfs_extnum_t nex; + bool isrt; + unsigned int i; + + nex = xfs_dfork_nextents(dip, whichfork); + if (nex > dfork_size / sizeof(struct xfs_bmbt_rec)) + return true; + + dp = XFS_DFORK_PTR(dip, whichfork); + + isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME); + for (i = 0; i < nex; i++, dp++) { + xfs_failaddr_t fa; + + xfs_bmbt_disk_get_all(dp, &new); + fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork, + &new); + if (fa) + return true; + } + + return false; +} + +/* Return true if this btree-format ifork looks like garbage. */ +STATIC bool +xrep_dinode_bad_bmbt_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size, + int whichfork) +{ + struct xfs_bmdr_block *dfp; + xfs_extnum_t nex; + unsigned int i; + unsigned int dmxr; + unsigned int nrecs; + unsigned int level; + + nex = xfs_dfork_nextents(dip, whichfork); + if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec)) + return true; + + if (dfork_size < sizeof(struct xfs_bmdr_block)) + return true; + + dfp = XFS_DFORK_PTR(dip, whichfork); + nrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size) + return true; + if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork)) + return true; + + dmxr = xfs_bmdr_maxrecs(dfork_size, 0); + for (i = 1; i <= nrecs; i++) { + struct xfs_bmbt_key *fkp; + xfs_bmbt_ptr_t *fpp; + xfs_fileoff_t fileoff; + xfs_fsblock_t fsbno; + + fkp = XFS_BMDR_KEY_ADDR(dfp, i); + fileoff = be64_to_cpu(fkp->br_startoff); + if (!xfs_verify_fileoff(sc->mp, fileoff)) + return true; + + fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr); + fsbno = be64_to_cpu(*fpp); + if (!xfs_verify_fsbno(sc->mp, fsbno)) + return true; + } + + return false; +} + +/* + * Check the data fork for things that will fail the ifork verifiers or the + * ifork formatters. + */ +STATIC bool +xrep_dinode_check_dfork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + uint16_t mode) +{ + void *dfork_ptr; + int64_t data_size; + unsigned int fmt; + unsigned int dfork_size; + + /* + * Verifier functions take signed int64_t, so check for bogus negative + * values first. + */ + data_size = be64_to_cpu(dip->di_size); + if (data_size < 0) + return true; + + fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK); + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + if (fmt != XFS_DINODE_FMT_DEV) + return true; + break; + case S_IFREG: + if (fmt == XFS_DINODE_FMT_LOCAL) + return true; + fallthrough; + case S_IFLNK: + case S_IFDIR: + switch (fmt) { + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + break; + default: + return true; + } + break; + default: + return true; + } + + dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK); + dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + + switch (fmt) { + case XFS_DINODE_FMT_DEV: + break; + case XFS_DINODE_FMT_LOCAL: + /* dir/symlink structure cannot be larger than the fork */ + if (data_size > dfork_size) + return true; + /* directory structure must pass verification. */ + if (S_ISDIR(mode) && + xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL) + return true; + /* symlink structure must pass verification. 
*/ + if (S_ISLNK(mode) && + xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL) + return true; + break; + case XFS_DINODE_FMT_EXTENTS: + if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size, + XFS_DATA_FORK)) + return true; + break; + case XFS_DINODE_FMT_BTREE: + if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size, + XFS_DATA_FORK)) + return true; + break; + default: + return true; + } + + return false; +} + +static void +xrep_dinode_set_data_nextents( + struct xfs_dinode *dip, + xfs_extnum_t nextents) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + dip->di_big_nextents = cpu_to_be64(nextents); + else + dip->di_nextents = cpu_to_be32(nextents); +} + +static void +xrep_dinode_set_attr_nextents( + struct xfs_dinode *dip, + xfs_extnum_t nextents) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + dip->di_big_anextents = cpu_to_be32(nextents); + else + dip->di_anextents = cpu_to_be16(nextents); +} + +/* Reset the data fork to something sane. */ +STATIC void +xrep_dinode_zap_dfork( + struct xrep_inode *ri, + struct xfs_dinode *dip, + uint16_t mode) +{ + struct xfs_scrub *sc = ri->sc; + + trace_xrep_dinode_zap_dfork(sc, dip); + + ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED; + + xrep_dinode_set_data_nextents(dip, 0); + ri->data_blocks = 0; + ri->rt_blocks = 0; + + /* Special files always get reset to DEV */ + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + dip->di_format = XFS_DINODE_FMT_DEV; + dip->di_size = 0; + return; + } + + /* + * If we have data extents, reset to an empty map and hope the user + * will run the bmapbtd checker next. + */ + if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) { + dip->di_format = XFS_DINODE_FMT_EXTENTS; + return; + } + + /* Otherwise, reset the local format to the minimum. */ + switch (mode & S_IFMT) { + case S_IFLNK: + xrep_dinode_zap_symlink(ri, dip); + break; + case S_IFDIR: + xrep_dinode_zap_dir(ri, dip); + break; + } +} + +/* + * Check the attr fork for things that will fail the ifork verifiers or the + * ifork formatters. + */ +STATIC bool +xrep_dinode_check_afork( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + struct xfs_attr_sf_hdr *afork_ptr; + size_t attr_size; + unsigned int afork_size; + + if (XFS_DFORK_BOFF(dip) == 0) + return dip->di_aformat != XFS_DINODE_FMT_EXTENTS || + xfs_dfork_attr_extents(dip) != 0; + + afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); + afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); + + switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) { + case XFS_DINODE_FMT_LOCAL: + /* Fork has to be large enough to extract the xattr size. */ + if (afork_size < sizeof(struct xfs_attr_sf_hdr)) + return true; + + /* xattr structure cannot be larger than the fork */ + attr_size = be16_to_cpu(afork_ptr->totsize); + if (attr_size > afork_size) + return true; + + /* xattr structure must pass verification. */ + return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL; + case XFS_DINODE_FMT_EXTENTS: + if (xrep_dinode_bad_extents_fork(sc, dip, afork_size, + XFS_ATTR_FORK)) + return true; + break; + case XFS_DINODE_FMT_BTREE: + if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size, + XFS_ATTR_FORK)) + return true; + break; + default: + return true; + } + + return false; +} + +/* + * Reset the attr fork to empty. Since the attr fork could have contained + * ACLs, make the file readable only by root. 
+ */ +STATIC void +xrep_dinode_zap_afork( + struct xrep_inode *ri, + struct xfs_dinode *dip, + uint16_t mode) +{ + struct xfs_scrub *sc = ri->sc; + + trace_xrep_dinode_zap_afork(sc, dip); + + ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED; + + dip->di_aformat = XFS_DINODE_FMT_EXTENTS; + xrep_dinode_set_attr_nextents(dip, 0); + ri->attr_blocks = 0; + + /* + * If the data fork is in btree format, removing the attr fork entirely + * might cause verifier failures if the next level down in the bmbt + * could now fit in the data fork area. + */ + if (dip->di_format != XFS_DINODE_FMT_BTREE) + dip->di_forkoff = 0; + dip->di_mode = cpu_to_be16(mode & ~0777); + dip->di_uid = 0; + dip->di_gid = 0; +} + +/* Make sure the fork offset is a sensible value. */ +STATIC void +xrep_dinode_ensure_forkoff( + struct xrep_inode *ri, + struct xfs_dinode *dip, + uint16_t mode) +{ + struct xfs_bmdr_block *bmdr; + struct xfs_scrub *sc = ri->sc; + xfs_extnum_t attr_extents, data_extents; + size_t bmdr_minsz = XFS_BMDR_SPACE_CALC(1); + unsigned int lit_sz = XFS_LITINO(sc->mp); + unsigned int afork_min, dfork_min; + + trace_xrep_dinode_ensure_forkoff(sc, dip); + + /* + * Before calling this function, xrep_dinode_core ensured that both + * forks actually fit inside their respective literal areas. If this + * was not the case, the fork was reset to FMT_EXTENTS with zero + * records. If the rmapbt scan found attr or data fork blocks, this + * will be noted in the dinode_stats, and we must leave enough room + * for the bmap repair code to reconstruct the mapping structure. + * + * First, compute the minimum space required for the attr fork. + */ + switch (dip->di_aformat) { + case XFS_DINODE_FMT_LOCAL: + /* + * If we still have a shortform xattr structure at all, that + * means the attr fork area was exactly large enough to fit + * the sf structure. + */ + afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); + break; + case XFS_DINODE_FMT_EXTENTS: + attr_extents = xfs_dfork_attr_extents(dip); + if (attr_extents) { + /* + * We must maintain sufficient space to hold the entire + * extent map array in the data fork. Note that we + * previously zapped the fork if it had no chance of + * fitting in the inode. + */ + afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents; + } else if (ri->attr_extents > 0) { + /* + * The attr fork thinks it has zero extents, but we + * found some xattr extents. We need to leave enough + * empty space here so that the incore attr fork will + * get created (and hence trigger the attr fork bmap + * repairer). + */ + afork_min = bmdr_minsz; + } else { + /* No extents on disk or found in rmapbt. */ + afork_min = 0; + } + break; + case XFS_DINODE_FMT_BTREE: + /* Must have space for btree header and key/pointers. */ + bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); + afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); + break; + default: + /* We should never see any other formats. */ + afork_min = 0; + break; + } + + /* Compute the minimum space required for the data fork. */ + switch (dip->di_format) { + case XFS_DINODE_FMT_DEV: + dfork_min = sizeof(__be32); + break; + case XFS_DINODE_FMT_UUID: + dfork_min = sizeof(uuid_t); + break; + case XFS_DINODE_FMT_LOCAL: + /* + * If we still have a shortform data fork at all, that means + * the data fork area was large enough to fit whatever was in + * there. 
+ */ + dfork_min = be64_to_cpu(dip->di_size); + break; + case XFS_DINODE_FMT_EXTENTS: + data_extents = xfs_dfork_data_extents(dip); + if (data_extents) { + /* + * We must maintain sufficient space to hold the entire + * extent map array in the data fork. Note that we + * previously zapped the fork if it had no chance of + * fitting in the inode. + */ + dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents; + } else if (ri->data_extents > 0 || ri->rt_extents > 0) { + /* + * The data fork thinks it has zero extents, but we + * found some data extents. We need to leave enough + * empty space here so that the data fork bmap repair + * will recover the mappings. + */ + dfork_min = bmdr_minsz; + } else { + /* No extents on disk or found in rmapbt. */ + dfork_min = 0; + } + break; + case XFS_DINODE_FMT_BTREE: + /* Must have space for btree header and key/pointers. */ + bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); + break; + default: + dfork_min = 0; + break; + } + + /* + * Round all values up to the nearest 8 bytes, because that is the + * precision of di_forkoff. + */ + afork_min = roundup(afork_min, 8); + dfork_min = roundup(dfork_min, 8); + bmdr_minsz = roundup(bmdr_minsz, 8); + + ASSERT(dfork_min <= lit_sz); + ASSERT(afork_min <= lit_sz); + + /* + * If the data fork was zapped and we don't have enough space for the + * recovery fork, move the attr fork up. + */ + if (dip->di_format == XFS_DINODE_FMT_EXTENTS && + xfs_dfork_data_extents(dip) == 0 && + (ri->data_extents > 0 || ri->rt_extents > 0) && + bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) { + if (bmdr_minsz + afork_min > lit_sz) { + /* + * The attr for and the stub fork we need to recover + * the data fork won't both fit. Zap the attr fork. + */ + xrep_dinode_zap_afork(ri, dip, mode); + afork_min = bmdr_minsz; + } else { + void *before, *after; + + /* Otherwise, just slide the attr fork up. */ + before = XFS_DFORK_APTR(dip); + dip->di_forkoff = bmdr_minsz >> 3; + after = XFS_DFORK_APTR(dip); + memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp)); + } + } + + /* + * If the attr fork was zapped and we don't have enough space for the + * recovery fork, move the attr fork down. + */ + if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS && + xfs_dfork_attr_extents(dip) == 0 && + ri->attr_extents > 0 && + bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) { + if (dip->di_format == XFS_DINODE_FMT_BTREE) { + /* + * If the data fork is in btree format then we can't + * adjust forkoff because that runs the risk of + * violating the extents/btree format transition rules. + */ + } else if (bmdr_minsz + dfork_min > lit_sz) { + /* + * If we can't move the attr fork, too bad, we lose the + * attr fork and leak its blocks. + */ + xrep_dinode_zap_afork(ri, dip, mode); + } else { + /* + * Otherwise, just slide the attr fork down. The attr + * fork is empty, so we don't have any old contents to + * move here. + */ + dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3; + } + } +} + +/* + * Zap the data/attr forks if we spot anything that isn't going to pass the + * ifork verifiers or the ifork formatters, because we need to get the inode + * into good enough shape that the higher level repair functions can run. 
+ */ +STATIC void +xrep_dinode_zap_forks( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + xfs_extnum_t data_extents; + xfs_extnum_t attr_extents; + xfs_filblks_t nblocks; + uint16_t mode; + bool zap_datafork = false; + bool zap_attrfork = ri->zap_acls; + + trace_xrep_dinode_zap_forks(sc, dip); + + mode = be16_to_cpu(dip->di_mode); + + data_extents = xfs_dfork_data_extents(dip); + attr_extents = xfs_dfork_attr_extents(dip); + nblocks = be64_to_cpu(dip->di_nblocks); + + /* Inode counters don't make sense? */ + if (data_extents > nblocks) + zap_datafork = true; + if (attr_extents > nblocks) + zap_attrfork = true; + if (data_extents + attr_extents > nblocks) + zap_datafork = zap_attrfork = true; + + if (!zap_datafork) + zap_datafork = xrep_dinode_check_dfork(sc, dip, mode); + if (!zap_attrfork) + zap_attrfork = xrep_dinode_check_afork(sc, dip); + + /* Zap whatever's bad. */ + if (zap_attrfork) + xrep_dinode_zap_afork(ri, dip, mode); + if (zap_datafork) + xrep_dinode_zap_dfork(ri, dip, mode); + xrep_dinode_ensure_forkoff(ri, dip, mode); + + /* + * Zero di_nblocks if we don't have any extents at all to satisfy the + * buffer verifier. + */ + data_extents = xfs_dfork_data_extents(dip); + attr_extents = xfs_dfork_attr_extents(dip); + if (data_extents + attr_extents == 0) + dip->di_nblocks = 0; +} + +/* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */ +STATIC int +xrep_dinode_core( + struct xrep_inode *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_buf *bp; + struct xfs_dinode *dip; + xfs_ino_t ino = sc->sm->sm_ino; + int error; + int iget_error; + + /* Figure out what this inode had mapped in both forks. */ + error = xrep_dinode_count_rmaps(ri); + if (error) + return error; + + /* Read the inode cluster buffer. */ + error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, + ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, + NULL); + if (error) + return error; + + /* Make sure we can pass the inode buffer verifier. */ + xrep_dinode_buf(sc, bp); + bp->b_ops = &xfs_inode_buf_ops; + + /* Fix everything the verifier will complain about. */ + dip = xfs_buf_offset(bp, ri->imap.im_boffset); + xrep_dinode_header(sc, dip); + xrep_dinode_mode(ri, dip); + xrep_dinode_flags(sc, dip, ri->rt_extents > 0); + xrep_dinode_size(ri, dip); + xrep_dinode_extsize_hints(sc, dip); + xrep_dinode_zap_forks(ri, dip); + + /* Write out the inode. */ + trace_xrep_dinode_fixed(sc, dip); + xfs_dinode_calc_crc(sc->mp, dip); + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF); + xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset, + ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1); + + /* + * In theory, we've fixed the ondisk inode record enough that we should + * be able to load the inode into the cache. Try to iget that inode + * now while we hold the AGI and the inode cluster buffer and take the + * IOLOCK so that we can continue with repairs without anyone else + * accessing the inode. If iget fails, we still need to commit the + * changes. + */ + iget_error = xchk_iget(sc, ino, &sc->ip); + if (!iget_error) + xchk_ilock(sc, XFS_IOLOCK_EXCL); + + /* + * Commit the inode cluster buffer updates and drop the AGI buffer that + * we've been holding since scrub setup. From here on out, repairs + * deal only with the cached inode. 
+ */ + error = xrep_trans_commit(sc); + if (error) + return error; + + if (iget_error) + return iget_error; + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + + error = xrep_ino_dqattach(sc); + if (error) + return error; + + xchk_ilock(sc, XFS_ILOCK_EXCL); + if (ri->ino_sick_mask) + xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask); + return 0; +} + +/* Fix everything xfs_dinode_verify cares about. */ +STATIC int +xrep_dinode_problems( + struct xrep_inode *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + error = xrep_dinode_core(ri); + if (error) + return error; + + /* We had to fix a totally busted inode, schedule quotacheck. */ + if (XFS_IS_UQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_USER); + if (XFS_IS_GQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); + if (XFS_IS_PQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); + + return 0; +} + +/* + * Fix problems that the verifiers don't care about. In general these are + * errors that don't cause problems elsewhere in the kernel that we can easily + * detect, so we don't check them all that rigorously. + */ + +/* Make sure block and extent counts are ok. */ +STATIC int +xrep_inode_blockcounts( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp; + xfs_filblks_t count; + xfs_filblks_t acount; + xfs_extnum_t nextents; + int error; + + trace_xrep_inode_blockcounts(sc); + + /* Set data fork counters from the data fork mappings. */ + error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, + &nextents, &count); + if (error) + return error; + if (xfs_is_reflink_inode(sc->ip)) { + /* + * data fork blockcount can exceed physical storage if a user + * reflinks the same block over and over again. + */ + ; + } else if (XFS_IS_REALTIME_INODE(sc->ip)) { + if (count >= sc->mp->m_sb.sb_rblocks) + return -EFSCORRUPTED; + } else { + if (count >= sc->mp->m_sb.sb_dblocks) + return -EFSCORRUPTED; + } + error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents); + if (error) + return error; + sc->ip->i_df.if_nextents = nextents; + + /* Set attr fork counters from the attr fork mappings. */ + ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); + if (ifp) { + error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, + &nextents, &acount); + if (error) + return error; + if (count >= sc->mp->m_sb.sb_dblocks) + return -EFSCORRUPTED; + error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK, + nextents); + if (error) + return error; + ifp->if_nextents = nextents; + } else { + acount = 0; + } + + sc->ip->i_nblocks = count + acount; + return 0; +} + +/* Check for invalid uid/gid/prid. 
*/ +STATIC void +xrep_inode_ids( + struct xfs_scrub *sc) +{ + bool dirty = false; + + trace_xrep_inode_ids(sc); + + if (!uid_valid(VFS_I(sc->ip)->i_uid)) { + i_uid_write(VFS_I(sc->ip), 0); + dirty = true; + if (XFS_IS_UQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_USER); + } + + if (!gid_valid(VFS_I(sc->ip)->i_gid)) { + i_gid_write(VFS_I(sc->ip), 0); + dirty = true; + if (XFS_IS_GQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); + } + + if (sc->ip->i_projid == -1U) { + sc->ip->i_projid = 0; + dirty = true; + if (XFS_IS_PQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); + } + + /* strip setuid/setgid if we touched any of the ids */ + if (dirty) + VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID); +} + +static inline void +xrep_clamp_timestamp( + struct xfs_inode *ip, + struct timespec64 *ts) +{ + ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC); + *ts = timestamp_truncate(*ts, VFS_I(ip)); +} + +/* Nanosecond counters can't have more than 1 billion. */ +STATIC void +xrep_inode_timestamps( + struct xfs_inode *ip) +{ + struct timespec64 tstamp; + struct inode *inode = VFS_I(ip); + + tstamp = inode_get_atime(inode); + xrep_clamp_timestamp(ip, &tstamp); + inode_set_atime_to_ts(inode, tstamp); + + tstamp = inode_get_mtime(inode); + xrep_clamp_timestamp(ip, &tstamp); + inode_set_mtime_to_ts(inode, tstamp); + + tstamp = inode_get_ctime(inode); + xrep_clamp_timestamp(ip, &tstamp); + inode_set_ctime_to_ts(inode, tstamp); + + xrep_clamp_timestamp(ip, &ip->i_crtime); +} + +/* Fix inode flags that don't make sense together. */ +STATIC void +xrep_inode_flags( + struct xfs_scrub *sc) +{ + uint16_t mode; + + trace_xrep_inode_flags(sc); + + mode = VFS_I(sc->ip)->i_mode; + + /* Clear junk flags */ + if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY) + sc->ip->i_diflags &= ~XFS_DIFLAG_ANY; + + /* NEWRTBM only applies to realtime bitmaps */ + if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino) + sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM; + else + sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM; + + /* These only make sense for directories. */ + if (!S_ISDIR(mode)) + sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT | + XFS_DIFLAG_EXTSZINHERIT | + XFS_DIFLAG_PROJINHERIT | + XFS_DIFLAG_NOSYMLINKS); + + /* These only make sense for files. */ + if (!S_ISREG(mode)) + sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME | + XFS_DIFLAG_EXTSIZE); + + /* These only make sense for non-rt files. */ + if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) + sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM; + + /* Immutable and append only? Drop the append. */ + if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) && + (sc->ip->i_diflags & XFS_DIFLAG_APPEND)) + sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND; + + /* Clear junk flags. */ + if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY; + + /* No reflink flag unless we support it and it's a file. */ + if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode)) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + + /* DAX only applies to files and dirs. */ + if (!(S_ISREG(mode) || S_ISDIR(mode))) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX; + + /* No reflink files on the realtime device. */ + if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; +} + +/* + * Fix size problems with block/node format directories. If we fail to find + * the extent list, just bail out and let the bmapbtd repair functions clean + * up that mess. 
+ */ +STATIC void +xrep_inode_blockdir_size( + struct xfs_scrub *sc) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + xfs_fileoff_t off; + int error; + + trace_xrep_inode_blockdir_size(sc); + + error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK); + if (error) + return; + + /* Find the last block before 32G; this is the dir size. */ + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE); + if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) { + /* zero-extents directory? */ + return; + } + + off = got.br_startoff + got.br_blockcount; + sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE, + XFS_FSB_TO_B(sc->mp, off)); +} + +/* Fix size problems with short format directories. */ +STATIC void +xrep_inode_sfdir_size( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp; + + trace_xrep_inode_sfdir_size(sc); + + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + sc->ip->i_disk_size = ifp->if_bytes; +} + +/* + * Fix any irregularities in a directory inode's size now that we can iterate + * extent maps and access other regular inode data. + */ +STATIC void +xrep_inode_dir_size( + struct xfs_scrub *sc) +{ + trace_xrep_inode_dir_size(sc); + + switch (sc->ip->i_df.if_format) { + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + xrep_inode_blockdir_size(sc); + break; + case XFS_DINODE_FMT_LOCAL: + xrep_inode_sfdir_size(sc); + break; + } +} + +/* Fix extent size hint problems. */ +STATIC void +xrep_inode_extsize( + struct xfs_scrub *sc) +{ + /* Fix misaligned extent size hints on a directory. */ + if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) { + sc->ip->i_extsize = 0; + sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT; + } +} + +/* Fix any irregularities in an inode that the verifiers don't catch. */ +STATIC int +xrep_inode_problems( + struct xfs_scrub *sc) +{ + int error; + + error = xrep_inode_blockcounts(sc); + if (error) + return error; + xrep_inode_timestamps(sc->ip); + xrep_inode_flags(sc); + xrep_inode_ids(sc); + /* + * We can now do a better job fixing the size of a directory now that + * we can scan the data fork extents than we could in xrep_dinode_size. + */ + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) + xrep_inode_dir_size(sc); + xrep_inode_extsize(sc); + + trace_xrep_inode_fixed(sc); + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return xrep_roll_trans(sc); +} + +/* Repair an inode's fields. */ +int +xrep_inode( + struct xfs_scrub *sc) +{ + int error = 0; + + /* + * No inode? That means we failed the _iget verifiers. Repair all + * the things that the inode verifiers care about, then retry _iget. + */ + if (!sc->ip) { + struct xrep_inode *ri = sc->buf; + + ASSERT(ri != NULL); + + error = xrep_dinode_problems(ri); + if (error) + return error; + + /* By this point we had better have a working incore inode. */ + if (!sc->ip) + return -EFSCORRUPTED; + } + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* If we found corruption of any kind, try to fix it. */ + if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) { + error = xrep_inode_problems(sc); + if (error) + return error; + } + + /* See if we can clear the reflink flag. 
 */
+	if (xfs_is_reflink_inode(sc->ip)) {
+		error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
+		if (error)
+			return error;
+	}
+
+	return xrep_defer_finish(sc);
+}
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
new file mode 100644
index 000000000000..bb6d980b4fcd
--- /dev/null
+++ b/fs/xfs/scrub/newbt.c
@@ -0,0 +1,559 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+#include "xfs_defer.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/newbt.h"
+
+/*
+ * Estimate proper slack values for a btree that's being reloaded.
+ *
+ * Under most circumstances, we'll take whatever default loading value the
+ * btree bulk loading code calculates for us. However, there are some
+ * exceptions to this rule:
+ *
+ * (0) If someone turned one of the debug knobs.
+ * (1) If this is a per-AG btree and the AG has less than 10% space free.
+ * (2) If this is an inode btree and the FS has less than 10% space free.
+ *
+ * In any of these cases, format the new btree blocks almost completely full
+ * to minimize space usage.
+ */
+static void
+xrep_newbt_estimate_slack(
+	struct xrep_newbt	*xnr)
+{
+	struct xfs_scrub	*sc = xnr->sc;
+	struct xfs_btree_bload	*bload = &xnr->bload;
+	uint64_t		free;
+	uint64_t		sz;
+
+	/*
+	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
+	 * unless someone has set them otherwise, so we just pull the values
+	 * here.
+	 */
+	bload->leaf_slack = xfs_globals.bload_leaf_slack;
+	bload->node_slack = xfs_globals.bload_node_slack;
+
+	if (sc->ops->type == ST_PERAG) {
+		free = sc->sa.pag->pagf_freeblks;
+		sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
+	} else {
+		free = percpu_counter_sum(&sc->mp->m_fdblocks);
+		sz = sc->mp->m_sb.sb_dblocks;
+	}
+
+	/* No further changes if there's more than 10% free space left. */
+	if (free >= div_u64(sz, 10))
+		return;
+
+	/*
+	 * We're low on space; load the btrees as tightly as possible. Leave
+	 * a couple of open slots in each btree block so that we don't end up
+	 * splitting the btrees like crazy after a mount.
+	 */
+	if (bload->leaf_slack < 0)
+		bload->leaf_slack = 2;
+	if (bload->node_slack < 0)
+		bload->node_slack = 2;
+}
+
+/* Initialize accounting resources for staging a new AG btree. */
+void
+xrep_newbt_init_ag(
+	struct xrep_newbt		*xnr,
+	struct xfs_scrub		*sc,
+	const struct xfs_owner_info	*oinfo,
+	xfs_fsblock_t			alloc_hint,
+	enum xfs_ag_resv_type		resv)
+{
+	memset(xnr, 0, sizeof(struct xrep_newbt));
+	xnr->sc = sc;
+	xnr->oinfo = *oinfo; /* structure copy */
+	xnr->alloc_hint = alloc_hint;
+	xnr->resv = resv;
+	INIT_LIST_HEAD(&xnr->resv_list);
+	xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
+	xrep_newbt_estimate_slack(xnr);
+}
+
+/* Initialize accounting resources for staging a new inode fork btree.
*/ +int +xrep_newbt_init_inode( + struct xrep_newbt *xnr, + struct xfs_scrub *sc, + int whichfork, + const struct xfs_owner_info *oinfo) +{ + struct xfs_ifork *ifp; + + ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS); + if (!ifp) + return -ENOMEM; + + xrep_newbt_init_ag(xnr, sc, oinfo, + XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino), + XFS_AG_RESV_NONE); + xnr->ifake.if_fork = ifp; + xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork); + return 0; +} + +/* + * Initialize accounting resources for staging a new btree. Callers are + * expected to add their own reservations (and clean them up) manually. + */ +void +xrep_newbt_init_bare( + struct xrep_newbt *xnr, + struct xfs_scrub *sc) +{ + xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK, + XFS_AG_RESV_NONE); +} + +/* + * Designate specific blocks to be used to build our new btree. @pag must be + * a passive reference. + */ +STATIC int +xrep_newbt_add_blocks( + struct xrep_newbt *xnr, + struct xfs_perag *pag, + const struct xfs_alloc_arg *args) +{ + struct xfs_mount *mp = xnr->sc->mp; + struct xrep_newbt_resv *resv; + int error; + + resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS); + if (!resv) + return -ENOMEM; + + INIT_LIST_HEAD(&resv->list); + resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + resv->len = args->len; + resv->used = 0; + resv->pag = xfs_perag_hold(pag); + + if (args->tp) { + ASSERT(xnr->oinfo.oi_offset == 0); + + error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap); + if (error) + goto out_pag; + } + + list_add_tail(&resv->list, &xnr->resv_list); + return 0; +out_pag: + xfs_perag_put(resv->pag); + kfree(resv); + return error; +} + +/* + * Add an extent to the new btree reservation pool. Callers are required to + * reap this reservation manually if the repair is cancelled. @pag must be a + * passive reference. + */ +int +xrep_newbt_add_extent( + struct xrep_newbt *xnr, + struct xfs_perag *pag, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_mount *mp = xnr->sc->mp; + struct xfs_alloc_arg args = { + .tp = NULL, /* no autoreap */ + .oinfo = xnr->oinfo, + .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno), + .len = len, + .resv = xnr->resv, + }; + + return xrep_newbt_add_blocks(xnr, pag, &args); +} + +/* Don't let our allocation hint take us beyond this AG */ +static inline void +xrep_newbt_validate_ag_alloc_hint( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint); + + if (agno == sc->sa.pag->pag_agno && + xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) + return; + + xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_AGFL_BLOCK(sc->mp) + 1); +} + +/* Allocate disk space for a new per-AG btree. 
*/ +STATIC int +xrep_newbt_alloc_ag_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_mount *mp = sc->mp; + int error = 0; + + ASSERT(sc->sa.pag != NULL); + + while (nr_blocks > 0) { + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = mp, + .oinfo = xnr->oinfo, + .minlen = 1, + .maxlen = nr_blocks, + .prod = 1, + .resv = xnr->resv, + }; + xfs_agnumber_t agno; + + xrep_newbt_validate_ag_alloc_hint(xnr); + + error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + agno = XFS_FSB_TO_AGNO(mp, args.fsbno); + + trace_xrep_newbt_alloc_ag_blocks(mp, agno, + XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, + xnr->oinfo.oi_owner); + + if (agno != sc->sa.pag->pag_agno) { + ASSERT(agno == sc->sa.pag->pag_agno); + return -EFSCORRUPTED; + } + + error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args); + if (error) + return error; + + nr_blocks -= args.len; + xnr->alloc_hint = args.fsbno + args.len; + + error = xrep_defer_finish(sc); + if (error) + return error; + } + + return 0; +} + +/* Don't let our allocation hint take us beyond EOFS */ +static inline void +xrep_newbt_validate_file_alloc_hint( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + + if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) + return; + + xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1); +} + +/* Allocate disk space for our new file-based btree. */ +STATIC int +xrep_newbt_alloc_file_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_mount *mp = sc->mp; + int error = 0; + + while (nr_blocks > 0) { + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = mp, + .oinfo = xnr->oinfo, + .minlen = 1, + .maxlen = nr_blocks, + .prod = 1, + .resv = xnr->resv, + }; + struct xfs_perag *pag; + xfs_agnumber_t agno; + + xrep_newbt_validate_file_alloc_hint(xnr); + + error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + agno = XFS_FSB_TO_AGNO(mp, args.fsbno); + + trace_xrep_newbt_alloc_file_blocks(mp, agno, + XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, + xnr->oinfo.oi_owner); + + pag = xfs_perag_get(mp, agno); + if (!pag) { + ASSERT(0); + return -EFSCORRUPTED; + } + + error = xrep_newbt_add_blocks(xnr, pag, &args); + xfs_perag_put(pag); + if (error) + return error; + + nr_blocks -= args.len; + xnr->alloc_hint = args.fsbno + args.len; + + error = xrep_defer_finish(sc); + if (error) + return error; + } + + return 0; +} + +/* Allocate disk space for our new btree. */ +int +xrep_newbt_alloc_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + if (xnr->sc->ip) + return xrep_newbt_alloc_file_blocks(xnr, nr_blocks); + return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks); +} + +/* + * Free the unused part of a space extent that was reserved for a new ondisk + * structure. Returns the number of EFIs logged or a negative errno. + */ +STATIC int +xrep_newbt_free_extent( + struct xrep_newbt *xnr, + struct xrep_newbt_resv *resv, + bool btree_committed) +{ + struct xfs_scrub *sc = xnr->sc; + xfs_agblock_t free_agbno = resv->agbno; + xfs_extlen_t free_aglen = resv->len; + xfs_fsblock_t fsbno; + int error; + + if (!btree_committed || resv->used == 0) { + /* + * If we're not committing a new btree or we didn't use the + * space reservation, let the existing EFI free the entire + * space extent. 
+ */ + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, + free_agbno, free_aglen, xnr->oinfo.oi_owner); + xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); + return 1; + } + + /* + * We used space and committed the btree. Cancel the autoreap, remove + * the written blocks from the reservation, and possibly log a new EFI + * to free any unused reservation space. + */ + xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap); + free_agbno += resv->used; + free_aglen -= resv->used; + + if (free_aglen == 0) + return 0; + + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, + free_aglen, xnr->oinfo.oi_owner); + + ASSERT(xnr->resv != XFS_AG_RESV_AGFL); + ASSERT(xnr->resv != XFS_AG_RESV_IGNORE); + + /* + * Use EFIs to free the reservations. This reduces the chance + * that we leak blocks if the system goes down. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno); + error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo, + xnr->resv, true); + if (error) + return error; + + return 1; +} + +/* Free all the accounting info and disk space we reserved for a new btree. */ +STATIC int +xrep_newbt_free( + struct xrep_newbt *xnr, + bool btree_committed) +{ + struct xfs_scrub *sc = xnr->sc; + struct xrep_newbt_resv *resv, *n; + unsigned int freed = 0; + int error = 0; + + /* + * If the filesystem already went down, we can't free the blocks. Skip + * ahead to freeing the incore metadata because we can't fix anything. + */ + if (xfs_is_shutdown(sc->mp)) + goto junkit; + + list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { + int ret; + + ret = xrep_newbt_free_extent(xnr, resv, btree_committed); + list_del(&resv->list); + xfs_perag_put(resv->pag); + kfree(resv); + if (ret < 0) { + error = ret; + goto junkit; + } + + freed += ret; + if (freed >= XREP_MAX_ITRUNCATE_EFIS) { + error = xrep_defer_finish(sc); + if (error) + goto junkit; + freed = 0; + } + } + + if (freed) + error = xrep_defer_finish(sc); + +junkit: + /* + * If we still have reservations attached to @newbt, cleanup must have + * failed and the filesystem is about to go down. Clean up the incore + * reservations and try to commit to freeing the space we used. + */ + list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { + xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); + list_del(&resv->list); + xfs_perag_put(resv->pag); + kfree(resv); + } + + if (sc->ip) { + kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork); + xnr->ifake.if_fork = NULL; + } + + return error; +} + +/* + * Free all the accounting info and unused disk space allocations after + * committing a new btree. + */ +int +xrep_newbt_commit( + struct xrep_newbt *xnr) +{ + return xrep_newbt_free(xnr, true); +} + +/* + * Free all the accounting info and all of the disk space we reserved for a new + * btree that we're not going to commit. We want to try to roll things back + * cleanly for things like ENOSPC midway through allocation. + */ +void +xrep_newbt_cancel( + struct xrep_newbt *xnr) +{ + xrep_newbt_free(xnr, false); +} + +/* Feed one of the reserved btree blocks to the bulk loader. */ +int +xrep_newbt_claim_block( + struct xfs_btree_cur *cur, + struct xrep_newbt *xnr, + union xfs_btree_ptr *ptr) +{ + struct xrep_newbt_resv *resv; + struct xfs_mount *mp = cur->bc_mp; + xfs_agblock_t agbno; + + /* + * The first item in the list should always have a free block unless + * we're completely out. 
+ */ + resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list); + if (resv->used == resv->len) + return -ENOSPC; + + /* + * Peel off a block from the start of the reservation. We allocate + * blocks in order to place blocks on disk in increasing record or key + * order. The block reservations tend to end up on the list in + * decreasing order, which hopefully results in leaf blocks ending up + * together. + */ + agbno = resv->agbno + resv->used; + resv->used++; + + /* If we used all the blocks in this reservation, move it to the end. */ + if (resv->used == resv->len) + list_move_tail(&resv->list, &xnr->resv_list); + + trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1, + xnr->oinfo.oi_owner); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno, + agbno)); + else + ptr->s = cpu_to_be32(agbno); + + /* Relog all the EFIs. */ + return xrep_defer_finish(xnr->sc); +} + +/* How many reserved blocks are unused? */ +unsigned int +xrep_newbt_unused_blocks( + struct xrep_newbt *xnr) +{ + struct xrep_newbt_resv *resv; + unsigned int unused = 0; + + list_for_each_entry(resv, &xnr->resv_list, list) + unused += resv->len - resv->used; + return unused; +} diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h new file mode 100644 index 000000000000..89f8e3970b1f --- /dev/null +++ b/fs/xfs/scrub/newbt.h @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_NEWBT_H__ +#define __XFS_SCRUB_NEWBT_H__ + +struct xrep_newbt_resv { + /* Link to list of extents that we've reserved. */ + struct list_head list; + + struct xfs_perag *pag; + + /* Auto-freeing this reservation if we don't commit. */ + struct xfs_alloc_autoreap autoreap; + + /* AG block of the extent we reserved. */ + xfs_agblock_t agbno; + + /* Length of the reservation. */ + xfs_extlen_t len; + + /* How much of this reservation has been used. */ + xfs_extlen_t used; +}; + +struct xrep_newbt { + struct xfs_scrub *sc; + + /* List of extents that we've reserved. */ + struct list_head resv_list; + + /* Fake root for new btree. 
 */
+	union {
+		struct xbtree_afakeroot	afake;
+		struct xbtree_ifakeroot	ifake;
+	};
+
+	/* rmap owner of these blocks */
+	struct xfs_owner_info	oinfo;
+
+	/* btree geometry for the bulk loader */
+	struct xfs_btree_bload	bload;
+
+	/* Allocation hint */
+	xfs_fsblock_t		alloc_hint;
+
+	/* per-ag reservation type */
+	enum xfs_ag_resv_type	resv;
+};
+
+void xrep_newbt_init_bare(struct xrep_newbt *xnr, struct xfs_scrub *sc);
+void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc,
+		const struct xfs_owner_info *oinfo, xfs_fsblock_t alloc_hint,
+		enum xfs_ag_resv_type resv);
+int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc,
+		int whichfork, const struct xfs_owner_info *oinfo);
+int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks);
+int xrep_newbt_add_extent(struct xrep_newbt *xnr, struct xfs_perag *pag,
+		xfs_agblock_t agbno, xfs_extlen_t len);
+void xrep_newbt_cancel(struct xrep_newbt *xnr);
+int xrep_newbt_commit(struct xrep_newbt *xnr);
+int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xnr,
+		union xfs_btree_ptr *ptr);
+unsigned int xrep_newbt_unused_blocks(struct xrep_newbt *xnr);
+
+#endif /* __XFS_SCRUB_NEWBT_H__ */
diff --git a/fs/xfs/scrub/off_bitmap.h b/fs/xfs/scrub/off_bitmap.h
new file mode 100644
index 000000000000..0d3f9e6c1aad
--- /dev/null
+++ b/fs/xfs/scrub/off_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_OFF_BITMAP_H__
+#define __XFS_SCRUB_OFF_BITMAP_H__
+
+/* Bitmaps, but type-checked for xfs_fileoff_t */
+
+struct xoff_bitmap {
+	struct xbitmap64	offbitmap;
+};
+
+static inline void xoff_bitmap_init(struct xoff_bitmap *bitmap)
+{
+	xbitmap64_init(&bitmap->offbitmap);
+}
+
+static inline void xoff_bitmap_destroy(struct xoff_bitmap *bitmap)
+{
+	xbitmap64_destroy(&bitmap->offbitmap);
+}
+
+static inline int xoff_bitmap_set(struct xoff_bitmap *bitmap,
+		xfs_fileoff_t off, xfs_filblks_t len)
+{
+	return xbitmap64_set(&bitmap->offbitmap, off, len);
+}
+
+static inline int xoff_bitmap_walk(struct xoff_bitmap *bitmap,
+		xbitmap64_walk_fn fn, void *priv)
+{
+	return xbitmap64_walk(&bitmap->offbitmap, fn, priv);
+}
+
+#endif /* __XFS_SCRUB_OFF_BITMAP_H__ */
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index e6155d86f791..7db873672146
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -156,6 +156,16 @@ xchk_parent_validate(
 		goto out_rele;
 	}
 
+	/*
+	 * We cannot yet validate this parent pointer if the directory looks as
+	 * though it has been zapped by the inode record repair code.
+	 */
+	if (xchk_dir_looks_zapped(dp)) {
+		error = -EBUSY;
+		xchk_set_incomplete(sc);
+		goto out_unlock;
+	}
+
 	/* Look for a directory entry in the parent pointing to the child. */
 	error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc);
 	if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
@@ -217,6 +227,13 @@ xchk_parent(
 		 */
 		error = xchk_parent_validate(sc, parent_ino);
 	} while (error == -EAGAIN);
+	if (error == -EBUSY) {
+		/*
+		 * We could not scan a directory, so we marked the check
+		 * incomplete. No further error return is necessary.
+ */ + return 0; + } return error; } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 5671c8153433..183d531875ea 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -6,6 +6,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_bit.h" #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" @@ -17,9 +18,10 @@ #include "xfs_bmap.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/quota.h" /* Convert a scrub type code to a DQ flag, or return 0 if error. */ -static inline xfs_dqtype_t +xfs_dqtype_t xchk_quota_to_dqtype( struct xfs_scrub *sc) { @@ -75,14 +77,70 @@ struct xchk_quota_info { xfs_dqid_t last_id; }; +/* There's a written block backing this dquot, right? */ +STATIC int +xchk_quota_item_bmap( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + xfs_fileoff_t offset) +{ + struct xfs_bmbt_irec irec; + struct xfs_mount *mp = sc->mp; + int nmaps = 1; + int error; + + if (!xfs_verify_fileoff(mp, offset)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + if (dq->q_fileoffset != offset) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0); + if (error) + return error; + + if (nmaps != 1) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + if (!xfs_verify_fsbno(mp, irec.br_startblock)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + if (XFS_FSB_TO_DADDR(mp, irec.br_startblock) != dq->q_blkno) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + if (!xfs_bmap_is_written_extent(&irec)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + return 0; +} + +/* Complain if a quota timer is incorrectly set. */ +static inline void +xchk_quota_item_timer( + struct xfs_scrub *sc, + xfs_fileoff_t offset, + const struct xfs_dquot_res *res) +{ + if ((res->softlimit && res->count > res->softlimit) || + (res->hardlimit && res->count > res->hardlimit)) { + if (!res->timer) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + } else { + if (res->timer) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + } +} + /* Scrub the fields in an individual quota item. */ STATIC int xchk_quota_item( - struct xfs_dquot *dq, - xfs_dqtype_t dqtype, - void *priv) + struct xchk_quota_info *sqi, + struct xfs_dquot *dq) { - struct xchk_quota_info *sqi = priv; struct xfs_scrub *sc = sqi->sc; struct xfs_mount *mp = sc->mp; struct xfs_quotainfo *qi = mp->m_quotainfo; @@ -94,6 +152,17 @@ xchk_quota_item( return error; /* + * We want to validate the bmap record for the storage backing this + * dquot, so we need to lock the dquot and the quota file. For quota + * operations, the locking order is first the ILOCK and then the dquot. + * However, dqiterate gave us a locked dquot, so drop the dquot lock to + * get the ILOCK. + */ + xfs_dqunlock(dq); + xchk_ilock(sc, XFS_ILOCK_SHARED); + xfs_dqlock(dq); + + /* * Except for the root dquot, the actual dquot we got must either have * the same or higher id as we saw before. */ @@ -103,6 +172,11 @@ xchk_quota_item( sqi->last_id = dq->q_id; + error = xchk_quota_item_bmap(sc, dq, offset); + xchk_iunlock(sc, XFS_ILOCK_SHARED); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error)) + return error; + /* * Warn if the hard limits are larger than the fs. 
* Administrators can do this, though in production this seems @@ -166,6 +240,10 @@ xchk_quota_item( dq->q_rtb.count > dq->q_rtb.hardlimit) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + xchk_quota_item_timer(sc, offset, &dq->q_blk); + xchk_quota_item_timer(sc, offset, &dq->q_ino); + xchk_quota_item_timer(sc, offset, &dq->q_rtb); + out: if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return -ECANCELED; @@ -191,7 +269,7 @@ xchk_quota_data_fork( return error; /* Check for data fork problems that apply only to quota files. */ - max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk; + max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk; ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error)) @@ -218,9 +296,11 @@ int xchk_quota( struct xfs_scrub *sc) { - struct xchk_quota_info sqi; + struct xchk_dqiter cursor = { }; + struct xchk_quota_info sqi = { .sc = sc }; struct xfs_mount *mp = sc->mp; struct xfs_quotainfo *qi = mp->m_quotainfo; + struct xfs_dquot *dq; xfs_dqtype_t dqtype; int error = 0; @@ -239,10 +319,15 @@ xchk_quota( * functions. */ xchk_iunlock(sc, sc->ilock_flags); - sqi.sc = sc; - sqi.last_id = 0; - error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi); - xchk_ilock(sc, XFS_ILOCK_EXCL); + + /* Now look for things that the quota verifiers won't complain about. */ + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xchk_quota_item(&sqi, dq); + xfs_qm_dqput(dq); + if (error) + break; + } if (error == -ECANCELED) error = 0; if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, diff --git a/fs/xfs/scrub/quota.h b/fs/xfs/scrub/quota.h new file mode 100644 index 000000000000..6c7134ce2385 --- /dev/null +++ b/fs/xfs/scrub/quota.h @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_QUOTA_H__ +#define __XFS_SCRUB_QUOTA_H__ + +xfs_dqtype_t xchk_quota_to_dqtype(struct xfs_scrub *sc); + +/* dquot iteration code */ + +struct xchk_dqiter { + struct xfs_scrub *sc; + + /* Quota file that we're walking. */ + struct xfs_inode *quota_ip; + + /* Cached data fork mapping for the dquot. */ + struct xfs_bmbt_irec bmap; + + /* The next dquot to scan. */ + uint64_t id; + + /* Quota type (user/group/project). */ + xfs_dqtype_t dqtype; + + /* Data fork sequence number to detect stale mappings. */ + unsigned int if_seq; +}; + +void xchk_dqiter_init(struct xchk_dqiter *cursor, struct xfs_scrub *sc, + xfs_dqtype_t dqtype); +int xchk_dquot_iter(struct xchk_dqiter *cursor, struct xfs_dquot **dqpp); + +#endif /* __XFS_SCRUB_QUOTA_H__ */ diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c new file mode 100644 index 000000000000..0bab4c30cb85 --- /dev/null +++ b/fs/xfs/scrub/quota_repair.c @@ -0,0 +1,575 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_dquot.h" +#include "xfs_dquot_item.h" +#include "xfs_reflink.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/quota.h" +#include "scrub/trace.h" +#include "scrub/repair.h" + +/* + * Quota Repair + * ============ + * + * Quota repairs are fairly simplistic; we fix everything that the dquot + * verifiers complain about, cap any counters or limits that make no sense, + * and schedule a quotacheck if we had to fix anything. We also repair any + * data fork extent records that don't apply to metadata files. + */ + +struct xrep_quota_info { + struct xfs_scrub *sc; + bool need_quotacheck; +}; + +/* + * Allocate a new block into a sparse hole in the quota file backing this + * dquot, initialize the block, and commit the whole mess. + */ +STATIC int +xrep_quota_item_fill_bmap_hole( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + struct xfs_bmbt_irec *irec) +{ + struct xfs_buf *bp; + struct xfs_mount *mp = sc->mp; + int nmaps = 1; + int error; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Map a block into the file. */ + error = xfs_trans_reserve_more(sc->tp, XFS_QM_DQALLOC_SPACE_RES(mp), + 0); + if (error) + return error; + + error = xfs_bmapi_write(sc->tp, sc->ip, dq->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, + irec, &nmaps); + if (error) + return error; + if (nmaps != 1) + return -ENOSPC; + + dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec->br_startblock); + + trace_xrep_dquot_item_fill_bmap_hole(sc->mp, dq->q_type, dq->q_id); + + /* Initialize the new block. */ + error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp, dq->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp); + if (error) + return error; + bp->b_ops = &xfs_dquot_buf_ops; + + xfs_qm_init_dquot_blk(sc->tp, dq->q_id, dq->q_type, bp); + xfs_buf_set_ref(bp, XFS_DQUOT_REF); + + /* + * Finish the mapping transactions and roll one more time to + * disconnect sc->ip from sc->tp. + */ + error = xrep_defer_finish(sc); + if (error) + return error; + return xfs_trans_roll(&sc->tp); +} + +/* Make sure there's a written block backing this dquot */ +STATIC int +xrep_quota_item_bmap( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + bool *dirty) +{ + struct xfs_bmbt_irec irec; + struct xfs_mount *mp = sc->mp; + struct xfs_quotainfo *qi = mp->m_quotainfo; + xfs_fileoff_t offset = dq->q_id / qi->qi_dqperchunk; + int nmaps = 1; + int error; + + /* The computed file offset should always be valid. */ + if (!xfs_verify_fileoff(mp, offset)) { + ASSERT(xfs_verify_fileoff(mp, offset)); + return -EFSCORRUPTED; + } + dq->q_fileoffset = offset; + + error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0); + if (error) + return error; + + if (nmaps < 1 || !xfs_bmap_is_real_extent(&irec)) { + /* Hole/delalloc extent; allocate a real block. */ + error = xrep_quota_item_fill_bmap_hole(sc, dq, &irec); + if (error) + return error; + } else if (irec.br_state != XFS_EXT_NORM) { + /* Unwritten extent, which we already took care of? 
*/ + ASSERT(irec.br_state == XFS_EXT_NORM); + return -EFSCORRUPTED; + } else if (dq->q_blkno != XFS_FSB_TO_DADDR(mp, irec.br_startblock)) { + /* + * If the cached daddr is incorrect, repair probably punched a + * hole out of the quota file and filled it back in with a new + * block. Update the block mapping in the dquot. + */ + dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec.br_startblock); + } + + *dirty = true; + return 0; +} + +/* Reset quota timers if incorrectly set. */ +static inline void +xrep_quota_item_timer( + struct xfs_scrub *sc, + const struct xfs_dquot_res *res, + bool *dirty) +{ + if ((res->softlimit && res->count > res->softlimit) || + (res->hardlimit && res->count > res->hardlimit)) { + if (!res->timer) + *dirty = true; + } else { + if (res->timer) + *dirty = true; + } +} + +/* Scrub the fields in an individual quota item. */ +STATIC int +xrep_quota_item( + struct xrep_quota_info *rqi, + struct xfs_dquot *dq) +{ + struct xfs_scrub *sc = rqi->sc; + struct xfs_mount *mp = sc->mp; + xfs_ino_t fs_icount; + bool dirty = false; + int error = 0; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * We might need to fix holes in the bmap record for the storage + * backing this dquot, so we need to lock the dquot and the quota file. + * dqiterate gave us a locked dquot, so drop the dquot lock to get the + * ILOCK_EXCL. + */ + xfs_dqunlock(dq); + xchk_ilock(sc, XFS_ILOCK_EXCL); + xfs_dqlock(dq); + + error = xrep_quota_item_bmap(sc, dq, &dirty); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + if (error) + return error; + + /* Check the limits. */ + if (dq->q_blk.softlimit > dq->q_blk.hardlimit) { + dq->q_blk.softlimit = dq->q_blk.hardlimit; + dirty = true; + } + + if (dq->q_ino.softlimit > dq->q_ino.hardlimit) { + dq->q_ino.softlimit = dq->q_ino.hardlimit; + dirty = true; + } + + if (dq->q_rtb.softlimit > dq->q_rtb.hardlimit) { + dq->q_rtb.softlimit = dq->q_rtb.hardlimit; + dirty = true; + } + + /* + * Check that usage doesn't exceed physical limits. However, on + * a reflink filesystem we're allowed to exceed physical space + * if there are no quota limits. We don't know what the real number + * is, but we can make quotacheck find out for us. + */ + if (!xfs_has_reflink(mp) && dq->q_blk.count > mp->m_sb.sb_dblocks) { + dq->q_blk.reserved -= dq->q_blk.count; + dq->q_blk.reserved += mp->m_sb.sb_dblocks; + dq->q_blk.count = mp->m_sb.sb_dblocks; + rqi->need_quotacheck = true; + dirty = true; + } + fs_icount = percpu_counter_sum(&mp->m_icount); + if (dq->q_ino.count > fs_icount) { + dq->q_ino.reserved -= dq->q_ino.count; + dq->q_ino.reserved += fs_icount; + dq->q_ino.count = fs_icount; + rqi->need_quotacheck = true; + dirty = true; + } + if (dq->q_rtb.count > mp->m_sb.sb_rblocks) { + dq->q_rtb.reserved -= dq->q_rtb.count; + dq->q_rtb.reserved += mp->m_sb.sb_rblocks; + dq->q_rtb.count = mp->m_sb.sb_rblocks; + rqi->need_quotacheck = true; + dirty = true; + } + + xrep_quota_item_timer(sc, &dq->q_blk, &dirty); + xrep_quota_item_timer(sc, &dq->q_ino, &dirty); + xrep_quota_item_timer(sc, &dq->q_rtb, &dirty); + + if (!dirty) + return 0; + + trace_xrep_dquot_item(sc->mp, dq->q_type, dq->q_id); + + dq->q_flags |= XFS_DQFLAG_DIRTY; + xfs_trans_dqjoin(sc->tp, dq); + if (dq->q_id) { + xfs_qm_adjust_dqlimits(dq); + xfs_qm_adjust_dqtimers(dq); + } + xfs_trans_log_dquot(sc->tp, dq); + error = xfs_trans_roll(&sc->tp); + xfs_dqlock(dq); + return error; +} + +/* Fix a quota timer so that we can pass the verifier. 
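+ * For example (illustrative values only): if d_blk_softlimit is 100
+ * blocks, d_bcount is 150, and d_btimer is zero, the timer is set to the
+ * current time plus the default grace period, encoded as a bigtime
+ * timestamp when the dquot carries XFS_DQTYPE_BIGTIME.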
*/ +STATIC void +xrep_quota_fix_timer( + struct xfs_mount *mp, + const struct xfs_disk_dquot *ddq, + __be64 softlimit, + __be64 countnow, + __be32 *timer, + time64_t timelimit) +{ + uint64_t soft = be64_to_cpu(softlimit); + uint64_t count = be64_to_cpu(countnow); + time64_t new_timer; + uint32_t t; + + if (!soft || count <= soft || *timer != 0) + return; + + new_timer = xfs_dquot_set_timeout(mp, + ktime_get_real_seconds() + timelimit); + if (ddq->d_type & XFS_DQTYPE_BIGTIME) + t = xfs_dq_unix_to_bigtime(new_timer); + else + t = new_timer; + + *timer = cpu_to_be32(t); +} + +/* Fix anything the verifiers complain about. */ +STATIC int +xrep_quota_block( + struct xfs_scrub *sc, + xfs_daddr_t daddr, + xfs_dqtype_t dqtype, + xfs_dqid_t id) +{ + struct xfs_dqblk *dqblk; + struct xfs_disk_dquot *ddq; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_def_quota *defq = xfs_get_defquota(qi, dqtype); + struct xfs_buf *bp = NULL; + enum xfs_blft buftype = 0; + int i; + int error; + + error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, daddr, + qi->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); + switch (error) { + case -EFSBADCRC: + case -EFSCORRUPTED: + /* Failed verifier, retry read with no ops. */ + error = xfs_trans_read_buf(sc->mp, sc->tp, + sc->mp->m_ddev_targp, daddr, qi->qi_dqchunklen, + 0, &bp, NULL); + if (error) + return error; + break; + case 0: + dqblk = bp->b_addr; + ddq = &dqblk[0].dd_diskdq; + + /* + * If there's nothing that would impede a dqiterate, we're + * done. + */ + if ((ddq->d_type & XFS_DQTYPE_REC_MASK) != dqtype || + id == be32_to_cpu(ddq->d_id)) { + xfs_trans_brelse(sc->tp, bp); + return 0; + } + break; + default: + return error; + } + + /* Something's wrong with the block, fix the whole thing. */ + dqblk = bp->b_addr; + bp->b_ops = &xfs_dquot_buf_ops; + for (i = 0; i < qi->qi_dqperchunk; i++, dqblk++) { + ddq = &dqblk->dd_diskdq; + + trace_xrep_disk_dquot(sc->mp, dqtype, id + i); + + ddq->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + ddq->d_version = XFS_DQUOT_VERSION; + ddq->d_type = dqtype; + ddq->d_id = cpu_to_be32(id + i); + + if (xfs_has_bigtime(sc->mp) && ddq->d_id) + ddq->d_type |= XFS_DQTYPE_BIGTIME; + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_blk_softlimit, + ddq->d_bcount, &ddq->d_btimer, + defq->blk.time); + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_ino_softlimit, + ddq->d_icount, &ddq->d_itimer, + defq->ino.time); + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_rtb_softlimit, + ddq->d_rtbcount, &ddq->d_rtbtimer, + defq->rtb.time); + + /* We only support v5 filesystems so always set these. */ + uuid_copy(&dqblk->dd_uuid, &sc->mp->m_sb.sb_meta_uuid); + xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + dqblk->dd_lsn = 0; + } + switch (dqtype) { + case XFS_DQTYPE_USER: + buftype = XFS_BLFT_UDQUOT_BUF; + break; + case XFS_DQTYPE_GROUP: + buftype = XFS_BLFT_GDQUOT_BUF; + break; + case XFS_DQTYPE_PROJ: + buftype = XFS_BLFT_PDQUOT_BUF; + break; + } + xfs_trans_buf_set_type(sc->tp, bp, buftype); + xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); + return xrep_roll_trans(sc); +} + +/* + * Repair a quota file's data fork. The function returns with the inode + * joined. 
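+ * In broad strokes: repair the metadata inode forks, convert unwritten
+ * extents to written ones, trim mappings beyond the highest possible
+ * dquot id, and then rewrite any dquot buffers that fail the verifiers.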
+ */ +STATIC int +xrep_quota_data_fork( + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + struct xfs_bmbt_irec irec = { 0 }; + struct xfs_iext_cursor icur; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_ifork *ifp; + xfs_fileoff_t max_dqid_off; + xfs_fileoff_t off; + xfs_fsblock_t fsbno; + bool truncate = false; + bool joined = false; + int error = 0; + + error = xrep_metadata_inode_forks(sc); + if (error) + goto out; + + /* Check for data fork problems that apply only to quota files. */ + max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk; + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + for_each_xfs_iext(ifp, &icur, &irec) { + if (isnullstartblock(irec.br_startblock)) { + error = -EFSCORRUPTED; + goto out; + } + + if (irec.br_startoff > max_dqid_off || + irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) { + truncate = true; + break; + } + + /* Convert unwritten extents to real ones. */ + if (irec.br_state == XFS_EXT_UNWRITTEN) { + struct xfs_bmbt_irec nrec; + int nmap = 1; + + if (!joined) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + } + + error = xfs_bmapi_write(sc->tp, sc->ip, + irec.br_startoff, irec.br_blockcount, + XFS_BMAPI_CONVERT, 0, &nrec, &nmap); + if (error) + goto out; + if (nmap != 1) { + error = -ENOSPC; + goto out; + } + ASSERT(nrec.br_startoff == irec.br_startoff); + ASSERT(nrec.br_blockcount == irec.br_blockcount); + + error = xfs_defer_finish(&sc->tp); + if (error) + goto out; + } + } + + if (!joined) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + } + + if (truncate) { + /* Erase everything after the block containing the max dquot */ + error = xfs_bunmapi_range(&sc->tp, sc->ip, 0, + max_dqid_off * sc->mp->m_sb.sb_blocksize, + XFS_MAX_FILEOFF); + if (error) + goto out; + + /* Remove all CoW reservations. */ + error = xfs_reflink_cancel_cow_blocks(sc->ip, &sc->tp, 0, + XFS_MAX_FILEOFF, true); + if (error) + goto out; + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + + /* + * Always re-log the inode so that our permanent transaction + * can keep on rolling it forward in the log. + */ + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + } + + /* Now go fix anything that fails the verifiers. */ + for_each_xfs_iext(ifp, &icur, &irec) { + for (fsbno = irec.br_startblock, off = irec.br_startoff; + fsbno < irec.br_startblock + irec.br_blockcount; + fsbno += XFS_DQUOT_CLUSTER_SIZE_FSB, + off += XFS_DQUOT_CLUSTER_SIZE_FSB) { + error = xrep_quota_block(sc, + XFS_FSB_TO_DADDR(sc->mp, fsbno), + dqtype, off * qi->qi_dqperchunk); + if (error) + goto out; + } + } + +out: + return error; +} + +/* + * Go fix anything in the quota items that we could have been mad about. Now + * that we've checked the quota inode data fork we have to drop ILOCK_EXCL to + * use the regular dquot functions. + */ +STATIC int +xrep_quota_problems( + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + struct xchk_dqiter cursor = { }; + struct xrep_quota_info rqi = { .sc = sc }; + struct xfs_dquot *dq; + int error; + + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xrep_quota_item(&rqi, dq); + xfs_qm_dqput(dq); + if (error) + break; + } + if (error) + return error; + + /* Make a quotacheck happen. */ + if (rqi.need_quotacheck) + xrep_force_quotacheck(sc, dqtype); + return 0; +} + +/* Repair all of a quota type's items. 
*/ +int +xrep_quota( + struct xfs_scrub *sc) +{ + xfs_dqtype_t dqtype; + int error; + + dqtype = xchk_quota_to_dqtype(sc); + + /* + * Re-take the ILOCK so that we can fix any problems that we found + * with the data fork mappings, or with the dquot bufs themselves. + */ + if (!(sc->ilock_flags & XFS_ILOCK_EXCL)) + xchk_ilock(sc, XFS_ILOCK_EXCL); + error = xrep_quota_data_fork(sc, dqtype); + if (error) + return error; + + /* + * Finish deferred items and roll the transaction to unjoin the quota + * inode from transaction so that we can unlock the quota inode; we + * play only with dquots from now on. + */ + error = xrep_defer_finish(sc); + if (error) + return error; + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + xchk_iunlock(sc, sc->ilock_flags); + + /* Fix anything the dquot verifiers don't complain about. */ + error = xrep_quota_problems(sc, dqtype); + if (error) + return error; + + return xrep_trans_commit(sc); +} diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c index e51c1544be63..16462332c897 100644 --- a/fs/xfs/scrub/readdir.c +++ b/fs/xfs/scrub/readdir.c @@ -36,16 +36,14 @@ xchk_dir_walk_sf( struct xfs_mount *mp = dp->i_mount; struct xfs_da_geometry *geo = mp->m_dir_geo; struct xfs_dir2_sf_entry *sfep; - struct xfs_dir2_sf_hdr *sfp; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; xfs_ino_t ino; xfs_dir2_dataptr_t dapos; unsigned int i; int error; ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - - sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); /* dot entry */ dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 86a62420e02c..f99eca799809 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -20,6 +20,7 @@ #include "xfs_ialloc_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_refcount.h" #include "xfs_refcount_btree.h" #include "xfs_extent_busy.h" #include "xfs_ag.h" @@ -31,11 +32,14 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_attr_remote.h" +#include "xfs_defer.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/fsb_bitmap.h" #include "scrub/reap.h" /* @@ -73,10 +77,10 @@ * with only the same rmap owner but the block is not owned by something with * the same rmap owner, the block will be freed. * - * The caller is responsible for locking the AG headers for the entire rebuild - * operation so that nothing else can sneak in and change the AG state while - * we're not looking. We must also invalidate any buffers associated with - * @bitmap. + * The caller is responsible for locking the AG headers/inode for the entire + * rebuild operation so that nothing else can sneak in and change the incore + * state while we're not looking. We must also invalidate any buffers + * associated with @bitmap. */ /* Information about reaping extents after a repair. 
*/ @@ -247,7 +251,7 @@ xreap_agextent_binval( max_fsbs = min_t(xfs_agblock_t, agbno_next - bno, xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX)); - for (fsbcount = 1; fsbcount < max_fsbs; fsbcount++) { + for (fsbcount = 1; fsbcount <= max_fsbs; fsbcount++) { struct xfs_buf *bp = NULL; xfs_daddr_t daddr; int error; @@ -377,6 +381,17 @@ xreap_agextent_iter( trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp); rs->force_roll = true; + + if (rs->oinfo == &XFS_RMAP_OINFO_COW) { + /* + * If we're unmapping CoW staging extents, remove the + * records from the refcountbt, which will remove the + * rmap record as well. + */ + xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + return 0; + } + return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, *aglenp, rs->oinfo); } @@ -395,6 +410,26 @@ xreap_agextent_iter( return 0; } + /* + * If we're getting rid of CoW staging extents, use deferred work items + * to remove the refcountbt records (which removes the rmap records) + * and free the extent. We're not worried about the system going down + * here because log recovery walks the refcount btree to clean out the + * CoW staging extents. + */ + if (rs->oinfo == &XFS_RMAP_OINFO_COW) { + ASSERT(rs->resv == XFS_AG_RESV_NONE); + + xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL, + rs->resv, true); + if (error) + return error; + + rs->force_roll = true; + return 0; + } + /* Put blocks back on the AGFL one at a time. */ if (rs->resv == XFS_AG_RESV_AGFL) { ASSERT(*aglenp == 1); @@ -409,13 +444,17 @@ xreap_agextent_iter( /* * Use deferred frees to get rid of the old btree blocks to try to * minimize the window in which we could crash and lose the old blocks. + * Add a defer ops barrier every other extent to avoid stressing the + * system with large EFIs. */ - error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, + error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, rs->resv, true); if (error) return error; rs->deferred++; + if (rs->deferred % 2 == 0) + xfs_defer_add_barrier(sc->tp); return 0; } @@ -425,13 +464,12 @@ xreap_agextent_iter( */ STATIC int xreap_agmeta_extent( - uint64_t fsbno, - uint64_t len, + uint32_t agbno, + uint32_t len, void *priv) { struct xreap_state *rs = priv; struct xfs_scrub *sc = rs->sc; - xfs_agblock_t agbno = fsbno; xfs_agblock_t agbno_next = agbno + len; int error = 0; @@ -496,3 +534,115 @@ xrep_reap_agblocks( return 0; } + +/* + * Break a file metadata extent into sub-extents by fate (crosslinked, not + * crosslinked), and dispose of each sub-extent separately. The extent must + * not cross an AG boundary. + */ +STATIC int +xreap_fsmeta_extent( + uint64_t fsbno, + uint64_t len, + void *priv) +{ + struct xreap_state *rs = priv; + struct xfs_scrub *sc = rs->sc; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); + xfs_agblock_t agbno_next = agbno + len; + int error = 0; + + ASSERT(len <= XFS_MAX_BMBT_EXTLEN); + ASSERT(sc->ip != NULL); + ASSERT(!sc->sa.pag); + + /* + * We're reaping blocks after repairing file metadata, which means that + * we have to init the xchk_ag structure ourselves. 
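+ * The AGF buffer and the perag reference are dropped again before we
+ * move on to the next extent in the bitmap.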
+ */ + sc->sa.pag = xfs_perag_get(sc->mp, agno); + if (!sc->sa.pag) + return -EFSCORRUPTED; + + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp); + if (error) + goto out_pag; + + while (agbno < agbno_next) { + xfs_extlen_t aglen; + bool crosslinked; + + error = xreap_agextent_select(rs, agbno, agbno_next, + &crosslinked, &aglen); + if (error) + goto out_agf; + + error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked); + if (error) + goto out_agf; + + if (xreap_want_defer_finish(rs)) { + /* + * Holds the AGF buffer across the deferred chain + * processing. + */ + error = xrep_defer_finish(sc); + if (error) + goto out_agf; + xreap_defer_finish_reset(rs); + } else if (xreap_want_roll(rs)) { + /* + * Hold the AGF buffer across the transaction roll so + * that we don't have to reattach it to the scrub + * context. + */ + xfs_trans_bhold(sc->tp, sc->sa.agf_bp); + error = xfs_trans_roll_inode(&sc->tp, sc->ip); + xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); + if (error) + goto out_agf; + xreap_reset(rs); + } + + agbno += aglen; + } + +out_agf: + xfs_trans_brelse(sc->tp, sc->sa.agf_bp); + sc->sa.agf_bp = NULL; +out_pag: + xfs_perag_put(sc->sa.pag); + sc->sa.pag = NULL; + return error; +} + +/* + * Dispose of every block of every fs metadata extent in the bitmap. + * Do not use this to dispose of the mappings in an ondisk inode fork. + */ +int +xrep_reap_fsblocks( + struct xfs_scrub *sc, + struct xfsb_bitmap *bitmap, + const struct xfs_owner_info *oinfo) +{ + struct xreap_state rs = { + .sc = sc, + .oinfo = oinfo, + .resv = XFS_AG_RESV_NONE, + }; + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(sc->ip != NULL); + + error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); + if (error) + return error; + + if (xreap_dirty(&rs)) + return xrep_defer_finish(sc); + + return 0; +} diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h index fe24626af164..0b69f16dd98f 100644 --- a/fs/xfs/scrub/reap.h +++ b/fs/xfs/scrub/reap.h @@ -6,7 +6,12 @@ #ifndef __XFS_SCRUB_REAP_H__ #define __XFS_SCRUB_REAP_H__ +struct xagb_bitmap; +struct xfsb_bitmap; + int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); +int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap, + const struct xfs_owner_info *oinfo); #endif /* __XFS_SCRUB_REAP_H__ */ diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 304ea1e1bfb0..bf22f245bbfa 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -441,7 +441,7 @@ xchk_refcountbt_rec( struct xchk_refcbt_records *rrc = bs->private; xfs_refcount_btrec_to_irec(rec, &irec); - if (xfs_refcount_check_irec(bs->cur, &irec) != NULL) { + if (xfs_refcount_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c new file mode 100644 index 000000000000..f38fccc42a20 --- /dev/null +++ b/fs/xfs/scrub/refcount_repair.c @@ -0,0 +1,794 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_error.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Rebuilding the Reference Count Btree + * ==================================== + * + * This algorithm is "borrowed" from xfs_repair. Imagine the rmap + * entries as rectangles representing extents of physical blocks, and + * that the rectangles can be laid down to allow them to overlap each + * other; then we know that we must emit a refcnt btree entry wherever + * the amount of overlap changes, i.e. the emission stimulus is + * level-triggered: + * + * - --- + * -- ----- ---- --- ------ + * -- ---- ----------- ---- --------- + * -------------------------------- ----------- + * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^ + * 2 1 23 21 3 43 234 2123 1 01 2 3 0 + * + * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner). + * + * Note that in the actual refcnt btree we don't store the refcount < 2 + * cases because the bnobt tells us which blocks are free; single-use + * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt + * supports storing multiple entries covering a given block we could + * theoretically dispense with the refcntbt and simply count rmaps, but + * that's inefficient in the (hot) write path, so we'll take the cost of + * the extra tree to save time. Also there's no guarantee that rmap + * will be enabled. + * + * Given an array of rmaps sorted by physical block number, a starting + * physical block (sp), a bag to hold rmaps that cover sp, and the next + * physical block where the level changes (np), we can reconstruct the + * refcount btree as follows: + * + * While there are still unprocessed rmaps in the array, + * - Set sp to the physical block (pblk) of the next unprocessed rmap. + * - Add to the bag all rmaps in the array where startblock == sp. + * - Set np to the physical block where the bag size will change. This + * is the minimum of (the pblk of the next unprocessed rmap) and + * (startblock + len of each rmap in the bag). + * - Record the bag size as old_bag_size. + * + * - While the bag isn't empty, + * - Remove from the bag all rmaps where startblock + len == np. + * - Add to the bag all rmaps in the array where startblock == np. + * - If the bag size isn't old_bag_size, store the refcount entry + * (sp, np - sp, bag_size) in the refcnt btree. + * - If the bag is empty, break out of the inner loop. + * - Set old_bag_size to the bag size + * - Set sp = np. + * - Set np to the physical block where the bag size will change. + * This is the minimum of (the pblk of the next unprocessed rmap) + * and (startblock + len of each rmap in the bag). 
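+ *
+ * As an illustrative example (not taken from a real filesystem), consider
+ * three rmaps covering agblocks [10,15), [12,18), and [15,18).  The bag
+ * size rises to 2 at block 12 and stays at 2 until block 18, so the only
+ * refcount record emitted is (startblock 12, blockcount 6, refcount 2);
+ * blocks 10-11 are only mapped once and are not recorded.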
+ * + * Like all the other repairers, we make a list of all the refcount + * records we need, then reinitialize the refcount btree root and + * insert all the records. + */ + +/* The only parts of the rmap that we care about for computing refcounts. */ +struct xrep_refc_rmap { + xfs_agblock_t startblock; + xfs_extlen_t blockcount; +} __packed; + +struct xrep_refc { + /* refcount extents */ + struct xfarray *refcount_records; + + /* new refcountbt information */ + struct xrep_newbt new_btree; + + /* old refcountbt blocks */ + struct xagb_bitmap old_refcountbt_blocks; + + struct xfs_scrub *sc; + + /* get_records()'s position in the refcount record array. */ + xfarray_idx_t array_cur; + + /* # of refcountbt blocks */ + xfs_extlen_t btblocks; +}; + +/* Check for any obvious conflicts with this shared/CoW staging extent. */ +STATIC int +xrep_refc_check_ext( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *rec) +{ + enum xbtree_recpacking outcome; + int error; + + if (xfs_refcount_check_irec(sc->sa.pag, rec) != NULL) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rc_startblock, + rec->rc_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->rc_startblock, rec->rc_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Record a reference count extent. */ +STATIC int +xrep_refc_stash( + struct xrep_refc *rr, + enum xfs_refc_domain domain, + xfs_agblock_t agbno, + xfs_extlen_t len, + uint64_t refcount) +{ + struct xfs_refcount_irec irec = { + .rc_startblock = agbno, + .rc_blockcount = len, + .rc_domain = domain, + }; + struct xfs_scrub *sc = rr->sc; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount); + + error = xrep_refc_check_ext(rr->sc, &irec); + if (error) + return error; + + trace_xrep_refc_found(sc->sa.pag, &irec); + + return xfarray_append(rr->refcount_records, &irec); +} + +/* Record a CoW staging extent. */ +STATIC int +xrep_refc_stash_cow( + struct xrep_refc *rr, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + return xrep_refc_stash(rr, XFS_REFC_DOMAIN_COW, agbno, len, 1); +} + +/* Decide if an rmap could describe a shared extent. */ +static inline bool +xrep_refc_rmap_shareable( + struct xfs_mount *mp, + const struct xfs_rmap_irec *rmap) +{ + /* AG metadata are never sharable */ + if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + return false; + + /* Metadata in files are never shareable */ + if (xfs_internal_inum(mp, rmap->rm_owner)) + return false; + + /* Metadata and unwritten file blocks are not shareable. */ + if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK | + XFS_RMAP_UNWRITTEN)) + return false; + + return true; +} + +/* + * Walk along the reverse mapping records until we find one that could describe + * a shared extent. + */ +STATIC int +xrep_refc_walk_rmaps( + struct xrep_refc *rr, + struct xrep_refc_rmap *rrm, + bool *have_rec) +{ + struct xfs_rmap_irec rmap; + struct xfs_btree_cur *cur = rr->sc->sa.rmap_cur; + struct xfs_mount *mp = cur->bc_mp; + int have_gt; + int error = 0; + + *have_rec = false; + + /* + * Loop through the remaining rmaps. Remember CoW staging + * extents and the refcountbt blocks from the old tree for later + * disposal. 
We can only share written data fork extents, so + * keep looping until we find an rmap for one. + */ + do { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfs_btree_increment(cur, 0, &have_gt); + if (error) + return error; + if (!have_gt) + return 0; + + error = xfs_rmap_get_rec(cur, &rmap, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(mp, !have_gt)) + return -EFSCORRUPTED; + + if (rmap.rm_owner == XFS_RMAP_OWN_COW) { + error = xrep_refc_stash_cow(rr, rmap.rm_startblock, + rmap.rm_blockcount); + if (error) + return error; + } else if (rmap.rm_owner == XFS_RMAP_OWN_REFC) { + /* refcountbt block, dump it when we're done. */ + rr->btblocks += rmap.rm_blockcount; + error = xagb_bitmap_set(&rr->old_refcountbt_blocks, + rmap.rm_startblock, rmap.rm_blockcount); + if (error) + return error; + } + } while (!xrep_refc_rmap_shareable(mp, &rmap)); + + rrm->startblock = rmap.rm_startblock; + rrm->blockcount = rmap.rm_blockcount; + *have_rec = true; + return 0; +} + +static inline uint32_t +xrep_refc_encode_startblock( + const struct xfs_refcount_irec *irec) +{ + uint32_t start; + + start = irec->rc_startblock & ~XFS_REFC_COWFLAG; + if (irec->rc_domain == XFS_REFC_DOMAIN_COW) + start |= XFS_REFC_COWFLAG; + + return start; +} + +/* Sort in the same order as the ondisk records. */ +static int +xrep_refc_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_refcount_irec *ap = a; + const struct xfs_refcount_irec *bp = b; + uint32_t sa, sb; + + sa = xrep_refc_encode_startblock(ap); + sb = xrep_refc_encode_startblock(bp); + + if (sa > sb) + return 1; + if (sa < sb) + return -1; + return 0; +} + +/* + * Sort the refcount extents by startblock or else the btree records will be in + * the wrong order. Make sure the records do not overlap in physical space. + */ +STATIC int +xrep_refc_sort_records( + struct xrep_refc *rr) +{ + struct xfs_refcount_irec irec; + xfarray_idx_t cur; + enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED; + xfs_agblock_t next_agbno = 0; + int error; + + error = xfarray_sort(rr->refcount_records, xrep_refc_extent_cmp, + XFARRAY_SORT_KILLABLE); + if (error) + return error; + + foreach_xfarray_idx(rr->refcount_records, cur) { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfarray_load(rr->refcount_records, cur, &irec); + if (error) + return error; + + if (dom == XFS_REFC_DOMAIN_SHARED && + irec.rc_domain == XFS_REFC_DOMAIN_COW) { + dom = irec.rc_domain; + next_agbno = 0; + } + + if (dom != irec.rc_domain) + return -EFSCORRUPTED; + if (irec.rc_startblock < next_agbno) + return -EFSCORRUPTED; + + next_agbno = irec.rc_startblock + irec.rc_blockcount; + } + + return error; +} + +#define RRM_NEXT(r) ((r).startblock + (r).blockcount) +/* + * Find the next block where the refcount changes, given the next rmap we + * looked at and the ones we're already tracking. 
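+ * Concretely, the next edge is the smaller of the startblock of the next
+ * unprocessed rmap (if there is one) and the end of every rmap still in
+ * the bag; if neither yields a candidate, the rmap data are inconsistent
+ * and we bail out.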
+ */ +static inline int +xrep_refc_next_edge( + struct xfarray *rmap_bag, + struct xrep_refc_rmap *next_rrm, + bool next_valid, + xfs_agblock_t *nbnop) +{ + struct xrep_refc_rmap rrm; + xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT; + xfs_agblock_t nbno = NULLAGBLOCK; + int error; + + if (next_valid) + nbno = next_rrm->startblock; + + while ((error = xfarray_iter(rmap_bag, &array_cur, &rrm)) == 1) + nbno = min_t(xfs_agblock_t, nbno, RRM_NEXT(rrm)); + + if (error) + return error; + + /* + * We should have found /something/ because either next_rrm is the next + * interesting rmap to look at after emitting this refcount extent, or + * there are other rmaps in rmap_bag contributing to the current + * sharing count. But if something is seriously wrong, bail out. + */ + if (nbno == NULLAGBLOCK) + return -EFSCORRUPTED; + + *nbnop = nbno; + return 0; +} + +/* + * Walk forward through the rmap btree to collect all rmaps starting at + * @bno in @rmap_bag. These represent the file(s) that share ownership of + * the current block. Upon return, the rmap cursor points to the last record + * satisfying the startblock constraint. + */ +static int +xrep_refc_push_rmaps_at( + struct xrep_refc *rr, + struct xfarray *rmap_bag, + xfs_agblock_t bno, + struct xrep_refc_rmap *rrm, + bool *have, + uint64_t *stack_sz) +{ + struct xfs_scrub *sc = rr->sc; + int have_gt; + int error; + + while (*have && rrm->startblock == bno) { + error = xfarray_store_anywhere(rmap_bag, rrm); + if (error) + return error; + (*stack_sz)++; + error = xrep_refc_walk_rmaps(rr, rrm, have); + if (error) + return error; + } + + error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(sc->mp, !have_gt)) + return -EFSCORRUPTED; + + return 0; +} + +/* Iterate all the rmap records to generate reference count data. */ +STATIC int +xrep_refc_find_refcounts( + struct xrep_refc *rr) +{ + struct xrep_refc_rmap rrm; + struct xfs_scrub *sc = rr->sc; + struct xfarray *rmap_bag; + char *descr; + uint64_t old_stack_sz; + uint64_t stack_sz = 0; + xfs_agblock_t sbno; + xfs_agblock_t cbno; + xfs_agblock_t nbno; + bool have; + int error; + + xrep_ag_btcur_init(sc, &sc->sa); + + /* + * Set up a sparse array to store all the rmap records that we're + * tracking to generate a reference count record. If this exceeds + * MAXREFCOUNT, we clamp rc_refcount. + */ + descr = xchk_xfile_ag_descr(sc, "rmap record bag"); + error = xfarray_create(descr, 0, sizeof(struct xrep_refc_rmap), + &rmap_bag); + kfree(descr); + if (error) + goto out_cur; + + /* Start the rmapbt cursor to the left of all records. */ + error = xfs_btree_goto_left_edge(sc->sa.rmap_cur); + if (error) + goto out_bag; + + /* Process reverse mappings into refcount data. */ + while (xfs_btree_has_more_records(sc->sa.rmap_cur)) { + /* Push all rmaps with pblk == sbno onto the stack */ + error = xrep_refc_walk_rmaps(rr, &rrm, &have); + if (error) + goto out_bag; + if (!have) + break; + sbno = cbno = rrm.startblock; + error = xrep_refc_push_rmaps_at(rr, rmap_bag, sbno, + &rrm, &have, &stack_sz); + if (error) + goto out_bag; + + /* Set nbno to the bno of the next refcount change */ + error = xrep_refc_next_edge(rmap_bag, &rrm, have, &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + old_stack_sz = stack_sz; + + /* While stack isn't empty... 
*/ + while (stack_sz) { + xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT; + + /* Pop all rmaps that end at nbno */ + while ((error = xfarray_iter(rmap_bag, &array_cur, + &rrm)) == 1) { + if (RRM_NEXT(rrm) != nbno) + continue; + error = xfarray_unset(rmap_bag, array_cur - 1); + if (error) + goto out_bag; + stack_sz--; + } + if (error) + goto out_bag; + + /* Push array items that start at nbno */ + error = xrep_refc_walk_rmaps(rr, &rrm, &have); + if (error) + goto out_bag; + if (have) { + error = xrep_refc_push_rmaps_at(rr, rmap_bag, + nbno, &rrm, &have, &stack_sz); + if (error) + goto out_bag; + } + + /* Emit refcount if necessary */ + ASSERT(nbno > cbno); + if (stack_sz != old_stack_sz) { + if (old_stack_sz > 1) { + error = xrep_refc_stash(rr, + XFS_REFC_DOMAIN_SHARED, + cbno, nbno - cbno, + old_stack_sz); + if (error) + goto out_bag; + } + cbno = nbno; + } + + /* Stack empty, go find the next rmap */ + if (stack_sz == 0) + break; + old_stack_sz = stack_sz; + sbno = nbno; + + /* Set nbno to the bno of the next refcount change */ + error = xrep_refc_next_edge(rmap_bag, &rrm, have, + &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + } + } + + ASSERT(stack_sz == 0); +out_bag: + xfarray_destroy(rmap_bag); +out_cur: + xchk_ag_btcur_free(&sc->sa); + return error; +} +#undef RRM_NEXT + +/* Retrieve refcountbt data for bulk load. */ +STATIC int +xrep_refc_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + struct xrep_refc *rr = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load(rr->refcount_records, rr->array_cur++, + irec); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_refc_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_refc *rr = priv; + + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); +} + +/* Update the AGF counters. */ +STATIC int +xrep_refc_reset_counters( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + + /* + * After we commit the new btree to disk, it is possible that the + * process to reap the old btree blocks will race with the AIL trying + * to checkpoint the old btree blocks into the filesystem. If the new + * tree is shorter than the old one, the refcountbt write verifier will + * fail and the AIL will shut down the filesystem. + * + * To avoid this, save the old incore btree height values as the alt + * height values before re-initializing the perag info from the updated + * AGF to capture all the new values. + */ + pag->pagf_repair_refcount_level = pag->pagf_refcount_level; + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagf(sc); +} + +/* + * Use the collected refcount information to stage a new refcount btree. If + * this is successful we'll return with the new btree root information logged + * to the repair transaction but not yet committed. 
+ */ +STATIC int +xrep_refc_build_new_tree( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_btree_cur *refc_cur; + struct xfs_perag *pag = sc->sa.pag; + xfs_fsblock_t fsbno; + int error; + + error = xrep_refc_sort_records(rr); + if (error) + return error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the AG header. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, xfs_refc_block(sc->mp)); + xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, fsbno, + XFS_AG_RESV_METADATA); + rr->new_btree.bload.get_records = xrep_refc_get_records; + rr->new_btree.bload.claim_block = xrep_refc_claim_block; + + /* Compute how many blocks we'll need. */ + refc_cur = xfs_refcountbt_stage_cursor(sc->mp, &rr->new_btree.afake, + pag); + error = xfs_btree_bload_compute_geometry(refc_cur, + &rr->new_btree.bload, + xfarray_length(rr->refcount_records)); + if (error) + goto err_cur; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rr->new_btree, + rr->new_btree.bload.nr_blocks); + if (error) + goto err_cur; + + /* + * Due to btree slack factors, it's possible for a new btree to be one + * level taller than the old btree. Update the incore btree height so + * that we don't trip the verifiers when writing the new btree blocks + * to disk. + */ + pag->pagf_repair_refcount_level = rr->new_btree.bload.btree_height; + + /* Add all observed refcount records. */ + rr->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr); + if (error) + goto err_level; + + /* + * Install the new btree in the AG header. After this point the old + * btree is no longer accessible and the new tree is live. + */ + xfs_refcountbt_commit_staged_btree(refc_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(refc_cur, 0); + + /* Reset the AGF counters now that we've changed the btree shape. */ + error = xrep_refc_reset_counters(rr); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rr->new_btree); + if (error) + return error; + + return xrep_roll_ag_trans(sc); + +err_level: + pag->pagf_repair_refcount_level = 0; +err_cur: + xfs_btree_del_cursor(refc_cur, error); +err_newbt: + xrep_newbt_cancel(&rr->new_btree); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_refc_remove_old_tree( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + int error; + + /* Free the old refcountbt blocks if they're not in use. */ + error = xrep_reap_agblocks(sc, &rr->old_refcountbt_blocks, + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); + if (error) + return error; + + /* + * Now that we've zapped all the old refcountbt blocks we can turn off + * the alternate height mechanism and reset the per-AG space + * reservations. + */ + pag->pagf_repair_refcount_level = 0; + sc->flags |= XREP_RESET_PERAG_RESV; + return 0; +} + +/* Rebuild the refcount btree. 
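+ * This requires the rmapbt: walk every reverse mapping to reconstruct the
+ * reference counts, stage a new btree with the bulk loader, and then reap
+ * the old refcountbt blocks.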
*/ +int +xrep_refcountbt( + struct xfs_scrub *sc) +{ + struct xrep_refc *rr; + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + rr = kzalloc(sizeof(struct xrep_refc), XCHK_GFP_FLAGS); + if (!rr) + return -ENOMEM; + rr->sc = sc; + + /* Set up enough storage to handle one refcount record per block. */ + descr = xchk_xfile_ag_descr(sc, "reference count records"); + error = xfarray_create(descr, mp->m_sb.sb_agblocks, + sizeof(struct xfs_refcount_irec), + &rr->refcount_records); + kfree(descr); + if (error) + goto out_rr; + + /* Collect all reference counts. */ + xagb_bitmap_init(&rr->old_refcountbt_blocks); + error = xrep_refc_find_refcounts(rr); + if (error) + goto out_bitmap; + + /* Rebuild the refcount information. */ + error = xrep_refc_build_new_tree(rr); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_refc_remove_old_tree(rr); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&rr->old_refcountbt_blocks); + xfarray_destroy(rr->refcount_records); +out_rr: + kfree(rr); + return error; +} diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 1b8b5439f2d7..745d5b8f405a 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -27,6 +27,9 @@ #include "xfs_quota.h" #include "xfs_qm.h" #include "xfs_defer.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_reflink.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -176,6 +179,16 @@ xrep_roll_ag_trans( return 0; } +/* Roll the scrub transaction, holding the primary metadata locked. */ +int +xrep_roll_trans( + struct xfs_scrub *sc) +{ + if (!sc->ip) + return xrep_roll_ag_trans(sc); + return xfs_trans_roll_inode(&sc->tp, sc->ip); +} + /* Finish all deferred work attached to the repair transaction. */ int xrep_defer_finish( @@ -673,6 +686,7 @@ xrep_find_ag_btree_roots( return error; } +#ifdef CONFIG_XFS_QUOTA /* Force a quotacheck the next time we mount. */ void xrep_force_quotacheck( @@ -699,10 +713,10 @@ xrep_force_quotacheck( * * This function ensures that the appropriate dquots are attached to an inode. * We cannot allow the dquot code to allocate an on-disk dquot block here - * because we're already in transaction context with the inode locked. The - * on-disk dquot should already exist anyway. If the quota code signals - * corruption or missing quota information, schedule quotacheck, which will - * repair corruptions in the quota metadata. + * because we're already in transaction context. The on-disk dquot should + * already exist anyway. If the quota code signals corruption or missing quota + * information, schedule quotacheck, which will repair corruptions in the quota + * metadata. */ int xrep_ino_dqattach( @@ -710,7 +724,10 @@ xrep_ino_dqattach( { int error; - error = xfs_qm_dqattach_locked(sc->ip, false); + ASSERT(sc->tp != NULL); + ASSERT(sc->ip != NULL); + + error = xfs_qm_dqattach(sc->ip); switch (error) { case -EFSBADCRC: case -EFSCORRUPTED: @@ -734,3 +751,367 @@ xrep_ino_dqattach( return error; } +#endif /* CONFIG_XFS_QUOTA */ + +/* + * Ensure that the inode being repaired is ready to handle a certain number of + * extents, or return EFSCORRUPTED. Caller must hold the ILOCK of the inode + * being repaired and have joined it to the scrub transaction. 
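+ * If the fork needs more extents than the small on-disk counter format
+ * allows but the filesystem supports large extent counts, upgrade the
+ * inode to NREXT64 instead of failing.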
+ */ +int +xrep_ino_ensure_extent_count( + struct xfs_scrub *sc, + int whichfork, + xfs_extnum_t nextents) +{ + xfs_extnum_t max_extents; + bool inode_has_nrext64; + + inode_has_nrext64 = xfs_inode_has_large_extent_counts(sc->ip); + max_extents = xfs_iext_max_nextents(inode_has_nrext64, whichfork); + if (nextents <= max_extents) + return 0; + if (inode_has_nrext64) + return -EFSCORRUPTED; + if (!xfs_has_large_extent_counts(sc->mp)) + return -EFSCORRUPTED; + + max_extents = xfs_iext_max_nextents(true, whichfork); + if (nextents > max_extents) + return -EFSCORRUPTED; + + sc->ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return 0; +} + +/* + * Initialize all the btree cursors for an AG repair except for the btree that + * we're rebuilding. + */ +void +xrep_ag_btcur_init( + struct xfs_scrub *sc, + struct xchk_ag *sa) +{ + struct xfs_mount *mp = sc->mp; + + /* Set up a bnobt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT && + sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) { + sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag, XFS_BTNUM_BNO); + sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag, XFS_BTNUM_CNT); + } + + /* Set up a inobt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT && + sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) { + sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, + sa->agi_bp, XFS_BTNUM_INO); + if (xfs_has_finobt(mp)) + sa->fino_cur = xfs_inobt_init_cursor(sc->sa.pag, + sc->tp, sa->agi_bp, XFS_BTNUM_FINO); + } + + /* Set up a rmapbt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_RMAPBT && + xfs_has_rmapbt(mp)) + sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag); + + /* Set up a refcountbt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_REFCNTBT && + xfs_has_reflink(mp)) + sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, + sa->agf_bp, sc->sa.pag); +} + +/* + * Reinitialize the in-core AG state after a repair by rereading the AGF + * buffer. We had better get the same AGF buffer as the one that's attached + * to the scrub context. + */ +int +xrep_reinit_pagf( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag = sc->sa.pag; + struct xfs_buf *bp; + int error; + + ASSERT(pag); + ASSERT(xfs_perag_initialised_agf(pag)); + + clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); + error = xfs_alloc_read_agf(pag, sc->tp, 0, &bp); + if (error) + return error; + + if (bp != sc->sa.agf_bp) { + ASSERT(bp == sc->sa.agf_bp); + return -EFSCORRUPTED; + } + + return 0; +} + +/* + * Reinitialize the in-core AG state after a repair by rereading the AGI + * buffer. We had better get the same AGI buffer as the one that's attached + * to the scrub context. + */ +int +xrep_reinit_pagi( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag = sc->sa.pag; + struct xfs_buf *bp; + int error; + + ASSERT(pag); + ASSERT(xfs_perag_initialised_agi(pag)); + + clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); + error = xfs_ialloc_read_agi(pag, sc->tp, &bp); + if (error) + return error; + + if (bp != sc->sa.agi_bp) { + ASSERT(bp == sc->sa.agi_bp); + return -EFSCORRUPTED; + } + + return 0; +} + +/* + * Given an active reference to a perag structure, load AG headers and cursors. + * This should only be called to scan an AG while repairing file-based metadata. 
+ */ +int +xrep_ag_init( + struct xfs_scrub *sc, + struct xfs_perag *pag, + struct xchk_ag *sa) +{ + int error; + + ASSERT(!sa->pag); + + error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp); + if (error) + return error; + + error = xfs_alloc_read_agf(pag, sc->tp, 0, &sa->agf_bp); + if (error) + return error; + + /* Grab our own passive reference from the caller's ref. */ + sa->pag = xfs_perag_hold(pag); + xrep_ag_btcur_init(sc, sa); + return 0; +} + +/* Reinitialize the per-AG block reservation for the AG we just fixed. */ +int +xrep_reset_perag_resv( + struct xfs_scrub *sc) +{ + int error; + + if (!(sc->flags & XREP_RESET_PERAG_RESV)) + return 0; + + ASSERT(sc->sa.pag != NULL); + ASSERT(sc->ops->type == ST_PERAG); + ASSERT(sc->tp); + + sc->flags &= ~XREP_RESET_PERAG_RESV; + error = xfs_ag_resv_free(sc->sa.pag); + if (error) + goto out; + error = xfs_ag_resv_init(sc->sa.pag, sc->tp); + if (error == -ENOSPC) { + xfs_err(sc->mp, +"Insufficient free space to reset per-AG reservation for AG %u after repair.", + sc->sa.pag->pag_agno); + error = 0; + } + +out: + return error; +} + +/* Decide if we are going to call the repair function for a scrub type. */ +bool +xrep_will_attempt( + struct xfs_scrub *sc) +{ + /* Userspace asked us to rebuild the structure regardless. */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) + return true; + + /* Let debug users force us into the repair routines. */ + if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + return true; + + /* Metadata is corrupt or failed cross-referencing. */ + if (xchk_needs_repair(sc->sm)) + return true; + + return false; +} + +/* Try to fix some part of a metadata inode by calling another scrubber. */ +STATIC int +xrep_metadata_inode_subtype( + struct xfs_scrub *sc, + unsigned int scrub_type) +{ + __u32 smtype = sc->sm->sm_type; + __u32 smflags = sc->sm->sm_flags; + unsigned int sick_mask = sc->sick_mask; + int error; + + /* + * Let's see if the inode needs repair. We're going to open-code calls + * to the scrub and repair functions so that we can hang on to the + * resources that we already acquired instead of using the standard + * setup/teardown routines. + */ + sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + sc->sm->sm_type = scrub_type; + + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xchk_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xchk_bmap_data(sc); + break; + case XFS_SCRUB_TYPE_BMBTA: + error = xchk_bmap_attr(sc); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + } + if (error) + goto out; + + if (!xrep_will_attempt(sc)) + goto out; + + /* + * Repair some part of the inode. This will potentially join the inode + * to the transaction. + */ + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xrep_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xrep_bmap(sc, XFS_DATA_FORK, false); + break; + case XFS_SCRUB_TYPE_BMBTA: + error = xrep_bmap(sc, XFS_ATTR_FORK, false); + break; + } + if (error) + goto out; + + /* + * Finish all deferred intent items and then roll the transaction so + * that the inode will not be joined to the transaction when we exit + * the function. + */ + error = xfs_defer_finish(&sc->tp); + if (error) + goto out; + error = xfs_trans_roll(&sc->tp); + if (error) + goto out; + + /* + * Clear the corruption flags and re-check the metadata that we just + * repaired. 
+ */ + sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xchk_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xchk_bmap_data(sc); + break; + case XFS_SCRUB_TYPE_BMBTA: + error = xchk_bmap_attr(sc); + break; + } + if (error) + goto out; + + /* If corruption persists, the repair has failed. */ + if (xchk_needs_repair(sc->sm)) { + error = -EFSCORRUPTED; + goto out; + } +out: + sc->sick_mask = sick_mask; + sc->sm->sm_type = smtype; + sc->sm->sm_flags = smflags; + return error; +} + +/* + * Repair the ondisk forks of a metadata inode. The caller must ensure that + * sc->ip points to the metadata inode and the ILOCK is held on that inode. + * The inode must not be joined to the transaction before the call, and will + * not be afterwards. + */ +int +xrep_metadata_inode_forks( + struct xfs_scrub *sc) +{ + bool dirty = false; + int error; + + /* Repair the inode record and the data fork. */ + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE); + if (error) + return error; + + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD); + if (error) + return error; + + /* Make sure the attr fork looks ok before we delete it. */ + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA); + if (error) + return error; + + /* Clear the reflink flag since metadata never shares. */ + if (xfs_is_reflink_inode(sc->ip)) { + dirty = true; + xfs_trans_ijoin(sc->tp, sc->ip, 0); + error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp); + if (error) + return error; + } + + /* + * If we modified the inode, roll the transaction but don't rejoin the + * inode to the new transaction because xrep_bmap_data can do that. + */ + if (dirty) { + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + dirty = false; + } + + return 0; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 60d2a9ae5f2e..17114327e6fa 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -28,15 +28,28 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) /* Repair helpers */ int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run); +bool xrep_will_attempt(struct xfs_scrub *sc); void xrep_failure(struct xfs_mount *mp); int xrep_roll_ag_trans(struct xfs_scrub *sc); +int xrep_roll_trans(struct xfs_scrub *sc); int xrep_defer_finish(struct xfs_scrub *sc); bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, enum xfs_ag_resv_type type); xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc); +static inline int +xrep_trans_commit( + struct xfs_scrub *sc) +{ + int error = xfs_trans_commit(sc->tp); + + sc->tp = NULL; + return error; +} + struct xbitmap; struct xagb_bitmap; +struct xfsb_bitmap; int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); @@ -57,8 +70,35 @@ struct xrep_find_ag_btree { int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp, struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp); + +#ifdef CONFIG_XFS_QUOTA void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type); int xrep_ino_dqattach(struct xfs_scrub *sc); +#else +# define xrep_force_quotacheck(sc, type) ((void)0) +# define xrep_ino_dqattach(sc) (0) +#endif /* CONFIG_XFS_QUOTA */ + +int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork, + xfs_extnum_t nextents); +int xrep_reset_perag_resv(struct xfs_scrub *sc); +int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten); +int xrep_metadata_inode_forks(struct xfs_scrub *sc); + +/* Repair setup functions */ 
+int xrep_setup_ag_allocbt(struct xfs_scrub *sc); + +struct xfs_imap; +int xrep_setup_inode(struct xfs_scrub *sc, const struct xfs_imap *imap); + +void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); +int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag, + struct xchk_ag *sa); + +/* Metadata revalidators */ + +int xrep_revalidate_allocbt(struct xfs_scrub *sc); +int xrep_revalidate_iallocbt(struct xfs_scrub *sc); /* Metadata repairers */ @@ -67,9 +107,34 @@ int xrep_superblock(struct xfs_scrub *sc); int xrep_agf(struct xfs_scrub *sc); int xrep_agfl(struct xfs_scrub *sc); int xrep_agi(struct xfs_scrub *sc); +int xrep_allocbt(struct xfs_scrub *sc); +int xrep_iallocbt(struct xfs_scrub *sc); +int xrep_refcountbt(struct xfs_scrub *sc); +int xrep_inode(struct xfs_scrub *sc); +int xrep_bmap_data(struct xfs_scrub *sc); +int xrep_bmap_attr(struct xfs_scrub *sc); +int xrep_bmap_cow(struct xfs_scrub *sc); + +#ifdef CONFIG_XFS_RT +int xrep_rtbitmap(struct xfs_scrub *sc); +#else +# define xrep_rtbitmap xrep_notsupported +#endif /* CONFIG_XFS_RT */ + +#ifdef CONFIG_XFS_QUOTA +int xrep_quota(struct xfs_scrub *sc); +#else +# define xrep_quota xrep_notsupported +#endif /* CONFIG_XFS_QUOTA */ + +int xrep_reinit_pagf(struct xfs_scrub *sc); +int xrep_reinit_pagi(struct xfs_scrub *sc); #else +#define xrep_ino_dqattach(sc) (0) +#define xrep_will_attempt(sc) (false) + static inline int xrep_attempt( struct xfs_scrub *sc, @@ -87,11 +152,45 @@ xrep_calc_ag_resblks( return 0; } +static inline int +xrep_reset_perag_resv( + struct xfs_scrub *sc) +{ + if (!(sc->flags & XREP_RESET_PERAG_RESV)) + return 0; + + ASSERT(0); + return -EOPNOTSUPP; +} + +/* repair setup functions for no-repair */ +static inline int +xrep_setup_nothing( + struct xfs_scrub *sc) +{ + return 0; +} +#define xrep_setup_ag_allocbt xrep_setup_nothing + +#define xrep_setup_inode(sc, imap) ((void)0) + +#define xrep_revalidate_allocbt (NULL) +#define xrep_revalidate_iallocbt (NULL) + #define xrep_probe xrep_notsupported #define xrep_superblock xrep_notsupported #define xrep_agf xrep_notsupported #define xrep_agfl xrep_notsupported #define xrep_agi xrep_notsupported +#define xrep_allocbt xrep_notsupported +#define xrep_iallocbt xrep_notsupported +#define xrep_refcountbt xrep_notsupported +#define xrep_inode xrep_notsupported +#define xrep_bmap_data xrep_notsupported +#define xrep_bmap_attr xrep_notsupported +#define xrep_bmap_cow xrep_notsupported +#define xrep_rtbitmap xrep_notsupported +#define xrep_quota xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index d29a26ecddd6..c99d1714f283 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -24,6 +24,7 @@ #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" /* * Set us up to scrub reverse mapping btrees. diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 41a1d89ae8e6..441ca9977652 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -14,17 +14,33 @@ #include "xfs_rtbitmap.h" #include "xfs_inode.h" #include "xfs_bmap.h" +#include "xfs_bit.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/rtbitmap.h" /* Set us up with the realtime metadata locked. 
*/ int xchk_setup_rtbitmap( struct xfs_scrub *sc) { + struct xfs_mount *mp = sc->mp; + struct xchk_rtbitmap *rtb; int error; - error = xchk_trans_alloc(sc, 0); + rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS); + if (!rtb) + return -ENOMEM; + sc->buf = rtb; + + if (xchk_could_repair(sc)) { + error = xrep_setup_rtbitmap(sc, rtb); + if (error) + return error; + } + + error = xchk_trans_alloc(sc, rtb->resblks); if (error) return error; @@ -32,7 +48,22 @@ xchk_setup_rtbitmap( if (error) return error; + error = xchk_ino_dqattach(sc); + if (error) + return error; + xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); + + /* + * Now that we've locked the rtbitmap, we can't race with growfsrt + * trying to expand the bitmap or change the size of the rt volume. + * Hence it is safe to compute and check the geometry values. + */ + if (mp->m_sb.sb_rblocks) { + rtb->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); + rtb->rextslog = xfs_compute_rextslog(rtb->rextents); + rtb->rbmblocks = xfs_rtbitmap_blockcount(mp, rtb->rextents); + } return 0; } @@ -63,21 +94,30 @@ STATIC int xchk_rtbitmap_check_extents( struct xfs_scrub *sc) { - struct xfs_mount *mp = sc->mp; struct xfs_bmbt_irec map; - xfs_rtblock_t off; - int nmap; + struct xfs_iext_cursor icur; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + xfs_fileoff_t off = 0; + xfs_fileoff_t endoff; int error = 0; - for (off = 0; off < mp->m_sb.sb_rbmblocks;) { + /* Mappings may not cross or lie beyond EOF. */ + endoff = XFS_B_TO_FSB(mp, ip->i_disk_size); + if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff); + return 0; + } + + while (off < endoff) { + int nmap = 1; + if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) break; /* Make sure we have a written extent. */ - nmap = 1; - error = xfs_bmapi_read(mp->m_rbmip, off, - mp->m_sb.sb_rbmblocks - off, &map, &nmap, + error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap, XFS_DATA_FORK); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) break; @@ -98,12 +138,48 @@ int xchk_rtbitmap( struct xfs_scrub *sc) { + struct xfs_mount *mp = sc->mp; + struct xchk_rtbitmap *rtb = sc->buf; int error; - /* Is the size of the rtbitmap correct? */ - if (sc->mp->m_rbmip->i_disk_size != - XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) { - xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino); + /* Is sb_rextents correct? */ + if (mp->m_sb.sb_rextents != rtb->rextents) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* Is sb_rextslog correct? */ + if (mp->m_sb.sb_rextslog != rtb->rextslog) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* + * Is sb_rbmblocks large enough to handle the current rt volume? In no + * case can we exceed 4bn bitmap blocks since the super field is a u32. + */ + if (rtb->rbmblocks > U32_MAX) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* The bitmap file length must be aligned to an fsblock. */ + if (mp->m_rbmip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* + * Is the bitmap file itself large enough to handle the rt volume? + * growfsrt expands the bitmap file before updating sb_rextents, so the + * file can be larger than sb_rbmblocks. 
+ */ + if (mp->m_rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); return 0; } @@ -116,12 +192,11 @@ xchk_rtbitmap( if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; - error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc); + error = xfs_rtalloc_query_all(mp, sc->tp, xchk_rtbitmap_rec, sc); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; + return error; -out: - return error; + return 0; } /* xref check that the extent is not free in the rtbitmap */ diff --git a/fs/xfs/scrub/rtbitmap.h b/fs/xfs/scrub/rtbitmap.h new file mode 100644 index 000000000000..85304ff019e1 --- /dev/null +++ b/fs/xfs/scrub/rtbitmap.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RTBITMAP_H__ +#define __XFS_SCRUB_RTBITMAP_H__ + +struct xchk_rtbitmap { + uint64_t rextents; + uint64_t rbmblocks; + unsigned int rextslog; + unsigned int resblks; +}; + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_setup_rtbitmap(struct xfs_scrub *sc, struct xchk_rtbitmap *rtb); +#else +# define xrep_setup_rtbitmap(sc, rtb) (0) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_RTBITMAP_H__ */ diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c new file mode 100644 index 000000000000..46f5d5f605c9 --- /dev/null +++ b/fs/xfs/scrub/rtbitmap_repair.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/rtbitmap.h" + +/* Set up to repair the realtime bitmap file metadata. */ +int +xrep_setup_rtbitmap( + struct xfs_scrub *sc, + struct xchk_rtbitmap *rtb) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long blocks = 0; + + /* + * Reserve enough blocks to write out a completely new bmbt for a + * maximally fragmented bitmap file. We do not hold the rtbitmap + * ILOCK yet, so this is entirely speculative. + */ + blocks = xfs_bmbt_calc_size(mp, mp->m_sb.sb_rbmblocks); + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + + rtb->resblks += blocks; + return 0; +} + +/* + * Make sure that the given range of the data fork of the realtime file is + * mapped to written blocks. The caller must ensure that the inode is joined + * to the transaction. + */ +STATIC int +xrep_rtbitmap_data_mappings( + struct xfs_scrub *sc, + xfs_filblks_t len) +{ + struct xfs_bmbt_irec map; + xfs_fileoff_t off = 0; + int error; + + ASSERT(sc->ip != NULL); + + while (off < len) { + int nmaps = 1; + + /* + * If we have a real extent mapping this block then we're + * in ok shape. + */ + error = xfs_bmapi_read(sc->ip, off, len - off, &map, &nmaps, + XFS_DATA_FORK); + if (error) + return error; + if (nmaps == 0) { + ASSERT(nmaps != 0); + return -EFSCORRUPTED; + } + + /* + * Written extents are ok. Holes are not filled because we + * do not know the freespace information. 
+ */ + if (xfs_bmap_is_written_extent(&map) || + map.br_startblock == HOLESTARTBLOCK) { + off = map.br_startoff + map.br_blockcount; + continue; + } + + /* + * If we find a delalloc reservation then something is very + * very wrong. Bail out. + */ + if (map.br_startblock == DELAYSTARTBLOCK) + return -EFSCORRUPTED; + + /* Make sure we're really converting an unwritten extent. */ + if (map.br_state != XFS_EXT_UNWRITTEN) { + ASSERT(map.br_state == XFS_EXT_UNWRITTEN); + return -EFSCORRUPTED; + } + + /* Make sure this block has a real zeroed extent mapped. */ + nmaps = 1; + error = xfs_bmapi_write(sc->tp, sc->ip, map.br_startoff, + map.br_blockcount, + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, + 0, &map, &nmaps); + if (error) + return error; + if (nmaps != 1) + return -EFSCORRUPTED; + + /* Commit new extent and all deferred work. */ + error = xrep_defer_finish(sc); + if (error) + return error; + + off = map.br_startoff + map.br_blockcount; + } + + return 0; +} + +/* Fix broken rt volume geometry. */ +STATIC int +xrep_rtbitmap_geometry( + struct xfs_scrub *sc, + struct xchk_rtbitmap *rtb) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = sc->tp; + + /* Superblock fields */ + if (mp->m_sb.sb_rextents != rtb->rextents) + xfs_trans_mod_sb(sc->tp, XFS_TRANS_SB_REXTENTS, + rtb->rextents - mp->m_sb.sb_rextents); + + if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, + rtb->rbmblocks - mp->m_sb.sb_rbmblocks); + + if (mp->m_sb.sb_rextslog != rtb->rextslog) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, + rtb->rextslog - mp->m_sb.sb_rextslog); + + /* Fix broken isize */ + sc->ip->i_disk_size = roundup_64(sc->ip->i_disk_size, + mp->m_sb.sb_blocksize); + + if (sc->ip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) + sc->ip->i_disk_size = XFS_FSB_TO_B(mp, rtb->rbmblocks); + + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return xrep_roll_trans(sc); +} + +/* Repair the realtime bitmap file metadata. */ +int +xrep_rtbitmap( + struct xfs_scrub *sc) +{ + struct xchk_rtbitmap *rtb = sc->buf; + struct xfs_mount *mp = sc->mp; + unsigned long long blocks = 0; + int error; + + /* Impossibly large rtbitmap means we can't touch the filesystem. */ + if (rtb->rbmblocks > U32_MAX) + return 0; + + /* + * If the size of the rt bitmap file is larger than what we reserved, + * figure out if we need to adjust the block reservation in the + * transaction. + */ + blocks = xfs_bmbt_calc_size(mp, rtb->rbmblocks); + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + if (blocks > rtb->resblks) { + error = xfs_trans_reserve_more(sc->tp, blocks, 0); + if (error) + return error; + + rtb->resblks += blocks; + } + + /* Fix inode core and forks. */ + error = xrep_metadata_inode_forks(sc); + if (error) + return error; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Ensure no unwritten extents. */ + error = xrep_rtbitmap_data_mappings(sc, rtb->rbmblocks); + if (error) + return error; + + /* Fix inconsistent bitmap geometry */ + return xrep_rtbitmap_geometry(sc, rtb); +} diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index 8b15c47408d0..fabd0ed9dfa6 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -31,6 +31,18 @@ * (potentially large) amount of data in pageable memory. */ +struct xchk_rtsummary { + struct xfs_rtalloc_args args; + + uint64_t rextents; + uint64_t rbmblocks; + uint64_t rsumsize; + unsigned int rsumlevels; + + /* Memory buffer for the summary comparison. */ + union xfs_suminfo_raw words[]; +}; + /* Set us up to check the rtsummary file. 
*/ int xchk_setup_rtsummary( @@ -38,8 +50,15 @@ xchk_setup_rtsummary( { struct xfs_mount *mp = sc->mp; char *descr; + struct xchk_rtsummary *rts; int error; + rts = kvzalloc(struct_size(rts, words, mp->m_blockwsize), + XCHK_GFP_FLAGS); + if (!rts) + return -ENOMEM; + sc->buf = rts; + /* * Create an xfile to construct a new rtsummary file. The xfile allows * us to avoid pinning kernel memory for this purpose. @@ -54,15 +73,14 @@ xchk_setup_rtsummary( if (error) return error; - /* Allocate a memory buffer for the summary comparison. */ - sc->buf = kvmalloc(mp->m_sb.sb_blocksize, XCHK_GFP_FLAGS); - if (!sc->buf) - return -ENOMEM; - error = xchk_install_live_inode(sc, mp->m_rsumip); if (error) return error; + error = xchk_ino_dqattach(sc); + if (error) + return error; + /* * Locking order requires us to take the rtbitmap first. We must be * careful to unlock it ourselves when we are done with the rtbitmap @@ -71,13 +89,29 @@ xchk_setup_rtsummary( */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); + + /* + * Now that we've locked the rtbitmap and rtsummary, we can't race with + * growfsrt trying to expand the summary or change the size of the rt + * volume. Hence it is safe to compute and check the geometry values. + */ + if (mp->m_sb.sb_rblocks) { + xfs_filblks_t rsumblocks; + int rextslog; + + rts->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); + rextslog = xfs_compute_rextslog(rts->rextents); + rts->rsumlevels = rextslog + 1; + rts->rbmblocks = xfs_rtbitmap_blockcount(mp, rts->rextents); + rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels, + rts->rbmblocks); + rts->rsumsize = XFS_FSB_TO_B(mp, rsumblocks); + } return 0; } /* Helper functions to record suminfo words in an xfile. */ -typedef unsigned int xchk_rtsumoff_t; - static inline int xfsum_load( struct xfs_scrub *sc, @@ -143,7 +177,7 @@ xchk_rtsum_record_free( /* Compute the relevant location in the rtsum file. */ rbmoff = xfs_rtx_to_rbmblock(mp, rec->ar_startext); - lenlog = XFS_RTBLOCKLOG(rec->ar_extcount); + lenlog = xfs_highbit64(rec->ar_extcount); offs = xfs_rtsumoffs(mp, lenlog, rbmoff); rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext); @@ -188,19 +222,29 @@ STATIC int xchk_rtsum_compare( struct xfs_scrub *sc) { - struct xfs_rtalloc_args args = { - .mp = sc->mp, - .tp = sc->tp, - }; - struct xfs_mount *mp = sc->mp; struct xfs_bmbt_irec map; - xfs_fileoff_t off; - xchk_rtsumoff_t sumoff = 0; - int nmap; + struct xfs_iext_cursor icur; + + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + struct xchk_rtsummary *rts = sc->buf; + xfs_fileoff_t off = 0; + xfs_fileoff_t endoff; + xfs_rtsumoff_t sumoff = 0; + int error = 0; + + rts->args.mp = sc->mp; + rts->args.tp = sc->tp; + + /* Mappings may not cross or lie beyond EOF. */ + endoff = XFS_B_TO_FSB(mp, ip->i_disk_size); + if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff); + return 0; + } - for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) { - union xfs_suminfo_raw *ondisk_info; - int error = 0; + while (off < endoff) { + int nmap = 1; if (xchk_should_terminate(sc, &error)) return error; @@ -208,8 +252,7 @@ xchk_rtsum_compare( return 0; /* Make sure we have a written extent. 
*/ - nmap = 1; - error = xfs_bmapi_read(mp->m_rsumip, off, 1, &map, &nmap, + error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap, XFS_DATA_FORK); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) return error; @@ -219,24 +262,33 @@ xchk_rtsum_compare( return 0; } + off += map.br_blockcount; + } + + for (off = 0; off < endoff; off++) { + union xfs_suminfo_raw *ondisk_info; + /* Read a block's worth of ondisk rtsummary file. */ - error = xfs_rtsummary_read_buf(&args, off); + error = xfs_rtsummary_read_buf(&rts->args, off); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) return error; /* Read a block's worth of computed rtsummary file. */ - error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize); + error = xfsum_copyout(sc, sumoff, rts->words, mp->m_blockwsize); if (error) { - xfs_rtbuf_cache_relse(&args); + xfs_rtbuf_cache_relse(&rts->args); return error; } - ondisk_info = xfs_rsumblock_infoptr(&args, 0); - if (memcmp(ondisk_info, sc->buf, - mp->m_blockwsize << XFS_WORDLOG) != 0) + ondisk_info = xfs_rsumblock_infoptr(&rts->args, 0); + if (memcmp(ondisk_info, rts->words, + mp->m_blockwsize << XFS_WORDLOG) != 0) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + xfs_rtbuf_cache_relse(&rts->args); + return error; + } - xfs_rtbuf_cache_relse(&args); + xfs_rtbuf_cache_relse(&rts->args); sumoff += mp->m_blockwsize; } @@ -249,8 +301,43 @@ xchk_rtsummary( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; + struct xchk_rtsummary *rts = sc->buf; int error = 0; + /* Is sb_rextents correct? */ + if (mp->m_sb.sb_rextents != rts->rextents) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + goto out_rbm; + } + + /* Is m_rsumlevels correct? */ + if (mp->m_rsumlevels != rts->rsumlevels) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* Is m_rsumsize correct? */ + if (mp->m_rsumsize != rts->rsumsize) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* The summary file length must be aligned to an fsblock. */ + if (mp->m_rsumip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* + * Is the summary file itself large enough to handle the rt volume? + * growfsrt expands the summary file before updating sb_rextents, so + * the file can be larger than rsumsize. + */ + if (mp->m_rsumip->i_disk_size < rts->rsumsize) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + /* Invoke the fork scrubber. 
*/ error = xchk_metadata_inode_forks(sc); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 4849efcaa33a..caf324c2b991 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -14,8 +14,6 @@ #include "xfs_inode.h" #include "xfs_quota.h" #include "xfs_qm.h" -#include "xfs_errortag.h" -#include "xfs_error.h" #include "xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -238,27 +236,31 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { [XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */ .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, - .scrub = xchk_bnobt, - .repair = xrep_notsupported, + .scrub = xchk_allocbt, + .repair = xrep_allocbt, + .repair_eval = xrep_revalidate_allocbt, }, [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */ .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, - .scrub = xchk_cntbt, - .repair = xrep_notsupported, + .scrub = xchk_allocbt, + .repair = xrep_allocbt, + .repair_eval = xrep_revalidate_allocbt, }, [XFS_SCRUB_TYPE_INOBT] = { /* inobt */ .type = ST_PERAG, .setup = xchk_setup_ag_iallocbt, - .scrub = xchk_inobt, - .repair = xrep_notsupported, + .scrub = xchk_iallocbt, + .repair = xrep_iallocbt, + .repair_eval = xrep_revalidate_iallocbt, }, [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */ .type = ST_PERAG, .setup = xchk_setup_ag_iallocbt, - .scrub = xchk_finobt, + .scrub = xchk_iallocbt, .has = xfs_has_finobt, - .repair = xrep_notsupported, + .repair = xrep_iallocbt, + .repair_eval = xrep_revalidate_iallocbt, }, [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */ .type = ST_PERAG, @@ -272,31 +274,31 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .setup = xchk_setup_ag_refcountbt, .scrub = xchk_refcountbt, .has = xfs_has_reflink, - .repair = xrep_notsupported, + .repair = xrep_refcountbt, }, [XFS_SCRUB_TYPE_INODE] = { /* inode record */ .type = ST_INODE, .setup = xchk_setup_inode, .scrub = xchk_inode, - .repair = xrep_notsupported, + .repair = xrep_inode, }, [XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */ .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_data, - .repair = xrep_notsupported, + .repair = xrep_bmap_data, }, [XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */ .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_attr, - .repair = xrep_notsupported, + .repair = xrep_bmap_attr, }, [XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */ .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_cow, - .repair = xrep_notsupported, + .repair = xrep_bmap_cow, }, [XFS_SCRUB_TYPE_DIR] = { /* directory */ .type = ST_INODE, @@ -326,33 +328,31 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_FS, .setup = xchk_setup_rtbitmap, .scrub = xchk_rtbitmap, - .has = xfs_has_realtime, - .repair = xrep_notsupported, + .repair = xrep_rtbitmap, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_FS, .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, - .has = xfs_has_realtime, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */ .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */ .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs 
summary counters */ .type = ST_FS, @@ -531,7 +531,10 @@ retry_op: /* Scrub for errors. */ check_start = xchk_stats_now(); - error = sc->ops->scrub(sc); + if ((sc->flags & XREP_ALREADY_FIXED) && sc->ops->repair_eval != NULL) + error = sc->ops->repair_eval(sc); + else + error = sc->ops->scrub(sc); run.scrub_ns += xchk_stats_elapsed_ns(check_start); if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) goto try_harder; @@ -542,23 +545,12 @@ retry_op: xchk_update_health(sc); - if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && - !(sc->flags & XREP_ALREADY_FIXED)) { - bool needs_fix = xchk_needs_repair(sc->sm); - - /* Userspace asked us to rebuild the structure regardless. */ - if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) - needs_fix = true; - - /* Let debug users force us into the repair routines. */ - if (XFS_TEST_ERROR(needs_fix, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) - needs_fix = true; - + if (xchk_could_repair(sc)) { /* * If userspace asked for a repair but it wasn't necessary, * report that back to userspace. */ - if (!needs_fix) { + if (!xrep_will_attempt(sc)) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED; goto out_nofix; } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 1ef9c6b4842a..7fc50654c4fe 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -35,6 +35,14 @@ struct xchk_meta_ops { /* Repair or optimize the metadata. */ int (*repair)(struct xfs_scrub *); + /* + * Re-scrub the metadata we repaired, in case there's extra work that + * we need to do to check our repair work. If this is NULL, we'll use + * the ->scrub function pointer, assuming that the regular scrub is + * sufficient. + */ + int (*repair_eval)(struct xfs_scrub *sc); + /* Decide if we even have this piece of metadata. */ bool (*has)(struct xfs_mount *); @@ -113,6 +121,7 @@ struct xfs_scrub { #define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */ #define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ #define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ +#define XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */ #define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ /* @@ -129,10 +138,8 @@ int xchk_superblock(struct xfs_scrub *sc); int xchk_agf(struct xfs_scrub *sc); int xchk_agfl(struct xfs_scrub *sc); int xchk_agi(struct xfs_scrub *sc); -int xchk_bnobt(struct xfs_scrub *sc); -int xchk_cntbt(struct xfs_scrub *sc); -int xchk_inobt(struct xfs_scrub *sc); -int xchk_finobt(struct xfs_scrub *sc); +int xchk_allocbt(struct xfs_scrub *sc); +int xchk_iallocbt(struct xfs_scrub *sc); int xchk_rmapbt(struct xfs_scrub *sc); int xchk_refcountbt(struct xfs_scrub *sc); int xchk_inode(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 38708fb9a5d7..ddff86713df3 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -12,8 +12,10 @@ #include "xfs_log_format.h" #include "xfs_inode.h" #include "xfs_symlink.h" +#include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/health.h" /* Set us up to scrub a symbolic link. */ int @@ -41,29 +43,37 @@ xchk_symlink( if (!S_ISLNK(VFS_I(ip)->i_mode)) return -ENOENT; + + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_SYMLINK_ZAPPED)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); len = ip->i_disk_size; /* Plausible size? 
*/ if (len > XFS_SYMLINK_MAXLEN || len <= 0) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } /* Inline symlink? */ if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { if (len > xfs_inode_data_fork_size(ip) || - len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip))) + len > strnlen(ifp->if_data, xfs_inode_data_fork_size(ip))) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } /* Remote symlink; must read the contents. */ error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; + return error; if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); -out: - return error; + + /* If a remote symlink is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_SYMLINK_ZAPPED); + return 0; } diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 29afa4851235..d0e24ffaf754 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -14,9 +14,12 @@ #include "xfs_btree.h" #include "xfs_ag.h" #include "xfs_rtbitmap.h" +#include "xfs_quota.h" +#include "xfs_quota_defs.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" +#include "scrub/quota.h" /* Figure out which block the btree cursor was pointing to. */ static inline xfs_fsblock_t diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 4a8bc6f3c8f2..6bbb4e8639dc 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -19,6 +19,7 @@ struct xfile; struct xfarray; struct xfarray_sortinfo; +struct xchk_dqiter; /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -106,6 +107,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \ { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ { XCHK_NEED_DRAIN, "need_drain" }, \ + { XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \ { XREP_ALREADY_FIXED, "already_fixed" } DECLARE_EVENT_CLASS(xchk_class, @@ -347,6 +349,54 @@ DEFINE_EVENT(xchk_fblock_error_class, name, \ DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_error); DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_warning); +#ifdef CONFIG_XFS_QUOTA +DECLARE_EVENT_CLASS(xchk_dqiter_class, + TP_PROTO(struct xchk_dqiter *cursor, uint64_t id), + TP_ARGS(cursor, id), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_dqtype_t, dqtype) + __field(xfs_ino_t, ino) + __field(unsigned long long, cur_id) + __field(unsigned long long, id) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + ), + TP_fast_assign( + __entry->dev = cursor->sc->ip->i_mount->m_super->s_dev; + __entry->dqtype = cursor->dqtype; + __entry->ino = cursor->quota_ip->i_ino; + __entry->cur_id = cursor->id; + __entry->startoff = cursor->bmap.br_startoff; + __entry->startblock = cursor->bmap.br_startblock; + __entry->blockcount = cursor->bmap.br_blockcount; + __entry->state = cursor->bmap.br_state; + __entry->id = id; + ), + TP_printk("dev %d:%d dquot type %s ino 0x%llx cursor_id 0x%llx startoff 0x%llx startblock 0x%llx blockcount 0x%llx state %u id 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->dqtype, XFS_DQTYPE_STRINGS), + __entry->ino, + __entry->cur_id, + __entry->startoff, + __entry->startblock, + __entry->blockcount, + __entry->state, + __entry->id) +); + +#define DEFINE_SCRUB_DQITER_EVENT(name) \ +DEFINE_EVENT(xchk_dqiter_class, name, \ + TP_PROTO(struct xchk_dqiter *cursor, uint64_t id), \ 
+ TP_ARGS(cursor, id)) +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_revalidate_bmap); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_bmap); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_incore); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter); +#endif /* CONFIG_XFS_QUOTA */ + TRACE_EVENT(xchk_incomplete, TP_PROTO(struct xfs_scrub *sc, void *ret_ip), TP_ARGS(sc, ret_ip), @@ -1172,37 +1222,125 @@ DEFINE_EVENT(xrep_rmap_class, name, \ xfs_agblock_t agbno, xfs_extlen_t len, \ uint64_t owner, uint64_t offset, unsigned int flags), \ TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) -DEFINE_REPAIR_RMAP_EVENT(xrep_alloc_extent_fn); -DEFINE_REPAIR_RMAP_EVENT(xrep_ialloc_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap); DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn); -DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap); -TRACE_EVENT(xrep_refcount_extent_fn, +TRACE_EVENT(xrep_abt_found, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - struct xfs_refcount_irec *irec), - TP_ARGS(mp, agno, irec), + const struct xfs_alloc_rec_incore *rec), + TP_ARGS(mp, agno, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) - __field(xfs_nlink_t, refcount) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; - __entry->startblock = irec->rc_startblock; - __entry->blockcount = irec->rc_blockcount; - __entry->refcount = irec->rc_refcount; + __entry->startblock = rec->ar_startblock; + __entry->blockcount = rec->ar_blockcount; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startblock, + __entry->blockcount) +) + +TRACE_EVENT(xrep_ibt_found, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + const struct xfs_inobt_rec_incore *rec), + TP_ARGS(mp, agno, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(uint16_t, holemask) + __field(uint8_t, count) + __field(uint8_t, freecount) + __field(uint64_t, freemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = rec->ir_startino; + __entry->holemask = rec->ir_holemask; + __entry->count = rec->ir_count; + __entry->freecount = rec->ir_freecount; + __entry->freemask = rec->ir_free; + ), + TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x count 0x%x freecount 0x%x freemask 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startino, + __entry->holemask, + __entry->count, + __entry->freecount, + __entry->freemask) +) + +TRACE_EVENT(xrep_refc_found, + TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *rec), + TP_ARGS(pag, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->domain = rec->rc_domain; + __entry->startblock = rec->rc_startblock; + __entry->blockcount = rec->rc_blockcount; + __entry->refcount = rec->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + 
__print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount) ) +TRACE_EVENT(xrep_bmap_found, + TP_PROTO(struct xfs_inode *ip, int whichfork, + struct xfs_bmbt_irec *irec), + TP_ARGS(ip, whichfork, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(xfs_fileoff_t, lblk) + __field(xfs_filblks_t, len) + __field(xfs_fsblock_t, pblk) + __field(int, state) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->whichfork = whichfork; + __entry->lblk = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->pblk = irec->br_startblock; + __entry->state = irec->br_state; + ), + TP_printk("dev %d:%d ino 0x%llx whichfork %s fileoff 0x%llx fsbcount 0x%llx startblock 0x%llx state %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __entry->lblk, + __entry->len, + __entry->pblk, + __entry->state) +); + TRACE_EVENT(xrep_findroot_block, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, uint32_t magic, uint16_t level), @@ -1299,39 +1437,327 @@ TRACE_EVENT(xrep_reset_counters, MAJOR(__entry->dev), MINOR(__entry->dev)) ) -TRACE_EVENT(xrep_ialloc_insert, +DECLARE_EVENT_CLASS(xrep_newbt_extent_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t startino, uint16_t holemask, uint8_t count, - uint8_t freecount, uint64_t freemask), - TP_ARGS(mp, agno, startino, holemask, count, freecount, freemask), + xfs_agblock_t agbno, xfs_extlen_t len, + int64_t owner), + TP_ARGS(mp, agno, agbno, len, owner), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) - __field(xfs_agino_t, startino) - __field(uint16_t, holemask) - __field(uint8_t, count) - __field(uint8_t, freecount) - __field(uint64_t, freemask) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(int64_t, owner) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; - __entry->startino = startino; - __entry->holemask = holemask; - __entry->count = count; - __entry->freecount = freecount; - __entry->freemask = freemask; + __entry->agbno = agbno; + __entry->len = len; + __entry->owner = owner; ), - TP_printk("dev %d:%d agno 0x%x startino 0x%x holemask 0x%x count %u freecount %u freemask 0x%llx", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, - __entry->startino, - __entry->holemask, - __entry->count, - __entry->freecount, - __entry->freemask) + __entry->agbno, + __entry->len, + __entry->owner) +); +#define DEFINE_NEWBT_EXTENT_EVENT(name) \ +DEFINE_EVENT(xrep_newbt_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len, \ + int64_t owner), \ + TP_ARGS(mp, agno, agbno, len, owner)) +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_ag_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_claim_block); + +DECLARE_EVENT_CLASS(xrep_dinode_class, + TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip), + TP_ARGS(sc, dip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(uint16_t, mode) + __field(uint8_t, version) + __field(uint8_t, format) + __field(uint32_t, uid) + __field(uint32_t, gid) + __field(uint64_t, size) + __field(uint64_t, nblocks) + 
__field(uint32_t, extsize) + __field(uint32_t, nextents) + __field(uint16_t, anextents) + __field(uint8_t, forkoff) + __field(uint8_t, aformat) + __field(uint16_t, flags) + __field(uint32_t, gen) + __field(uint64_t, flags2) + __field(uint32_t, cowextsize) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->mode = be16_to_cpu(dip->di_mode); + __entry->version = dip->di_version; + __entry->format = dip->di_format; + __entry->uid = be32_to_cpu(dip->di_uid); + __entry->gid = be32_to_cpu(dip->di_gid); + __entry->size = be64_to_cpu(dip->di_size); + __entry->nblocks = be64_to_cpu(dip->di_nblocks); + __entry->extsize = be32_to_cpu(dip->di_extsize); + __entry->nextents = be32_to_cpu(dip->di_nextents); + __entry->anextents = be16_to_cpu(dip->di_anextents); + __entry->forkoff = dip->di_forkoff; + __entry->aformat = dip->di_aformat; + __entry->flags = be16_to_cpu(dip->di_flags); + __entry->gen = be32_to_cpu(dip->di_gen); + __entry->flags2 = be64_to_cpu(dip->di_flags2); + __entry->cowextsize = be32_to_cpu(dip->di_cowextsize); + ), + TP_printk("dev %d:%d ino 0x%llx mode 0x%x version %u format %u uid %u gid %u disize 0x%llx nblocks 0x%llx extsize %u nextents %u anextents %u forkoff 0x%x aformat %u flags 0x%x gen 0x%x flags2 0x%llx cowextsize %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->mode, + __entry->version, + __entry->format, + __entry->uid, + __entry->gid, + __entry->size, + __entry->nblocks, + __entry->extsize, + __entry->nextents, + __entry->anextents, + __entry->forkoff, + __entry->aformat, + __entry->flags, + __entry->gen, + __entry->flags2, + __entry->cowextsize) ) +#define DEFINE_REPAIR_DINODE_EVENT(name) \ +DEFINE_EVENT(xrep_dinode_class, name, \ + TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip), \ + TP_ARGS(sc, dip)) +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_header); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_mode); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_flags); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_size); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_extsize_hints); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_symlink); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dir); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_fixed); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_forks); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dfork); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_afork); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_ensure_forkoff); + +DECLARE_EVENT_CLASS(xrep_inode_class, + TP_PROTO(struct xfs_scrub *sc), + TP_ARGS(sc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_rfsblock_t, nblocks) + __field(uint16_t, flags) + __field(uint64_t, flags2) + __field(uint32_t, nextents) + __field(uint8_t, format) + __field(uint32_t, anextents) + __field(uint8_t, aformat) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->size = sc->ip->i_disk_size; + __entry->nblocks = sc->ip->i_nblocks; + __entry->flags = sc->ip->i_diflags; + __entry->flags2 = sc->ip->i_diflags2; + __entry->nextents = sc->ip->i_df.if_nextents; + __entry->format = sc->ip->i_df.if_format; + __entry->anextents = sc->ip->i_af.if_nextents; + __entry->aformat = sc->ip->i_af.if_format; + ), + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx nblocks 0x%llx flags 0x%x flags2 0x%llx nextents %u format %u anextents %u aformat %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->nblocks, + __entry->flags, + 
__entry->flags2, + __entry->nextents, + __entry->format, + __entry->anextents, + __entry->aformat) +) + +#define DEFINE_REPAIR_INODE_EVENT(name) \ +DEFINE_EVENT(xrep_inode_class, name, \ + TP_PROTO(struct xfs_scrub *sc), \ + TP_ARGS(sc)) +DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockcounts); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_ids); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_flags); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockdir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_sfdir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_dir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_fixed); + +TRACE_EVENT(xrep_dinode_count_rmaps, + TP_PROTO(struct xfs_scrub *sc, xfs_rfsblock_t data_blocks, + xfs_rfsblock_t rt_blocks, xfs_rfsblock_t attr_blocks, + xfs_extnum_t data_extents, xfs_extnum_t rt_extents, + xfs_aextnum_t attr_extents), + TP_ARGS(sc, data_blocks, rt_blocks, attr_blocks, data_extents, + rt_extents, attr_extents), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_rfsblock_t, data_blocks) + __field(xfs_rfsblock_t, rt_blocks) + __field(xfs_rfsblock_t, attr_blocks) + __field(xfs_extnum_t, data_extents) + __field(xfs_extnum_t, rt_extents) + __field(xfs_aextnum_t, attr_extents) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->data_blocks = data_blocks; + __entry->rt_blocks = rt_blocks; + __entry->attr_blocks = attr_blocks; + __entry->data_extents = data_extents; + __entry->rt_extents = rt_extents; + __entry->attr_extents = attr_extents; + ), + TP_printk("dev %d:%d ino 0x%llx dblocks 0x%llx rtblocks 0x%llx ablocks 0x%llx dextents %llu rtextents %llu aextents %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->data_blocks, + __entry->rt_blocks, + __entry->attr_blocks, + __entry->data_extents, + __entry->rt_extents, + __entry->attr_extents) +); + +TRACE_EVENT(xrep_cow_mark_file_range, + TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t startblock, + xfs_fileoff_t startoff, xfs_filblks_t blockcount), + TP_ARGS(ip, startblock, startoff, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, startblock) + __field(xfs_fileoff_t, startoff) + __field(xfs_filblks_t, blockcount) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->startoff = startoff; + __entry->startblock = startblock; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d ino 0x%llx fileoff 0x%llx startblock 0x%llx fsbcount 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->startoff, + __entry->startblock, + __entry->blockcount) +); + +TRACE_EVENT(xrep_cow_replace_mapping, + TP_PROTO(struct xfs_inode *ip, const struct xfs_bmbt_irec *irec, + xfs_fsblock_t new_startblock, xfs_extlen_t new_blockcount), + TP_ARGS(ip, irec, new_startblock, new_blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, startblock) + __field(xfs_fileoff_t, startoff) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(xfs_fsblock_t, new_startblock) + __field(xfs_extlen_t, new_blockcount) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->startoff = irec->br_startoff; + __entry->startblock = irec->br_startblock; + __entry->blockcount = irec->br_blockcount; + __entry->state = irec->br_state; + __entry->new_startblock = new_startblock; + __entry->new_blockcount = new_blockcount; + ), + 
TP_printk("dev %d:%d ino 0x%llx startoff 0x%llx startblock 0x%llx fsbcount 0x%llx state 0x%x new_startblock 0x%llx new_fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->startoff, + __entry->startblock, + __entry->blockcount, + __entry->state, + __entry->new_startblock, + __entry->new_blockcount) +); + +TRACE_EVENT(xrep_cow_free_staging, + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, + xfs_extlen_t blockcount), + TP_ARGS(pag, agbno, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, blockcount) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->agbno = agbno; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->blockcount) +); + +#ifdef CONFIG_XFS_QUOTA +DECLARE_EVENT_CLASS(xrep_dquot_class, + TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id), + TP_ARGS(mp, type, id), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint8_t, type) + __field(uint32_t, id) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->id = id; + __entry->type = type; + ), + TP_printk("dev %d:%d type %s id 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS), + __entry->id) +); + +#define DEFINE_XREP_DQUOT_EVENT(name) \ +DEFINE_EVENT(xrep_dquot_class, name, \ + TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id), \ + TP_ARGS(mp, type, id)) +DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item); +DEFINE_XREP_DQUOT_EVENT(xrep_disk_dquot); +DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item_fill_bmap_hole); +#endif /* CONFIG_XFS_QUOTA */ + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h index 4ecac01363d9..62b9c506fdd1 100644 --- a/fs/xfs/scrub/xfarray.h +++ b/fs/xfs/scrub/xfarray.h @@ -54,6 +54,28 @@ static inline int xfarray_append(struct xfarray *array, const void *ptr) uint64_t xfarray_length(struct xfarray *array); int xfarray_load_next(struct xfarray *array, xfarray_idx_t *idx, void *rec); +/* + * Iterate the non-null elements in a sparse xfarray. Callers should + * initialize *idx to XFARRAY_CURSOR_INIT before the first call; on return, it + * will be set to one more than the index of the record that was retrieved. + * Returns 1 if a record was retrieved, 0 if there weren't any more records, or + * a negative errno. + */ +static inline int +xfarray_iter( + struct xfarray *array, + xfarray_idx_t *idx, + void *rec) +{ + int ret = xfarray_load_next(array, idx, rec); + + if (ret == -ENODATA) + return 0; + if (ret == 0) + return 1; + return ret; +} + /* Declarations for xfile array sort functionality. 
*/ typedef cmp_func_t xfarray_cmp_fn; diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 36fe2abb16e6..9e02111bd890 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -33,8 +33,6 @@ struct kmem_cache *xfs_attrd_cache; static const struct xfs_item_ops xfs_attri_item_ops; static const struct xfs_item_ops xfs_attrd_item_ops; -static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp, - struct xfs_attri_log_item *attrip); static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip) { @@ -310,47 +308,6 @@ xfs_attrd_item_intent( return &ATTRD_ITEM(lip)->attrd_attrip->attri_item; } -/* - * Performs one step of an attribute update intent and marks the attrd item - * dirty.. An attr operation may be a set or a remove. Note that the - * transaction is marked dirty regardless of whether the operation succeeds or - * fails to support the ATTRI/ATTRD lifecycle rules. - */ -STATIC int -xfs_xattri_finish_update( - struct xfs_attr_intent *attr, - struct xfs_attrd_log_item *attrdp) -{ - struct xfs_da_args *args = attr->xattri_da_args; - int error; - - if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { - error = -EIO; - goto out; - } - - error = xfs_attr_set_iter(attr); - if (!error && attr->xattri_dela_state != XFS_DAS_DONE) - error = -EAGAIN; -out: - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the ATTRI and frees the ATTRD - * 2.) shuts down the filesystem - */ - args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - - /* - * attr intent/done items are null when logged attributes are disabled - */ - if (attrdp) - set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); - - return error; -} - /* Log an attr to the intent item. */ STATIC void xfs_attr_log_item( @@ -360,9 +317,6 @@ xfs_attr_log_item( { struct xfs_attri_log_format *attrp; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags); - /* * At this point the xfs_attr_intent has been constructed, and we've * created the log intent. Fill in the attri log item and log format @@ -419,7 +373,6 @@ xfs_attr_create_intent( } attrip = xfs_attri_init(mp, attr->xattri_nameval); - xfs_trans_add_item(tp, &attrip->attri_item); xfs_attr_log_item(tp, attrip, attr); return &attrip->attri_item; @@ -447,23 +400,33 @@ xfs_attr_finish_item( struct xfs_btree_cur **state) { struct xfs_attr_intent *attr; - struct xfs_attrd_log_item *done_item = NULL; + struct xfs_da_args *args; int error; attr = container_of(item, struct xfs_attr_intent, xattri_list); - if (done) - done_item = ATTRD_ITEM(done); + args = attr->xattri_da_args; - /* - * Always reset trans after EAGAIN cycle - * since the transaction is new - */ - attr->xattri_da_args->trans = tp; + /* Reset trans after EAGAIN cycle since the transaction is new */ + args->trans = tp; - error = xfs_xattri_finish_update(attr, done_item); - if (error != -EAGAIN) - xfs_attr_free_item(attr); + if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { + error = -EIO; + goto out; + } + /* If an attr removal is trivially complete, we're done. 
*/ + if (attr->xattri_op_flags == XFS_ATTRI_OP_FLAGS_REMOVE && + !xfs_inode_hasattr(args->dp)) { + error = 0; + goto out; + } + + error = xfs_attr_set_iter(attr); + if (!error && attr->xattri_dela_state != XFS_DAS_DONE) + return -EAGAIN; + +out: + xfs_attr_free_item(attr); return error; } @@ -532,41 +495,22 @@ xfs_attri_validate( return xfs_verify_ino(mp, attrp->alfi_ino); } -/* - * Process an attr intent item that was recovered from the log. We need to - * delete the attr that it describes. - */ -STATIC int -xfs_attri_item_recover( - struct xfs_log_item *lip, - struct list_head *capture_list) +static inline struct xfs_attr_intent * +xfs_attri_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_attri_log_format *attrp, + struct xfs_inode **ipp, + struct xfs_attri_log_nameval *nv) { - struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); struct xfs_attr_intent *attr; - struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_inode *ip; struct xfs_da_args *args; - struct xfs_trans *tp; - struct xfs_trans_res resv; - struct xfs_attri_log_format *attrp; - struct xfs_attri_log_nameval *nv = attrip->attri_nameval; - int error; - int total; int local; - struct xfs_attrd_log_item *done_item = NULL; - - /* - * First check the validity of the attr described by the ATTRI. If any - * are bad, then assume that all are bad and just toss the ATTRI. - */ - attrp = &attrip->attri_format; - if (!xfs_attri_validate(mp, attrp) || - !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) - return -EFSCORRUPTED; + int error; - error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); + error = xlog_recover_iget(mp, attrp->alfi_ino, ipp); if (error) - return error; + return ERR_PTR(error); attr = kmem_zalloc(sizeof(struct xfs_attr_intent) + sizeof(struct xfs_da_args), KM_NOFS); @@ -584,7 +528,7 @@ xfs_attri_item_recover( attr->xattri_nameval = xfs_attri_log_nameval_get(nv); ASSERT(attr->xattri_nameval); - args->dp = ip; + args->dp = *ipp; args->geo = mp->m_attr_geo; args->whichfork = XFS_ATTR_FORK; args->name = nv->name.i_addr; @@ -608,43 +552,65 @@ xfs_attri_item_recover( attr->xattri_dela_state = xfs_attr_init_add_state(args); break; case XFS_ATTRI_OP_FLAGS_REMOVE: - if (!xfs_inode_hasattr(args->dp)) - goto out; attr->xattri_dela_state = xfs_attr_init_remove_state(args); break; - default: - ASSERT(0); - error = -EFSCORRUPTED; - goto out; } + xfs_defer_add_item(dfp, &attr->xattri_list); + return attr; +} + +/* + * Process an attr intent item that was recovered from the log. We need to + * delete the attr that it describes. + */ +STATIC int +xfs_attr_recover_work( + struct xfs_defer_pending *dfp, + struct list_head *capture_list) +{ + struct xfs_log_item *lip = dfp->dfp_intent; + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + struct xfs_attr_intent *attr; + struct xfs_mount *mp = lip->li_log->l_mp; + struct xfs_inode *ip; + struct xfs_da_args *args; + struct xfs_trans *tp; + struct xfs_trans_res resv; + struct xfs_attri_log_format *attrp; + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; + int error; + int total; + + /* + * First check the validity of the attr described by the ATTRI. If any + * are bad, then assume that all are bad and just toss the ATTRI. 
+ */ + attrp = &attrip->attri_format; + if (!xfs_attri_validate(mp, attrp) || + !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) + return -EFSCORRUPTED; + + attr = xfs_attri_recover_work(mp, dfp, attrp, &ip, nv); + if (IS_ERR(attr)) + return PTR_ERR(attr); + args = attr->xattri_da_args; + xfs_init_attr_trans(args, &resv, &total); resv = xlog_recover_resv(&resv); error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp); if (error) - goto out; - + return error; args->trans = tp; - done_item = xfs_trans_get_attrd(tp, attrip); xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_xattri_finish_update(attr, done_item); - if (error == -EAGAIN) { - /* - * There's more work to do, so add the intent item to this - * transaction so that we can continue it later. - */ - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list); - error = xfs_defer_ops_capture_and_commit(tp, capture_list); - if (error) - goto out_unlock; - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_irele(ip); - return 0; - } + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &attrip->attri_format, + sizeof(attrip->attri_format)); if (error) { xfs_trans_cancel(tp); goto out_unlock; @@ -654,18 +620,16 @@ xfs_attri_item_recover( out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_irele(ip); -out: - xfs_attr_free_item(attr); return error; } /* Re-log an intent item to push the log tail forward. */ static struct xfs_log_item * -xfs_attri_item_relog( +xfs_attr_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_attrd_log_item *attrdp; struct xfs_attri_log_item *old_attrip; struct xfs_attri_log_item *new_attrip; struct xfs_attri_log_format *new_attrp; @@ -674,10 +638,6 @@ xfs_attri_item_relog( old_attrip = ATTRI_ITEM(intent); old_attrp = &old_attrip->attri_format; - tp->t_flags |= XFS_TRANS_DIRTY; - attrdp = xfs_trans_get_attrd(tp, old_attrip); - set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); - /* * Create a new log item that shares the same name/value buffer as the * old log item. @@ -691,12 +651,43 @@ xfs_attri_item_relog( new_attrp->alfi_name_len = old_attrp->alfi_name_len; new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter; - xfs_trans_add_item(tp, &new_attrip->attri_item); - set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags); - return &new_attrip->attri_item; } +/* Get an ATTRD so we can process all the attrs. 
*/ +static struct xfs_log_item * +xfs_attr_create_done( + struct xfs_trans *tp, + struct xfs_log_item *intent, + unsigned int count) +{ + struct xfs_attri_log_item *attrip; + struct xfs_attrd_log_item *attrdp; + + attrip = ATTRI_ITEM(intent); + + attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); + + xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD, + &xfs_attrd_item_ops); + attrdp->attrd_attrip = attrip; + attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; + + return &attrdp->attrd_item; +} + +const struct xfs_defer_op_type xfs_attr_defer_type = { + .name = "attr", + .max_items = 1, + .create_intent = xfs_attr_create_intent, + .abort_intent = xfs_attr_abort_intent, + .create_done = xfs_attr_create_done, + .finish_item = xfs_attr_finish_item, + .cancel_item = xfs_attr_cancel_item, + .recover_work = xfs_attr_recover_work, + .relog_intent = xfs_attr_relog_intent, +}; + STATIC int xlog_recover_attri_commit_pass2( struct xlog *log, @@ -767,63 +758,13 @@ xlog_recover_attri_commit_pass2( attrip = xfs_attri_init(mp, nv); memcpy(&attrip->attri_format, attri_formatp, len); - /* - * The ATTRI has two references. One for the ATTRD and one for ATTRI to - * ensure it makes it into the AIL. Insert the ATTRI into the AIL - * directly and drop the ATTRI reference. Note that - * xfs_trans_ail_update() drops the AIL lock. - */ - xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn); - xfs_attri_release(attrip); + xlog_recover_intent_item(log, &attrip->attri_item, lsn, + &xfs_attr_defer_type); xfs_attri_log_nameval_put(nv); return 0; } /* - * This routine is called to allocate an "attr free done" log item. - */ -static struct xfs_attrd_log_item * -xfs_trans_get_attrd(struct xfs_trans *tp, - struct xfs_attri_log_item *attrip) -{ - struct xfs_attrd_log_item *attrdp; - - ASSERT(tp != NULL); - - attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); - - xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD, - &xfs_attrd_item_ops); - attrdp->attrd_attrip = attrip; - attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; - - xfs_trans_add_item(tp, &attrdp->attrd_item); - return attrdp; -} - -/* Get an ATTRD so we can process all the attrs. */ -static struct xfs_log_item * -xfs_attr_create_done( - struct xfs_trans *tp, - struct xfs_log_item *intent, - unsigned int count) -{ - if (!intent) - return NULL; - - return &xfs_trans_get_attrd(tp, ATTRI_ITEM(intent))->attrd_item; -} - -const struct xfs_defer_op_type xfs_attr_defer_type = { - .max_items = 1, - .create_intent = xfs_attr_create_intent, - .abort_intent = xfs_attr_abort_intent, - .create_done = xfs_attr_create_done, - .finish_item = xfs_attr_finish_item, - .cancel_item = xfs_attr_cancel_item, -}; - -/* * This routine is called when an ATTRD format structure is found in a committed * transaction in the log. Its purpose is to cancel the corresponding ATTRI if * it was still in the log. 
To do this it searches the AIL for the ATTRI with @@ -857,9 +798,7 @@ static const struct xfs_item_ops xfs_attri_item_ops = { .iop_format = xfs_attri_item_format, .iop_unpin = xfs_attri_item_unpin, .iop_release = xfs_attri_item_release, - .iop_recover = xfs_attri_item_recover, .iop_match = xfs_attri_item_match, - .iop_relog = xfs_attri_item_relog, }; const struct xlog_recover_item_ops xlog_attri_item_ops = { diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 99bbbe1a0e44..e368ad671e26 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -56,14 +56,13 @@ xfs_attr_shortform_list( struct xfs_attrlist_cursor_kern *cursor = &context->cursor; struct xfs_inode *dp = context->dp; struct xfs_attr_sf_sort *sbuf, *sbp; - struct xfs_attr_shortform *sf; + struct xfs_attr_sf_hdr *sf = dp->i_af.if_data; struct xfs_attr_sf_entry *sfe; int sbsize, nsbuf, count, i; int error = 0; - sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; ASSERT(sf != NULL); - if (!sf->hdr.count) + if (!sf->count) return 0; trace_xfs_attr_list_sf(context); @@ -79,8 +78,8 @@ xfs_attr_shortform_list( */ if (context->bufsize == 0 || (XFS_ISRESET_CURSOR(cursor) && - (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) { - for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + (dp->i_af.if_bytes + sf->count * 16) < context->bufsize)) { + for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) { if (XFS_IS_CORRUPT(context->dp->i_mount, !xfs_attr_namecheck(sfe->nameval, sfe->namelen))) @@ -109,7 +108,7 @@ xfs_attr_shortform_list( /* * It didn't all fit, so we have to sort everything on hashval. */ - sbsize = sf->hdr.count * sizeof(*sbuf); + sbsize = sf->count * sizeof(*sbuf); sbp = sbuf = kmem_alloc(sbsize, KM_NOFS); /* @@ -117,7 +116,7 @@ xfs_attr_shortform_list( * the relevant info from only those that match into a buffer. */ nsbuf = 0; - for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) { if (unlikely( ((char *)sfe < (char *)sf) || ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) { diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e736a0844c89..52fb8a148b7d 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -221,51 +221,6 @@ static const struct xfs_item_ops xfs_bud_item_ops = { .iop_intent = xfs_bud_item_intent, }; -static struct xfs_bud_log_item * -xfs_trans_get_bud( - struct xfs_trans *tp, - struct xfs_bui_log_item *buip) -{ - struct xfs_bud_log_item *budp; - - budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, - &xfs_bud_item_ops); - budp->bud_buip = buip; - budp->bud_format.bud_bui_id = buip->bui_format.bui_id; - - xfs_trans_add_item(tp, &budp->bud_item); - return budp; -} - -/* - * Finish an bmap update and log it to the BUD. Note that the - * transaction is marked dirty regardless of whether the bmap update - * succeeds or fails to support the BUI/BUD lifecycle rules. - */ -static int -xfs_trans_log_finish_bmap_update( - struct xfs_trans *tp, - struct xfs_bud_log_item *budp, - struct xfs_bmap_intent *bi) -{ - int error; - - error = xfs_bmap_finish_one(tp, bi); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the BUI and frees the BUD - * 2.) 
shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); - - return error; -} - /* Sort bmap intents by inode. */ static int xfs_bmap_update_diff_items( @@ -314,9 +269,6 @@ xfs_bmap_update_log_item( uint next_extent; struct xfs_map_extent *map; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from @@ -346,7 +298,6 @@ xfs_bmap_update_create_intent( ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); - xfs_trans_add_item(tp, &buip->bui_item); if (sort) list_sort(mp, items, xfs_bmap_update_diff_items); list_for_each_entry(bi, items, bi_list) @@ -354,14 +305,23 @@ xfs_bmap_update_create_intent( return &buip->bui_item; } -/* Get an BUD so we can process all the deferred rmap updates. */ +/* Get an BUD so we can process all the deferred bmap updates. */ static struct xfs_log_item * xfs_bmap_update_create_done( struct xfs_trans *tp, struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item; + struct xfs_bui_log_item *buip = BUI_ITEM(intent); + struct xfs_bud_log_item *budp; + + budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, + &xfs_bud_item_ops); + budp->bud_buip = buip; + budp->bud_format.bud_bui_id = buip->bui_format.bui_id; + + return &budp->bud_item; } /* Take a passive ref to the AG containing the space we're mapping. */ @@ -392,7 +352,7 @@ xfs_bmap_update_put_group( xfs_perag_intent_put(bi->bi_pag); } -/* Process a deferred rmap update. */ +/* Process a deferred bmap update. */ STATIC int xfs_bmap_update_finish_item( struct xfs_trans *tp, @@ -405,7 +365,7 @@ xfs_bmap_update_finish_item( bi = container_of(item, struct xfs_bmap_intent, bi_list); - error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi); + error = xfs_bmap_finish_one(tp, bi); if (!error && bi->bi_bmap.br_blockcount > 0) { ASSERT(bi->bi_type == XFS_BMAP_UNMAP); return -EAGAIN; @@ -437,15 +397,6 @@ xfs_bmap_update_cancel_item( kmem_cache_free(xfs_bmap_intent_cache, bi); } -const struct xfs_defer_op_type xfs_bmap_update_defer_type = { - .max_items = XFS_BUI_MAX_FAST_EXTENTS, - .create_intent = xfs_bmap_update_create_intent, - .abort_intent = xfs_bmap_update_abort_intent, - .create_done = xfs_bmap_update_create_done, - .finish_item = xfs_bmap_update_finish_item, - .cancel_item = xfs_bmap_update_cancel_item, -}; - /* Is this recovered BUI ok? */ static inline bool xfs_bui_validate( @@ -480,23 +431,53 @@ xfs_bui_validate( return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } +static inline struct xfs_bmap_intent * +xfs_bui_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_inode **ipp, + struct xfs_map_extent *map) +{ + struct xfs_bmap_intent *bi; + int error; + + error = xlog_recover_iget(mp, map->me_owner, ipp); + if (error) + return ERR_PTR(error); + + bi = kmem_cache_zalloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); + bi->bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + bi->bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + bi->bi_bmap.br_startblock = map->me_startblock; + bi->bi_bmap.br_startoff = map->me_startoff; + bi->bi_bmap.br_blockcount = map->me_len; + bi->bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? 
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + bi->bi_owner = *ipp; + xfs_bmap_update_get_group(mp, bi); + + xfs_defer_add_item(dfp, &bi->bi_list); + return bi; +} + /* * Process a bmap update intent item that was recovered from the log. * We need to update some inode's bmbt. */ STATIC int -xfs_bui_item_recover( - struct xfs_log_item *lip, +xfs_bmap_recover_work( + struct xfs_defer_pending *dfp, struct list_head *capture_list) { - struct xfs_bmap_intent fake = { }; struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_map_extent *map; - struct xfs_bud_log_item *budp; + struct xfs_bmap_intent *work; int iext_delta; int error = 0; @@ -507,13 +488,9 @@ xfs_bui_item_recover( } map = &buip->bui_format.bui_extents[0]; - fake.bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - fake.bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; - - error = xlog_recover_iget(mp, map->me_owner, &ip); - if (error) - return error; + work = xfs_bui_recover_work(mp, dfp, &ip, map); + if (IS_ERR(work)) + return PTR_ERR(work); /* Allocate transaction and do the work. */ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); @@ -522,42 +499,27 @@ xfs_bui_item_recover( if (error) goto err_rele; - budp = xfs_trans_get_bud(tp, buip); xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - if (fake.bi_type == XFS_BMAP_MAP) + if (work->bi_type == XFS_BMAP_MAP) iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT; else iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; - error = xfs_iext_count_may_overflow(ip, fake.bi_whichfork, iext_delta); + error = xfs_iext_count_may_overflow(ip, work->bi_whichfork, iext_delta); if (error == -EFBIG) error = xfs_iext_count_upgrade(tp, ip, iext_delta); if (error) goto err_cancel; - fake.bi_owner = ip; - fake.bi_bmap.br_startblock = map->me_startblock; - fake.bi_bmap.br_startoff = map->me_startoff; - fake.bi_bmap.br_blockcount = map->me_len; - fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - - xfs_bmap_update_get_group(mp, &fake); - error = xfs_trans_log_finish_bmap_update(tp, budp, &fake); + error = xlog_recover_finish_intent(tp, dfp); if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map, - sizeof(*map)); - xfs_bmap_update_put_group(&fake); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &buip->bui_format, sizeof(buip->bui_format)); if (error) goto err_cancel; - if (fake.bi_bmap.br_blockcount > 0) { - ASSERT(fake.bi_type == XFS_BMAP_UNMAP); - xfs_bmap_unmap_extent(tp, ip, &fake.bi_bmap); - } - /* * Commit transaction, which frees the transaction and saves the inode * for later replay activities. @@ -579,21 +541,13 @@ err_rele: return error; } -STATIC bool -xfs_bui_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return BUI_ITEM(lip)->bui_format.bui_id == intent_id; -} - /* Relog an intent item to push the log tail forward. 
*/ static struct xfs_log_item * -xfs_bui_item_relog( +xfs_bmap_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_bud_log_item *budp; struct xfs_bui_log_item *buip; struct xfs_map_extent *map; unsigned int count; @@ -601,27 +555,40 @@ xfs_bui_item_relog( count = BUI_ITEM(intent)->bui_format.bui_nextents; map = BUI_ITEM(intent)->bui_format.bui_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - budp = xfs_trans_get_bud(tp, BUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); - buip = xfs_bui_init(tp->t_mountp); memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map)); atomic_set(&buip->bui_next_extent, count); - xfs_trans_add_item(tp, &buip->bui_item); - set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); + return &buip->bui_item; } +const struct xfs_defer_op_type xfs_bmap_update_defer_type = { + .name = "bmap", + .max_items = XFS_BUI_MAX_FAST_EXTENTS, + .create_intent = xfs_bmap_update_create_intent, + .abort_intent = xfs_bmap_update_abort_intent, + .create_done = xfs_bmap_update_create_done, + .finish_item = xfs_bmap_update_finish_item, + .cancel_item = xfs_bmap_update_cancel_item, + .recover_work = xfs_bmap_recover_work, + .relog_intent = xfs_bmap_relog_intent, +}; + +STATIC bool +xfs_bui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return BUI_ITEM(lip)->bui_format.bui_id == intent_id; +} + static const struct xfs_item_ops xfs_bui_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_bui_item_size, .iop_format = xfs_bui_item_format, .iop_unpin = xfs_bui_item_unpin, .iop_release = xfs_bui_item_release, - .iop_recover = xfs_bui_item_recover, .iop_match = xfs_bui_item_match, - .iop_relog = xfs_bui_item_relog, }; static inline void @@ -681,12 +648,9 @@ xlog_recover_bui_commit_pass2( buip = xfs_bui_init(mp); xfs_bui_copy_format(&buip->bui_format, bui_formatp); atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. 
- */ - xfs_trans_ail_insert(log->l_ailp, &buip->bui_item, lsn); - xfs_bui_release(buip); + + xlog_recover_intent_item(log, &buip->bui_item, lsn, + &xfs_bmap_update_defer_type); return 0; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 731260a5af6d..c2531c28905c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -69,147 +69,6 @@ xfs_zero_extent( GFP_NOFS, 0); } -#ifdef CONFIG_XFS_RT -int -xfs_bmap_rtalloc( - struct xfs_bmalloca *ap) -{ - struct xfs_mount *mp = ap->ip->i_mount; - xfs_fileoff_t orig_offset = ap->offset; - xfs_rtxnum_t rtx; - xfs_rtxlen_t prod = 0; /* product factor for allocators */ - xfs_extlen_t mod = 0; /* product factor for allocators */ - xfs_rtxlen_t ralen = 0; /* realtime allocation length */ - xfs_extlen_t align; /* minimum allocation alignment */ - xfs_extlen_t orig_length = ap->length; - xfs_extlen_t minlen = mp->m_sb.sb_rextsize; - xfs_rtxlen_t raminlen; - bool rtlocked = false; - bool ignore_locality = false; - int error; - - align = xfs_get_extsz_hint(ap->ip); -retry: - prod = xfs_extlen_to_rtxlen(mp, align); - error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, - align, 1, ap->eof, 0, - ap->conv, &ap->offset, &ap->length); - if (error) - return error; - ASSERT(ap->length); - ASSERT(xfs_extlen_to_rtxmod(mp, ap->length) == 0); - - /* - * If we shifted the file offset downward to satisfy an extent size - * hint, increase minlen by that amount so that the allocator won't - * give us an allocation that's too short to cover at least one of the - * blocks that the caller asked for. - */ - if (ap->offset != orig_offset) - minlen += orig_offset - ap->offset; - - /* - * If the offset & length are not perfectly aligned - * then kill prod, it will just get us in trouble. - */ - div_u64_rem(ap->offset, align, &mod); - if (mod || ap->length % align) - prod = 1; - /* - * Set ralen to be the actual requested length in rtextents. - * - * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that - * we rounded up to it, cut it back so it's valid again. - * Note that if it's a really large request (bigger than - * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't - * adjust the starting point to match it. - */ - ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN)); - - /* - * Lock out modifications to both the RT bitmap and summary inodes - */ - if (!rtlocked) { - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); - xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); - xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); - rtlocked = true; - } - - /* - * If it's an allocation to an empty file at offset 0, - * pick an extent that will space things out in the rt area. - */ - if (ap->eof && ap->offset == 0) { - error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); - if (error) - return error; - ap->blkno = xfs_rtx_to_rtb(mp, rtx); - } else { - ap->blkno = 0; - } - - xfs_bmap_adjacent(ap); - - /* - * Realtime allocation, done through xfs_rtallocate_extent. 
- */ - if (ignore_locality) - rtx = 0; - else - rtx = xfs_rtb_to_rtx(mp, ap->blkno); - raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen)); - error = xfs_rtallocate_extent(ap->tp, rtx, raminlen, ralen, &ralen, - ap->wasdel, prod, &rtx); - if (error) - return error; - - if (rtx != NULLRTEXTNO) { - ap->blkno = xfs_rtx_to_rtb(mp, rtx); - ap->length = xfs_rtxlen_to_extlen(mp, ralen); - ap->ip->i_nblocks += ap->length; - xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); - if (ap->wasdel) - ap->ip->i_delayed_blks -= ap->length; - /* - * Adjust the disk quota also. This was reserved - * earlier. - */ - xfs_trans_mod_dquot_byino(ap->tp, ap->ip, - ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : - XFS_TRANS_DQ_RTBCOUNT, ap->length); - return 0; - } - - if (align > mp->m_sb.sb_rextsize) { - /* - * We previously enlarged the request length to try to satisfy - * an extent size hint. The allocator didn't return anything, - * so reset the parameters to the original values and try again - * without alignment criteria. - */ - ap->offset = orig_offset; - ap->length = orig_length; - minlen = align = mp->m_sb.sb_rextsize; - goto retry; - } - - if (!ignore_locality && ap->blkno != 0) { - /* - * If we can't allocate near a specific rt extent, try again - * without locality criteria. - */ - ignore_locality = true; - goto retry; - } - - ap->blkno = NULLFSBLOCK; - ap->length = 0; - return 0; -} -#endif /* CONFIG_XFS_RT */ - /* * Extent tree block counting routines. */ diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 6888078f5c31..77ecbb753ef2 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -47,7 +47,7 @@ int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz, int rt, int eof, int delay, int convert, xfs_fileoff_t *offp, xfs_extlen_t *lenp); -void xfs_bmap_adjacent(struct xfs_bmalloca *ap); +bool xfs_bmap_adjacent(struct xfs_bmalloca *ap); int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *rec, int *is_empty); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 669332849680..8e5bd50d29fe 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2049,6 +2049,14 @@ error_free: return NULL; } +static inline void +xfs_buf_list_del( + struct xfs_buf *bp) +{ + list_del_init(&bp->b_list); + wake_up_var(&bp->b_list); +} + /* * Cancel a delayed write list. * @@ -2066,7 +2074,7 @@ xfs_buf_delwri_cancel( xfs_buf_lock(bp); bp->b_flags &= ~_XBF_DELWRI_Q; - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); xfs_buf_relse(bp); } } @@ -2120,6 +2128,34 @@ xfs_buf_delwri_queue( } /* + * Queue a buffer to this delwri list as part of a data integrity operation. + * If the buffer is on any other delwri list, we'll wait for that to clear + * so that the caller can submit the buffer for IO and wait for the result. + * Callers must ensure the buffer is not already on the list. + */ +void +xfs_buf_delwri_queue_here( + struct xfs_buf *bp, + struct list_head *buffer_list) +{ + /* + * We need this buffer to end up on the /caller's/ delwri list, not any + * old list. This can happen if the buffer is marked stale (which + * clears DELWRI_Q) after the AIL queues the buffer to its list but + * before the AIL has a chance to submit the list. 
+ */ + while (!list_empty(&bp->b_list)) { + xfs_buf_unlock(bp); + wait_var_event(&bp->b_list, list_empty(&bp->b_list)); + xfs_buf_lock(bp); + } + + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + + xfs_buf_delwri_queue(bp, buffer_list); +} + +/* * Compare function is more complex than it needs to be because * the return value is only 32 bits and we are doing comparisons * on 64 bit values @@ -2181,7 +2217,7 @@ xfs_buf_delwri_submit_buffers( * reference and remove it from the list here. */ if (!(bp->b_flags & _XBF_DELWRI_Q)) { - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); xfs_buf_relse(bp); continue; } @@ -2201,7 +2237,7 @@ xfs_buf_delwri_submit_buffers( list_move_tail(&bp->b_list, wait_list); } else { bp->b_flags |= XBF_ASYNC; - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); } __xfs_buf_submit(bp, false); } @@ -2255,7 +2291,7 @@ xfs_buf_delwri_submit( while (!list_empty(&wait_list)) { bp = list_first_entry(&wait_list, struct xfs_buf, b_list); - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); /* * Wait on the locked buffer, check for errors and unlock and diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c86e16419656..b470de08a46c 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -319,6 +319,7 @@ extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ extern void xfs_buf_delwri_cancel(struct list_head *); extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); +void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 9f3ceb461515..cc6dc56f455d 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -18,6 +18,7 @@ #include "xfs_bmap.h" #include "xfs_trans.h" #include "xfs_error.h" +#include "xfs_health.h" /* * Directory file type support functions @@ -51,7 +52,7 @@ xfs_dir2_sf_getdents( struct xfs_mount *mp = dp->i_mount; xfs_dir2_dataptr_t off; /* current entry's offset */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; xfs_dir2_dataptr_t dot_offset; xfs_dir2_dataptr_t dotdot_offset; xfs_ino_t ino; @@ -59,9 +60,7 @@ xfs_dir2_sf_getdents( ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); /* * If the block number in the offset is out of range, we're done. 
@@ -519,6 +518,8 @@ xfs_readdir( if (xfs_is_shutdown(dp->i_mount)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 61a45a86ffe8..b4f20d9c8f98 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -172,14 +172,14 @@ xfs_qm_adjust_dqtimers( /* * initialize a buffer full of dquots and log the whole thing */ -STATIC void +void xfs_qm_init_dquot_blk( struct xfs_trans *tp, - struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, struct xfs_buf *bp) { + struct xfs_mount *mp = tp->t_mountp; struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_dqblk *d; xfs_dqid_t curid; @@ -353,7 +353,7 @@ xfs_dquot_disk_alloc( * Make a chunk of dquots out of this buffer and log * the entire thing. */ - xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, qtype, bp); + xfs_qm_init_dquot_blk(tp, dqp->q_id, qtype, bp); xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* @@ -1362,34 +1362,3 @@ xfs_qm_exit(void) kmem_cache_destroy(xfs_dqtrx_cache); kmem_cache_destroy(xfs_dquot_cache); } - -/* - * Iterate every dquot of a particular type. The caller must ensure that the - * particular quota type is active. iter_fn can return negative error codes, - * or -ECANCELED to indicate that it wants to stop iterating. - */ -int -xfs_qm_dqiterate( - struct xfs_mount *mp, - xfs_dqtype_t type, - xfs_qm_dqiterate_fn iter_fn, - void *priv) -{ - struct xfs_dquot *dq; - xfs_dqid_t id = 0; - int error; - - do { - error = xfs_qm_dqget_next(mp, id, type, &dq); - if (error == -ENOENT) - return 0; - if (error) - return error; - - error = iter_fn(dq, type, priv); - id = dq->q_id + 1; - xfs_qm_dqput(dq); - } while (error == 0 && id != 0); - - return error; -} diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 80c8f851a2f3..956272d9b302 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -234,12 +234,10 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) return dqp; } -typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, - xfs_dqtype_t type, void *priv); -int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type, - xfs_qm_dqiterate_fn iter_fn, void *priv); - time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout); time64_t xfs_dquot_set_grace_period(time64_t grace); +void xfs_qm_init_dquot_blk(struct xfs_trans *tp, xfs_dqid_t id, xfs_dqtype_t + type, struct xfs_buf *bp); + #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 9ecfdcdc752f..2ccde32c9a9e 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -678,3 +678,16 @@ xfs_extent_busy_ag_cmp( diff = b1->bno - b2->bno; return diff; } + +/* Are there any busy extents in this AG? 
*/ +bool +xfs_extent_busy_list_empty( + struct xfs_perag *pag) +{ + bool res; + + spin_lock(&pag->pagb_lock); + res = RB_EMPTY_ROOT(&pag->pagb_tree); + spin_unlock(&pag->pagb_lock); + return res; +} diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index 0639aab336f3..470032de3139 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -85,4 +85,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) list_sort(NULL, list, xfs_extent_busy_ag_cmp); } +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); + #endif /* __XFS_EXTENT_BUSY_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 3fa8789820ad..1d1185fca6a5 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -304,39 +304,6 @@ static const struct xfs_item_ops xfs_efd_item_ops = { }; /* - * Allocate an "extent free done" log item that will hold nextents worth of - * extents. The caller must use all nextents extents, because we are not - * flexible about this at all. - */ -static struct xfs_efd_log_item * -xfs_trans_get_efd( - struct xfs_trans *tp, - struct xfs_efi_log_item *efip, - unsigned int nextents) -{ - struct xfs_efd_log_item *efdp; - - ASSERT(nextents > 0); - - if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { - efdp = kzalloc(xfs_efd_log_item_sizeof(nextents), - GFP_KERNEL | __GFP_NOFAIL); - } else { - efdp = kmem_cache_zalloc(xfs_efd_cache, - GFP_KERNEL | __GFP_NOFAIL); - } - - xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, - &xfs_efd_item_ops); - efdp->efd_efip = efip; - efdp->efd_format.efd_nextents = nextents; - efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; - - xfs_trans_add_item(tp, &efdp->efd_item); - return efdp; -} - -/* * Fill the EFD with all extents from the EFI when we need to roll the * transaction and continue with a new EFI. * @@ -364,69 +331,6 @@ xfs_efd_from_efi( efdp->efd_next_extent = efip->efi_format.efi_nextents; } -/* - * Free an extent and log it to the EFD. Note that the transaction is marked - * dirty regardless of whether the extent free succeeds or fails to support the - * EFI/EFD lifecycle rules. - */ -static int -xfs_trans_free_extent( - struct xfs_trans *tp, - struct xfs_efd_log_item *efdp, - struct xfs_extent_free_item *xefi) -{ - struct xfs_owner_info oinfo = { }; - struct xfs_mount *mp = tp->t_mountp; - struct xfs_extent *extp; - uint next_extent; - xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, - xefi->xefi_startblock); - int error; - - oinfo.oi_owner = xefi->xefi_owner; - if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) - oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; - if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) - oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; - - trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0, - agbno, xefi->xefi_blockcount); - - error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, - xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, - xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the EFI and frees the EFD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); - - /* - * If we need a new transaction to make progress, the caller will log a - * new EFI with the current contents. It will also log an EFD to cancel - * the existing EFI, and so we need to copy all the unprocessed extents - * in this EFI to the EFD so this works correctly. 
- */ - if (error == -EAGAIN) { - xfs_efd_from_efi(efdp); - return error; - } - - next_extent = efdp->efd_next_extent; - ASSERT(next_extent < efdp->efd_format.efd_nextents); - extp = &(efdp->efd_format.efd_extents[next_extent]); - extp->ext_start = xefi->xefi_startblock; - extp->ext_len = xefi->xefi_blockcount; - efdp->efd_next_extent++; - - return error; -} - /* Sort bmap items by AG. */ static int xfs_extent_free_diff_items( @@ -453,9 +357,6 @@ xfs_extent_free_log_item( uint next_extent; struct xfs_extent *extp; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from @@ -481,7 +382,6 @@ xfs_extent_free_create_intent( ASSERT(count > 0); - xfs_trans_add_item(tp, &efip->efi_item); if (sort) list_sort(mp, items, xfs_extent_free_diff_items); list_for_each_entry(xefi, items, xefi_list) @@ -496,7 +396,26 @@ xfs_extent_free_create_done( struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item; + struct xfs_efi_log_item *efip = EFI_ITEM(intent); + struct xfs_efd_log_item *efdp; + + ASSERT(count > 0); + + if (count > XFS_EFD_MAX_FAST_EXTENTS) { + efdp = kzalloc(xfs_efd_log_item_sizeof(count), + GFP_KERNEL | __GFP_NOFAIL); + } else { + efdp = kmem_cache_zalloc(xfs_efd_cache, + GFP_KERNEL | __GFP_NOFAIL); + } + + xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, + &xfs_efd_item_ops); + efdp->efd_efip = efip; + efdp->efd_format.efd_nextents = count; + efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; + + return &efdp->efd_item; } /* Take a passive ref to the AG containing the space we're freeing. */ @@ -527,19 +446,49 @@ xfs_extent_free_finish_item( struct list_head *item, struct xfs_btree_cur **state) { + struct xfs_owner_info oinfo = { }; struct xfs_extent_free_item *xefi; - int error; + struct xfs_efd_log_item *efdp = EFD_ITEM(done); + struct xfs_mount *mp = tp->t_mountp; + struct xfs_extent *extp; + uint next_extent; + xfs_agblock_t agbno; + int error = 0; xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); + + oinfo.oi_owner = xefi->xefi_owner; + if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) + oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; + if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) + oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; - error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); + trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0, + agbno, xefi->xefi_blockcount); /* - * Don't free the XEFI if we need a new transaction to complete - * processing of it. + * If we need a new transaction to make progress, the caller will log a + * new EFI with the current contents. It will also log an EFD to cancel + * the existing EFI, and so we need to copy all the unprocessed extents + * in this EFI to the EFD so this works correctly. 
*/ - if (error == -EAGAIN) + if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) + error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, + xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, + xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); + if (error == -EAGAIN) { + xfs_efd_from_efi(efdp); return error; + } + + /* Add the work we finished to the EFD, even though nobody uses that */ + next_extent = efdp->efd_next_extent; + ASSERT(next_extent < efdp->efd_format.efd_nextents); + extp = &(efdp->efd_format.efd_extents[next_extent]); + extp->ext_start = xefi->xefi_startblock; + extp->ext_len = xefi->xefi_blockcount; + efdp->efd_next_extent++; xfs_extent_free_put_group(xefi); kmem_cache_free(xfs_extfree_item_cache, xefi); @@ -567,15 +516,6 @@ xfs_extent_free_cancel_item( kmem_cache_free(xfs_extfree_item_cache, xefi); } -const struct xfs_defer_op_type xfs_extent_free_defer_type = { - .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .create_intent = xfs_extent_free_create_intent, - .abort_intent = xfs_extent_free_abort_intent, - .create_done = xfs_extent_free_create_done, - .finish_item = xfs_extent_free_finish_item, - .cancel_item = xfs_extent_free_cancel_item, -}; - /* * AGFL blocks are accounted differently in the reserve pools and are not * inserted into the busy extent list. @@ -610,16 +550,6 @@ xfs_agfl_free_finish_item( error = xfs_free_agfl_block(tp, xefi->xefi_pag->pag_agno, agbno, agbp, &oinfo); - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the EFI and frees the EFD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); - next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); extp = &(efdp->efd_format.efd_extents[next_extent]); @@ -632,16 +562,6 @@ xfs_agfl_free_finish_item( return error; } -/* sub-type with special handling for AGFL deferred frees */ -const struct xfs_defer_op_type xfs_agfl_free_defer_type = { - .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .create_intent = xfs_extent_free_create_intent, - .abort_intent = xfs_extent_free_abort_intent, - .create_done = xfs_extent_free_create_done, - .finish_item = xfs_agfl_free_finish_item, - .cancel_item = xfs_extent_free_cancel_item, -}; - /* Is this recovered EFI ok? */ static inline bool xfs_efi_validate_ext( @@ -651,23 +571,41 @@ xfs_efi_validate_ext( return xfs_verify_fsbext(mp, extp->ext_start, extp->ext_len); } +static inline void +xfs_efi_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_extent *extp) +{ + struct xfs_extent_free_item *xefi; + + xefi = kmem_cache_zalloc(xfs_extfree_item_cache, + GFP_KERNEL | __GFP_NOFAIL); + xefi->xefi_startblock = extp->ext_start; + xefi->xefi_blockcount = extp->ext_len; + xefi->xefi_agresv = XFS_AG_RESV_NONE; + xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN; + xfs_extent_free_get_group(mp, xefi); + + xfs_defer_add_item(dfp, &xefi->xefi_list); +} + /* * Process an extent free intent item that was recovered from * the log. We need to free the extents that it describes. 
*/ STATIC int -xfs_efi_item_recover( - struct xfs_log_item *lip, +xfs_extent_free_recover_work( + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_efi_log_item *efip = EFI_ITEM(lip); struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_efd_log_item *efdp; struct xfs_trans *tp; int i; int error = 0; - bool requeue_only = false; /* * First check the validity of the extents described by the @@ -682,55 +620,22 @@ xfs_efi_item_recover( sizeof(efip->efi_format)); return -EFSCORRUPTED; } + + xfs_efi_recover_work(mp, dfp, &efip->efi_format.efi_extents[i]); } resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); error = xfs_trans_alloc(mp, &resv, 0, 0, 0, &tp); if (error) return error; - efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); - - for (i = 0; i < efip->efi_format.efi_nextents; i++) { - struct xfs_extent_free_item fake = { - .xefi_owner = XFS_RMAP_OWN_UNKNOWN, - .xefi_agresv = XFS_AG_RESV_NONE, - }; - struct xfs_extent *extp; - extp = &efip->efi_format.efi_extents[i]; - - fake.xefi_startblock = extp->ext_start; - fake.xefi_blockcount = extp->ext_len; - - if (!requeue_only) { - xfs_extent_free_get_group(mp, &fake); - error = xfs_trans_free_extent(tp, efdp, &fake); - xfs_extent_free_put_group(&fake); - } - - /* - * If we can't free the extent without potentially deadlocking, - * requeue the rest of the extents to a new so that they get - * run again later with a new transaction context. - */ - if (error == -EAGAIN || requeue_only) { - error = xfs_free_extent_later(tp, fake.xefi_startblock, - fake.xefi_blockcount, - &XFS_RMAP_OINFO_ANY_OWNER, - fake.xefi_agresv); - if (!error) { - requeue_only = true; - continue; - } - } - - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - extp, sizeof(*extp)); - if (error) - goto abort_error; - - } + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &efip->efi_format, + sizeof(efip->efi_format)); + if (error) + goto abort_error; return xfs_defer_ops_capture_and_commit(tp, capture_list); @@ -739,21 +644,14 @@ abort_error: return error; } -STATIC bool -xfs_efi_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return EFI_ITEM(lip)->efi_format.efi_id == intent_id; -} - /* Relog an intent item to push the log tail forward. 
*/ static struct xfs_log_item * -xfs_efi_item_relog( +xfs_extent_free_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_efd_log_item *efdp; + struct xfs_efd_log_item *efdp = EFD_ITEM(done_item); struct xfs_efi_log_item *efip; struct xfs_extent *extp; unsigned int count; @@ -761,29 +659,56 @@ xfs_efi_item_relog( count = EFI_ITEM(intent)->efi_format.efi_nextents; extp = EFI_ITEM(intent)->efi_format.efi_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count); efdp->efd_next_extent = count; memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp)); - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); efip = xfs_efi_init(tp->t_mountp, count); memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp)); atomic_set(&efip->efi_next_extent, count); - xfs_trans_add_item(tp, &efip->efi_item); - set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); + return &efip->efi_item; } +const struct xfs_defer_op_type xfs_extent_free_defer_type = { + .name = "extent_free", + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_extent_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, + .recover_work = xfs_extent_free_recover_work, + .relog_intent = xfs_extent_free_relog_intent, +}; + +/* sub-type with special handling for AGFL deferred frees */ +const struct xfs_defer_op_type xfs_agfl_free_defer_type = { + .name = "agfl_free", + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_agfl_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, + .recover_work = xfs_extent_free_recover_work, + .relog_intent = xfs_extent_free_relog_intent, +}; + +STATIC bool +xfs_efi_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return EFI_ITEM(lip)->efi_format.efi_id == intent_id; +} + static const struct xfs_item_ops xfs_efi_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_efi_item_size, .iop_format = xfs_efi_item_format, .iop_unpin = xfs_efi_item_unpin, .iop_release = xfs_efi_item_release, - .iop_recover = xfs_efi_item_recover, .iop_match = xfs_efi_item_match, - .iop_relog = xfs_efi_item_relog, }; /* @@ -820,12 +745,9 @@ xlog_recover_efi_commit_pass2( return error; } atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. 
- */ - xfs_trans_ail_insert(log->l_ailp, &efip->efi_item, lsn); - xfs_efi_release(efip); + + xlog_recover_intent_item(log, &efip->efi_item, lsn, + &xfs_extent_free_defer_type); return 0; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 57076a25f17d..83f708f62ed9 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -134,6 +134,10 @@ xfs_growfs_data_private( if (delta < 0 && nagcount < 2) return -EINVAL; + /* No work to do */ + if (delta == 0) + return 0; + oagcount = mp->m_sb.sb_agcount; /* allocate the new per-ag structures */ if (nagcount > oagcount) { @@ -153,7 +157,7 @@ xfs_growfs_data_private( error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, -delta, 0, 0, &tp); if (error) - return error; + goto out_free_unused_perag; last_pag = xfs_perag_get(mp, oagcount - 1); if (delta > 0) { @@ -227,6 +231,9 @@ xfs_growfs_data_private( out_trans_cancel: xfs_trans_cancel(tp); +out_free_unused_perag: + if (nagcount > oagcount) + xfs_free_unused_perag_range(mp, oagcount, nagcount); return error; } @@ -344,59 +351,20 @@ xfs_growfs_log( } /* - * exported through ioctl XFS_IOC_FSCOUNTS - */ - -void -xfs_fs_counts( - xfs_mount_t *mp, - xfs_fsop_counts_t *cnt) -{ - cnt->allocino = percpu_counter_read_positive(&mp->m_icount); - cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); - cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - xfs_fdblocks_unavailable(mp); - cnt->freertx = percpu_counter_read_positive(&mp->m_frextents); -} - -/* - * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS - * - * xfs_reserve_blocks is called to set m_resblks - * in the in-core mount table. The number of unused reserved blocks - * is kept in m_resblks_avail. - * * Reserve the requested number of blocks if available. Otherwise return * as many as possible to satisfy the request. The actual number - * reserved are returned in outval - * - * A null inval pointer indicates that only the current reserved blocks - * available should be returned no settings are changed. + * reserved are returned in outval. */ - int xfs_reserve_blocks( - xfs_mount_t *mp, - uint64_t *inval, - xfs_fsop_resblks_t *outval) + struct xfs_mount *mp, + uint64_t request) { int64_t lcounter, delta; int64_t fdblks_delta = 0; - uint64_t request; int64_t free; int error = 0; - /* If inval is null, report current values and return */ - if (inval == (uint64_t *)NULL) { - if (!outval) - return -EINVAL; - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; - return 0; - } - - request = *inval; - /* * With per-cpu counters, this becomes an interesting problem. 
we need * to work out if we are freeing or allocation blocks first, then we can @@ -466,11 +434,6 @@ xfs_reserve_blocks( spin_lock(&mp->m_sb_lock); } out: - if (outval) { - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; - } - spin_unlock(&mp->m_sb_lock); return error; } diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 2cffe51a31e8..44457b0a0593 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -6,14 +6,12 @@ #ifndef __XFS_FSOPS_H__ #define __XFS_FSOPS_H__ -extern int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); -extern int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); -extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); -extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval, - xfs_fsop_resblks_t *outval); -extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags); +int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); +int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); +int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request); +int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags); -extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); -extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); +int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); +int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 9edc1f2bc939..f18fec0adf66 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -44,4 +44,16 @@ struct xfs_globals xfs_globals = { .pwork_threads = -1, /* automatic thread detection */ .larp = false, /* log attribute replay */ #endif + + /* + * Leave this many record slots empty when bulk loading btrees. By + * default we load new btree leaf blocks 75% full. + */ + .bload_leaf_slack = -1, + + /* + * Leave this many key/ptr slots empty when bulk loading btrees. By + * default we load new btree node blocks 75% full. + */ + .bload_node_slack = -1, }; diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 72a075bb2c10..9a57afee9338 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -222,7 +222,7 @@ xfs_inode_mark_sick( struct xfs_inode *ip, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY)); + ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED))); trace_xfs_inode_mark_sick(ip, mask); spin_lock(&ip->i_flags_lock); @@ -246,7 +246,7 @@ xfs_inode_mark_healthy( struct xfs_inode *ip, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY)); + ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED))); trace_xfs_inode_mark_healthy(ip, mask); spin_lock(&ip->i_flags_lock); @@ -369,6 +369,10 @@ static const struct ioctl_sick_map ino_map[] = { { XFS_SICK_INO_XATTR, XFS_BS_SICK_XATTR }, { XFS_SICK_INO_SYMLINK, XFS_BS_SICK_SYMLINK }, { XFS_SICK_INO_PARENT, XFS_BS_SICK_PARENT }, + { XFS_SICK_INO_BMBTD_ZAPPED, XFS_BS_SICK_BMBTD }, + { XFS_SICK_INO_BMBTA_ZAPPED, XFS_BS_SICK_BMBTA }, + { XFS_SICK_INO_DIR_ZAPPED, XFS_BS_SICK_DIR }, + { XFS_SICK_INO_SYMLINK_ZAPPED, XFS_BS_SICK_SYMLINK }, { 0, 0 }, }; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c0f1c89786c2..1fd94958aa97 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -37,15 +37,10 @@ #include "xfs_reflink.h" #include "xfs_ag.h" #include "xfs_log_priv.h" +#include "xfs_health.h" struct kmem_cache *xfs_inode_cache; -/* - * Used in xfs_itruncate_extents(). 
This is the maximum number of extents - * freed from a file in a single transaction. - */ -#define XFS_ITRUNC_MAX_EXTENTS 2 - STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_inode *); @@ -661,6 +656,8 @@ xfs_lookup( if (xfs_is_shutdown(dp->i_mount)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); if (error) @@ -875,7 +872,7 @@ xfs_init_new_inode( case S_IFLNK: ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; ip->i_df.if_bytes = 0; - ip->i_df.if_u1.if_root = NULL; + ip->i_df.if_data = NULL; break; default: ASSERT(0); @@ -978,6 +975,8 @@ xfs_create( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; prid = xfs_get_initial_prid(dp); @@ -1217,6 +1216,8 @@ xfs_link( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(tdp, XFS_DATA_FORK)) + return -EIO; error = xfs_qm_dqattach(sip); if (error) @@ -1339,7 +1340,6 @@ xfs_itruncate_extents_flags( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; xfs_fileoff_t first_unmap_block; - xfs_filblks_t unmap_len; int error = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -1371,19 +1371,10 @@ xfs_itruncate_extents_flags( return 0; } - unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1; - while (unmap_len > 0) { - ASSERT(tp->t_highest_agno == NULLAGNUMBER); - error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, - flags, XFS_ITRUNC_MAX_EXTENTS); - if (error) - goto out; - - /* free the just unmapped extents */ - error = xfs_defer_finish(&tp); - if (error) - goto out; - } + error = xfs_bunmapi_range(&tp, ip, flags, first_unmap_block, + XFS_MAX_FILEOFF); + if (error) + goto out; if (whichfork == XFS_DATA_FORK) { /* Remove all pending CoW reservations. */ @@ -2387,8 +2378,8 @@ xfs_ifree( * already been freed by xfs_attr_inactive. */ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - kmem_free(ip->i_df.if_u1.if_data); - ip->i_df.if_u1.if_data = NULL; + kmem_free(ip->i_df.if_data); + ip->i_df.if_data = NULL; ip->i_df.if_bytes = 0; } @@ -2506,6 +2497,8 @@ xfs_remove( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; error = xfs_qm_dqattach(dp); if (error) @@ -3758,3 +3751,29 @@ xfs_inode_reload_unlinked( return error; } + +/* Has this inode fork been zapped by repair? 
*/ +bool +xfs_ifork_zapped( + const struct xfs_inode *ip, + int whichfork) +{ + unsigned int datamask = 0; + + switch (whichfork) { + case XFS_DATA_FORK: + switch (ip->i_vnode.i_mode & S_IFMT) { + case S_IFDIR: + datamask = XFS_SICK_INO_DIR_ZAPPED; + break; + case S_IFLNK: + datamask = XFS_SICK_INO_SYMLINK_ZAPPED; + break; + } + return ip->i_sick & (XFS_SICK_INO_BMBTD_ZAPPED | datamask); + case XFS_ATTR_FORK: + return ip->i_sick & XFS_SICK_INO_BMBTA_ZAPPED; + default: + return false; + } +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 3beb470f1892..97f63bacd4c2 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -622,4 +622,6 @@ xfs_inode_unlinked_incomplete( int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip); int xfs_inode_reload_unlinked(struct xfs_inode *ip); +bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index cd7803fda8b1..0aee97ba0be8 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -352,11 +352,10 @@ xfs_inode_item_format_data_fork( ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV); if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { - ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_df.if_data != NULL); ASSERT(ip->i_disk_size > 0); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, - ip->i_df.if_u1.if_data, - ip->i_df.if_bytes); + ip->i_df.if_data, ip->i_df.if_bytes); ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes; ilf->ilf_size++; } else { @@ -431,10 +430,9 @@ xfs_inode_item_format_attr_fork( if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_af.if_bytes > 0) { - ASSERT(ip->i_af.if_u1.if_data != NULL); + ASSERT(ip->i_af.if_data != NULL); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, - ip->i_af.if_u1.if_data, - ip->i_af.if_bytes); + ip->i_af.if_data, ip->i_af.if_bytes); ilf->ilf_asize = (unsigned)ip->i_af.if_bytes; ilf->ilf_size++; } else { @@ -557,6 +555,9 @@ xfs_inode_to_log_dinode( memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_v3_pad = 0; + + /* dummy value for initialisation */ + to->di_crc = 0; } else { to->di_version = 2; to->di_flushiter = ip->i_flushiter; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 6c3919687ea6..f02b6e558af5 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1872,6 +1872,63 @@ xfs_fs_eofblocks_from_user( return 0; } +static int +xfs_ioctl_getset_resblocks( + struct file *filp, + unsigned int cmd, + void __user *arg) +{ + struct xfs_mount *mp = XFS_I(file_inode(filp))->i_mount; + struct xfs_fsop_resblks fsop = { }; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (cmd == XFS_IOC_SET_RESBLKS) { + if (xfs_is_readonly(mp)) + return -EROFS; + + if (copy_from_user(&fsop, arg, sizeof(fsop))) + return -EFAULT; + + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_reserve_blocks(mp, fsop.resblks); + mnt_drop_write_file(filp); + if (error) + return error; + } + + spin_lock(&mp->m_sb_lock); + fsop.resblks = mp->m_resblks; + fsop.resblks_avail = mp->m_resblks_avail; + spin_unlock(&mp->m_sb_lock); + + if (copy_to_user(arg, &fsop, sizeof(fsop))) + return -EFAULT; + return 0; +} + +static int +xfs_ioctl_fs_counts( + struct xfs_mount *mp, + struct xfs_fsop_counts __user *uarg) +{ + struct xfs_fsop_counts out = { + .allocino = percpu_counter_read_positive(&mp->m_icount), + .freeino = percpu_counter_read_positive(&mp->m_ifree), + 
.freedata = percpu_counter_read_positive(&mp->m_fdblocks) - + xfs_fdblocks_unavailable(mp), + .freertx = percpu_counter_read_positive(&mp->m_frextents), + }; + + if (copy_to_user(uarg, &out, sizeof(out))) + return -EFAULT; + return 0; +} + /* * These long-unused ioctls were removed from the official ioctl API in 5.17, * but retain these definitions so that we can log warnings about them. @@ -2008,60 +2065,12 @@ xfs_file_ioctl( return error; } - case XFS_IOC_FSCOUNTS: { - xfs_fsop_counts_t out; + case XFS_IOC_FSCOUNTS: + return xfs_ioctl_fs_counts(mp, arg); - xfs_fs_counts(mp, &out); - - if (copy_to_user(arg, &out, sizeof(out))) - return -EFAULT; - return 0; - } - - case XFS_IOC_SET_RESBLKS: { - xfs_fsop_resblks_t inout; - uint64_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (xfs_is_readonly(mp)) - return -EROFS; - - if (copy_from_user(&inout, arg, sizeof(inout))) - return -EFAULT; - - error = mnt_want_write_file(filp); - if (error) - return error; - - /* input parameter is passed in resblks field of structure */ - in = inout.resblks; - error = xfs_reserve_blocks(mp, &in, &inout); - mnt_drop_write_file(filp); - if (error) - return error; - - if (copy_to_user(arg, &inout, sizeof(inout))) - return -EFAULT; - return 0; - } - - case XFS_IOC_GET_RESBLKS: { - xfs_fsop_resblks_t out; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - error = xfs_reserve_blocks(mp, NULL, &out); - if (error) - return error; - - if (copy_to_user(arg, &out, sizeof(out))) - return -EFAULT; - - return 0; - } + case XFS_IOC_SET_RESBLKS: + case XFS_IOC_GET_RESBLKS: + return xfs_ioctl_getset_resblocks(filp, cmd, arg); case XFS_IOC_FSGROWFSDATA: { struct xfs_growfs_data in; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ee206facf0dc..a1650fc81382 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1542,6 +1542,7 @@ xlog_alloc_log( log->l_covered_state = XLOG_STATE_COVER_IDLE; set_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); + INIT_LIST_HEAD(&log->r_dfops); log->l_prev_block = -1; /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index fa3ad1d7b31c..e30c06ec20e3 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -407,6 +407,7 @@ struct xlog { long l_opstate; /* operational state */ uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ struct list_head *l_buf_cancel_table; + struct list_head r_dfops; /* recovered log intent items */ int l_iclog_hsize; /* size of iclog header */ int l_iclog_heads; /* # of iclog header sectors */ uint l_sectBBsize; /* sector size in BBs (2^n) */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a1e18b24971a..1251c81e55f9 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1723,30 +1723,24 @@ xlog_clear_stale_blocks( */ void xlog_recover_release_intent( - struct xlog *log, - unsigned short intent_type, - uint64_t intent_id) + struct xlog *log, + unsigned short intent_type, + uint64_t intent_id) { - struct xfs_ail_cursor cur; - struct xfs_log_item *lip; - struct xfs_ail *ailp = log->l_ailp; + struct xfs_defer_pending *dfp, *n; + + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + struct xfs_log_item *lip = dfp->dfp_intent; - spin_lock(&ailp->ail_lock); - for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; - lip = xfs_trans_ail_cursor_next(ailp, &cur)) { if (lip->li_type != intent_type) continue; if (!lip->li_ops->iop_match(lip, intent_id)) continue; - 
spin_unlock(&ailp->ail_lock); - lip->li_ops->iop_release(lip); - spin_lock(&ailp->ail_lock); - break; - } + ASSERT(xlog_item_is_intent(lip)); - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); + xfs_defer_cancel_recovery(log->l_mp, dfp); + } } int @@ -1939,6 +1933,29 @@ xlog_buf_readahead( xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops); } +/* + * Create a deferred work structure for resuming and tracking the progress of a + * log intent item that was found during recovery. + */ +void +xlog_recover_intent_item( + struct xlog *log, + struct xfs_log_item *lip, + xfs_lsn_t lsn, + const struct xfs_defer_op_type *ops) +{ + ASSERT(xlog_item_is_intent(lip)); + + xfs_defer_start_recovery(lip, &log->r_dfops, ops); + + /* + * Insert the intent into the AIL directly and drop one reference so + * that finishing or canceling the work will drop the other. + */ + xfs_trans_ail_insert(log->l_ailp, lip, lsn); + lip->li_ops->iop_unpin(lip, 0); +} + STATIC int xlog_recover_items_pass2( struct xlog *log, @@ -2533,36 +2550,26 @@ xlog_abort_defer_ops( */ STATIC int xlog_recover_process_intents( - struct xlog *log) + struct xlog *log) { LIST_HEAD(capture_list); - struct xfs_ail_cursor cur; - struct xfs_log_item *lip; - struct xfs_ail *ailp; - int error = 0; + struct xfs_defer_pending *dfp, *n; + int error = 0; #if defined(DEBUG) || defined(XFS_WARN) - xfs_lsn_t last_lsn; -#endif + xfs_lsn_t last_lsn; - ailp = log->l_ailp; - spin_lock(&ailp->ail_lock); -#if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); #endif - for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - lip != NULL; - lip = xfs_trans_ail_cursor_next(ailp, &cur)) { - const struct xfs_item_ops *ops; - if (!xlog_item_is_intent(lip)) - break; + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + ASSERT(xlog_item_is_intent(dfp->dfp_intent)); /* * We should never see a redo item with a LSN higher than * the last transaction we found in the log at the start * of recovery. */ - ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0); + ASSERT(XFS_LSN_CMP(last_lsn, dfp->dfp_intent->li_lsn) >= 0); /* * NOTE: If your intent processing routine can create more @@ -2571,21 +2578,14 @@ xlog_recover_process_intents( * replayed in the wrong order! * * The recovery function can free the log item, so we must not - * access lip after it returns. + * access dfp->dfp_intent after it returns. It must dispose of + * @dfp if it returns 0. 
*/ - spin_unlock(&ailp->ail_lock); - ops = lip->li_ops; - error = ops->iop_recover(lip, &capture_list); - spin_lock(&ailp->ail_lock); - if (error) { - trace_xlog_intent_recovery_failed(log->l_mp, error, - ops->iop_recover); + error = xfs_defer_finish_recovery(log->l_mp, dfp, + &capture_list); + if (error) break; - } } - - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); if (error) goto err; @@ -2606,27 +2606,34 @@ err: */ STATIC void xlog_recover_cancel_intents( - struct xlog *log) + struct xlog *log) { - struct xfs_log_item *lip; - struct xfs_ail_cursor cur; - struct xfs_ail *ailp; - - ailp = log->l_ailp; - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - while (lip != NULL) { - if (!xlog_item_is_intent(lip)) - break; + struct xfs_defer_pending *dfp, *n; - spin_unlock(&ailp->ail_lock); - lip->li_ops->iop_release(lip); - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_next(ailp, &cur); + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + ASSERT(xlog_item_is_intent(dfp->dfp_intent)); + + xfs_defer_cancel_recovery(log->l_mp, dfp); } +} - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); +/* + * Transfer ownership of the recovered pending work to the recovery transaction + * and try to finish the work. If there is more work to be done, the dfp will + * remain attached to the transaction. If not, the dfp is freed. + */ +int +xlog_recover_finish_intent( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + int error; + + list_move(&dfp->dfp_list, &tp->t_dfops); + error = xfs_defer_finish_one(tp, dfp); + if (error == -EAGAIN) + return 0; + return error; } /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index aed5be5508fe..aabb25dc3efa 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -637,7 +637,6 @@ xfs_mountfs( struct xfs_sb *sbp = &(mp->m_sb); struct xfs_inode *rip; struct xfs_ino_geometry *igeo = M_IGEO(mp); - uint64_t resblks; uint quotamount = 0; uint quotaflags = 0; int error = 0; @@ -974,8 +973,7 @@ xfs_mountfs( * we were already there on the last unmount. Warn if this occurs. */ if (!xfs_is_readonly(mp)) { - resblks = xfs_default_resblks(mp); - error = xfs_reserve_blocks(mp, &resblks, NULL); + error = xfs_reserve_blocks(mp, xfs_default_resblks(mp)); if (error) xfs_warn(mp, "Unable to allocate reserve blocks. Continuing without reserve pool."); @@ -1053,7 +1051,6 @@ void xfs_unmountfs( struct xfs_mount *mp) { - uint64_t resblks; int error; /* @@ -1090,8 +1087,7 @@ xfs_unmountfs( * we only every apply deltas to the superblock and hence the incore * value does not matter.... */ - resblks = 0; - error = xfs_reserve_blocks(mp, &resblks, NULL); + error = xfs_reserve_blocks(mp, 0); if (error) xfs_warn(mp, "Unable to free reserved block pool. " "Freespace may not be correct on next mount."); diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index a7daa522e00f..fa50e5308292 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -22,6 +22,7 @@ #include <linux/mm.h> #include <linux/dax.h> +#include <linux/fs.h> struct xfs_failure_info { xfs_agblock_t startblock; @@ -73,10 +74,16 @@ xfs_dax_failure_fn( struct xfs_mount *mp = cur->bc_mp; struct xfs_inode *ip; struct xfs_failure_info *notify = data; + struct address_space *mapping; + pgoff_t pgoff; + unsigned long pgcnt; int error = 0; if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) { + /* Continue the query because this isn't a failure. 
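The recovery path above now drains a list of xfs_defer_pending items instead of walking the AIL, and xlog_recover_finish_intent() treats -EAGAIN as "work remains queued" rather than as failure. A standalone toy model of that contract, with hypothetical names and no kernel code:

/* Standalone toy model (not kernel code) of the contract used above:
 * finishing one step of a recovered intent may return -EAGAIN, which
 * means "keep the work queued", not "recovery failed". */
#include <errno.h>
#include <stdio.h>

struct pending {
	const char	*name;
	int		steps_left;	/* e.g. extents still to process */
};

/* One call of the (hypothetical) finish hook: do one unit of work. */
static int finish_one(struct pending *p)
{
	p->steps_left--;
	printf("  %s: %d step(s) left\n", p->name, p->steps_left);
	return p->steps_left > 0 ? -EAGAIN : 0;
}

/* Same shape as xlog_recover_finish_intent(): -EAGAIN is success. */
static int recover_intent(struct pending *p)
{
	int error = finish_one(p);

	if (error == -EAGAIN)
		return 0;
	return error;
}

int main(void)
{
	struct pending work[] = {
		{ "refcount intent",	2 },
		{ "rmap intent",	1 },
	};

	for (unsigned int i = 0; i < sizeof(work) / sizeof(work[0]); i++) {
		printf("recovering %s\n", work[i].name);
		while (work[i].steps_left > 0)
			if (recover_intent(&work[i]))
				return 1;
	}
	return 0;
}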
*/ + if (notify->mf_flags & MF_MEM_PRE_REMOVE) + return 0; notify->want_shutdown = true; return 0; } @@ -92,15 +99,61 @@ xfs_dax_failure_fn( return 0; } - error = mf_dax_kill_procs(VFS_I(ip)->i_mapping, - xfs_failure_pgoff(mp, rec, notify), - xfs_failure_pgcnt(mp, rec, notify), - notify->mf_flags); + mapping = VFS_I(ip)->i_mapping; + pgoff = xfs_failure_pgoff(mp, rec, notify); + pgcnt = xfs_failure_pgcnt(mp, rec, notify); + + /* Continue the rmap query if the inode isn't a dax file. */ + if (dax_mapping(mapping)) + error = mf_dax_kill_procs(mapping, pgoff, pgcnt, + notify->mf_flags); + + /* Invalidate the cache in dax pages. */ + if (notify->mf_flags & MF_MEM_PRE_REMOVE) + invalidate_inode_pages2_range(mapping, pgoff, + pgoff + pgcnt - 1); + xfs_irele(ip); return error; } static int +xfs_dax_notify_failure_freeze( + struct xfs_mount *mp) +{ + struct super_block *sb = mp->m_super; + int error; + + error = freeze_super(sb, FREEZE_HOLDER_KERNEL); + if (error) + xfs_emerg(mp, "already frozen by kernel, err=%d", error); + + return error; +} + +static void +xfs_dax_notify_failure_thaw( + struct xfs_mount *mp, + bool kernel_frozen) +{ + struct super_block *sb = mp->m_super; + int error; + + if (kernel_frozen) { + error = thaw_super(sb, FREEZE_HOLDER_KERNEL); + if (error) + xfs_emerg(mp, "still frozen after notify failure, err=%d", + error); + } + + /* + * Also thaw userspace call anyway because the device is about to be + * removed immediately. + */ + thaw_super(sb, FREEZE_HOLDER_USERSPACE); +} + +static int xfs_dax_notify_ddev_failure( struct xfs_mount *mp, xfs_daddr_t daddr, @@ -112,15 +165,29 @@ xfs_dax_notify_ddev_failure( struct xfs_btree_cur *cur = NULL; struct xfs_buf *agf_bp = NULL; int error = 0; + bool kernel_frozen = false; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr); xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1); xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno); + if (mf_flags & MF_MEM_PRE_REMOVE) { + xfs_info(mp, "Device is about to be removed!"); + /* + * Freeze fs to prevent new mappings from being created. + * - Keep going on if others already hold the kernel forzen. + * - Keep going on if other errors too because this device is + * starting to fail. + * - If kernel frozen state is hold successfully here, thaw it + * here as well at the end. + */ + kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0; + } + error = xfs_trans_alloc_empty(mp, &tp); if (error) - return error; + goto out; for (; agno <= end_agno; agno++) { struct xfs_rmap_irec ri_low = { }; @@ -165,11 +232,26 @@ xfs_dax_notify_ddev_failure( } xfs_trans_cancel(tp); - if (error || notify.want_shutdown) { + + /* + * Shutdown fs from a force umount in pre-remove case which won't fail, + * so errors can be ignored. Otherwise, shutdown the filesystem with + * CORRUPT flag if error occured or notify.want_shutdown was set during + * RMAP querying. + */ + if (mf_flags & MF_MEM_PRE_REMOVE) + xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); + else if (error || notify.want_shutdown) { xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); if (!error) error = -EFSCORRUPTED; } + +out: + /* Thaw the fs if it has been frozen before. 
*/ + if (mf_flags & MF_MEM_PRE_REMOVE) + xfs_dax_notify_failure_thaw(mp, kernel_frozen); + return error; } @@ -197,6 +279,14 @@ xfs_dax_notify_failure( if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev && mp->m_logdev_targp != mp->m_ddev_targp) { + /* + * In the pre-remove case the failure notification is attempting + * to trigger a force unmount. The expectation is that the + * device is still present, but its removal is in progress and + * can not be cancelled, proceed with accessing the log device. + */ + if (mf_flags & MF_MEM_PRE_REMOVE) + return 0; xfs_err(mp, "ondisk log corrupt, shutting down fs!"); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); return -EFSCORRUPTED; @@ -210,6 +300,12 @@ xfs_dax_notify_failure( ddev_start = mp->m_ddev_targp->bt_dax_part_off; ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1; + /* Notify failure on the whole device. */ + if (offset == 0 && len == U64_MAX) { + offset = ddev_start; + len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev); + } + /* Ignore the range out of filesystem area */ if (offset + len - 1 < ddev_start) return -ENXIO; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index dcc785fdd345..e0d56489f3b2 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -127,7 +127,10 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, } #define xfs_trans_dup_dqinfo(tp, tp2) #define xfs_trans_free_dqinfo(tp) -#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) do { } while (0) +static inline void xfs_trans_mod_dquot_byino(struct xfs_trans *tp, + struct xfs_inode *ip, uint field, int64_t delta) +{ +} #define xfs_trans_apply_dquot_deltas(tp) #define xfs_trans_unreserve_and_mod_dquots(tp) static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 2d4444d61e98..20ad8086da60 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -227,52 +227,6 @@ static const struct xfs_item_ops xfs_cud_item_ops = { .iop_intent = xfs_cud_item_intent, }; -static struct xfs_cud_log_item * -xfs_trans_get_cud( - struct xfs_trans *tp, - struct xfs_cui_log_item *cuip) -{ - struct xfs_cud_log_item *cudp; - - cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, - &xfs_cud_item_ops); - cudp->cud_cuip = cuip; - cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; - - xfs_trans_add_item(tp, &cudp->cud_item); - return cudp; -} - -/* - * Finish an refcount update and log it to the CUD. Note that the - * transaction is marked dirty regardless of whether the refcount - * update succeeds or fails to support the CUI/CUD lifecycle rules. - */ -static int -xfs_trans_log_finish_refcount_update( - struct xfs_trans *tp, - struct xfs_cud_log_item *cudp, - struct xfs_refcount_intent *ri, - struct xfs_btree_cur **pcur) -{ - int error; - - error = xfs_refcount_finish_one(tp, ri, pcur); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the CUI and frees the CUD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); - - return error; -} - /* Sort refcount intents by AG. 
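The xfs_quota.h hunk above swaps a do-nothing macro for an empty static inline so that !CONFIG_XFS_QUOTA builds still type-check every argument. A small self-contained illustration of the difference, using hypothetical names modelled on the real stub:

/* Standalone illustration of why the no-op quota stub above became a
 * static inline: the macro variant silently accepts nonsense arguments,
 * the inline variant keeps full type checking when the feature is
 * compiled out. */
#include <stdint.h>
#include <stdio.h>

struct trans;
struct inode;

#ifdef USE_MACRO_STUB
#define mod_dquot_byino(tp, ip, field, delta)	do { } while (0)
#else
static inline void mod_dquot_byino(struct trans *tp, struct inode *ip,
		unsigned int field, int64_t delta)
{
	/* nothing to do when quota support is compiled out */
}
#endif

int main(void)
{
	struct trans *tp = NULL;
	struct inode *ip = NULL;

	mod_dquot_byino(tp, ip, 1, -4);	/* fine either way */
	/* mod_dquot_byino(ip, tp, 1, -4);  -- swapped pointers: only the
	 * static inline variant would be diagnosed at compile time. */
	printf("stub called\n");
	return 0;
}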
*/ static int xfs_refcount_update_diff_items( @@ -318,9 +272,6 @@ xfs_refcount_update_log_item( uint next_extent; struct xfs_phys_extent *pmap; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from @@ -347,7 +298,6 @@ xfs_refcount_update_create_intent( ASSERT(count > 0); - xfs_trans_add_item(tp, &cuip->cui_item); if (sort) list_sort(mp, items, xfs_refcount_update_diff_items); list_for_each_entry(ri, items, ri_list) @@ -362,7 +312,16 @@ xfs_refcount_update_create_done( struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item; + struct xfs_cui_log_item *cuip = CUI_ITEM(intent); + struct xfs_cud_log_item *cudp; + + cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, + &xfs_cud_item_ops); + cudp->cud_cuip = cuip; + cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; + + return &cudp->cud_item; } /* Take a passive ref to the AG containing the space we're refcounting. */ @@ -397,10 +356,9 @@ xfs_refcount_update_finish_item( int error; ri = container_of(item, struct xfs_refcount_intent, ri_list); - error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri, - state); /* Did we run out of reservation? Requeue what we didn't finish. */ + error = xfs_refcount_finish_one(tp, ri, state); if (!error && ri->ri_blockcount > 0) { ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE || ri->ri_type == XFS_REFCOUNT_DECREASE); @@ -433,16 +391,6 @@ xfs_refcount_update_cancel_item( kmem_cache_free(xfs_refcount_intent_cache, ri); } -const struct xfs_defer_op_type xfs_refcount_update_defer_type = { - .max_items = XFS_CUI_MAX_FAST_EXTENTS, - .create_intent = xfs_refcount_update_create_intent, - .abort_intent = xfs_refcount_update_abort_intent, - .create_done = xfs_refcount_update_create_done, - .finish_item = xfs_refcount_update_finish_item, - .finish_cleanup = xfs_refcount_finish_one_cleanup, - .cancel_item = xfs_refcount_update_cancel_item, -}; - /* Is this recovered CUI ok? */ static inline bool xfs_cui_validate_phys( @@ -468,23 +416,38 @@ xfs_cui_validate_phys( return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len); } +static inline void +xfs_cui_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_phys_extent *pmap) +{ + struct xfs_refcount_intent *ri; + + ri = kmem_cache_alloc(xfs_refcount_intent_cache, + GFP_NOFS | __GFP_NOFAIL); + ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; + ri->ri_startblock = pmap->pe_startblock; + ri->ri_blockcount = pmap->pe_len; + xfs_refcount_update_get_group(mp, ri); + + xfs_defer_add_item(dfp, &ri->ri_list); +} + /* * Process a refcount update intent item that was recovered from the log. * We need to update the refcountbt. 
*/ STATIC int -xfs_cui_item_recover( - struct xfs_log_item *lip, +xfs_refcount_recover_work( + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_cui_log_item *cuip = CUI_ITEM(lip); - struct xfs_cud_log_item *cudp; struct xfs_trans *tp; - struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; - unsigned int refc_type; - bool requeue_only = false; int i; int error = 0; @@ -501,6 +464,8 @@ xfs_cui_item_recover( sizeof(cuip->cui_format)); return -EFSCORRUPTED; } + + xfs_cui_recover_work(mp, dfp, &cuip->cui_format.cui_extents[i]); } /* @@ -521,100 +486,28 @@ xfs_cui_item_recover( if (error) return error; - cudp = xfs_trans_get_cud(tp, cuip); - - for (i = 0; i < cuip->cui_format.cui_nextents; i++) { - struct xfs_refcount_intent fake = { }; - struct xfs_phys_extent *pmap; - - pmap = &cuip->cui_format.cui_extents[i]; - refc_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; - switch (refc_type) { - case XFS_REFCOUNT_INCREASE: - case XFS_REFCOUNT_DECREASE: - case XFS_REFCOUNT_ALLOC_COW: - case XFS_REFCOUNT_FREE_COW: - fake.ri_type = refc_type; - break; - default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &cuip->cui_format, - sizeof(cuip->cui_format)); - error = -EFSCORRUPTED; - goto abort_error; - } - - fake.ri_startblock = pmap->pe_startblock; - fake.ri_blockcount = pmap->pe_len; - - if (!requeue_only) { - xfs_refcount_update_get_group(mp, &fake); - error = xfs_trans_log_finish_refcount_update(tp, cudp, - &fake, &rcur); - xfs_refcount_update_put_group(&fake); - } - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &cuip->cui_format, - sizeof(cuip->cui_format)); - if (error) - goto abort_error; - - /* Requeue what we didn't finish. */ - if (fake.ri_blockcount > 0) { - struct xfs_bmbt_irec irec = { - .br_startblock = fake.ri_startblock, - .br_blockcount = fake.ri_blockcount, - }; - - switch (fake.ri_type) { - case XFS_REFCOUNT_INCREASE: - xfs_refcount_increase_extent(tp, &irec); - break; - case XFS_REFCOUNT_DECREASE: - xfs_refcount_decrease_extent(tp, &irec); - break; - case XFS_REFCOUNT_ALLOC_COW: - xfs_refcount_alloc_cow_extent(tp, - irec.br_startblock, - irec.br_blockcount); - break; - case XFS_REFCOUNT_FREE_COW: - xfs_refcount_free_cow_extent(tp, - irec.br_startblock, - irec.br_blockcount); - break; - default: - ASSERT(0); - } - requeue_only = true; - } - } + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &cuip->cui_format, + sizeof(cuip->cui_format)); + if (error) + goto abort_error; - xfs_refcount_finish_one_cleanup(tp, rcur, error); return xfs_defer_ops_capture_and_commit(tp, capture_list); abort_error: - xfs_refcount_finish_one_cleanup(tp, rcur, error); xfs_trans_cancel(tp); return error; } -STATIC bool -xfs_cui_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return CUI_ITEM(lip)->cui_format.cui_id == intent_id; -} - /* Relog an intent item to push the log tail forward. 
*/ static struct xfs_log_item * -xfs_cui_item_relog( +xfs_refcount_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_cud_log_item *cudp; struct xfs_cui_log_item *cuip; struct xfs_phys_extent *pmap; unsigned int count; @@ -622,27 +515,41 @@ xfs_cui_item_relog( count = CUI_ITEM(intent)->cui_format.cui_nextents; pmap = CUI_ITEM(intent)->cui_format.cui_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); - cuip = xfs_cui_init(tp->t_mountp, count); memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); atomic_set(&cuip->cui_next_extent, count); - xfs_trans_add_item(tp, &cuip->cui_item); - set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); + return &cuip->cui_item; } +const struct xfs_defer_op_type xfs_refcount_update_defer_type = { + .name = "refcount", + .max_items = XFS_CUI_MAX_FAST_EXTENTS, + .create_intent = xfs_refcount_update_create_intent, + .abort_intent = xfs_refcount_update_abort_intent, + .create_done = xfs_refcount_update_create_done, + .finish_item = xfs_refcount_update_finish_item, + .finish_cleanup = xfs_refcount_finish_one_cleanup, + .cancel_item = xfs_refcount_update_cancel_item, + .recover_work = xfs_refcount_recover_work, + .relog_intent = xfs_refcount_relog_intent, +}; + +STATIC bool +xfs_cui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return CUI_ITEM(lip)->cui_format.cui_id == intent_id; +} + static const struct xfs_item_ops xfs_cui_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_cui_item_size, .iop_format = xfs_cui_item_format, .iop_unpin = xfs_cui_item_unpin, .iop_release = xfs_cui_item_release, - .iop_recover = xfs_cui_item_recover, .iop_match = xfs_cui_item_match, - .iop_relog = xfs_cui_item_relog, }; static inline void @@ -696,12 +603,9 @@ xlog_recover_cui_commit_pass2( cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); xfs_cui_copy_format(&cuip->cui_format, cui_formatp); atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. 
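The defer op type table above now carries a name plus recover_work and relog_intent hooks, so generic log-recovery code can drive any work type through one interface. As a standalone sketch of that ops-table pattern, with hypothetical names rather than the kernel structures:

/* Standalone sketch (hypothetical names, not kernel code) of the
 * ops-table pattern used by the defer op types above: one static const
 * table per work type, dispatched through by code that never needs to
 * know the concrete type. */
#include <stdio.h>

struct work_ops {
	const char	*name;
	int		(*recover)(int units);	/* replay work found in the log */
	int		(*relog)(int units);	/* reinsert work to push the log tail */
};

static int demo_recover(int units)
{
	printf("recovering %d unit(s)\n", units);
	return 0;
}

static int demo_relog(int units)
{
	printf("relogging %d unit(s)\n", units);
	return 0;
}

static const struct work_ops demo_ops = {
	.name		= "demo",
	.recover	= demo_recover,
	.relog		= demo_relog,
};

static int run_recovery(const struct work_ops *ops, int units)
{
	printf("optype %s\n", ops->name);
	return ops->recover(units);
}

int main(void)
{
	return run_recovery(&demo_ops, 3);
}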
- */ - xfs_trans_ail_insert(log->l_ailp, &cuip->cui_item, lsn); - xfs_cui_release(cuip); + + xlog_recover_intent_item(log, &cuip->cui_item, lsn, + &xfs_refcount_update_defer_type); return 0; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e5b62dc28466..d5ca8bcae65b 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -618,7 +618,7 @@ xfs_reflink_cancel_cow_blocks( error = xfs_free_extent_later(*tpp, del.br_startblock, del.br_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) break; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 0e0e747028da..79ad0087aeca 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -225,23 +225,6 @@ static const struct xfs_item_ops xfs_rud_item_ops = { .iop_intent = xfs_rud_item_intent, }; -static struct xfs_rud_log_item * -xfs_trans_get_rud( - struct xfs_trans *tp, - struct xfs_rui_log_item *ruip) -{ - struct xfs_rud_log_item *rudp; - - rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, - &xfs_rud_item_ops); - rudp->rud_ruip = ruip; - rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; - - xfs_trans_add_item(tp, &rudp->rud_item); - return rudp; -} - /* Set the map extent flags for this reverse mapping. */ static void xfs_trans_set_rmap_flags( @@ -285,35 +268,6 @@ xfs_trans_set_rmap_flags( } } -/* - * Finish an rmap update and log it to the RUD. Note that the transaction is - * marked dirty regardless of whether the rmap update succeeds or fails to - * support the RUI/RUD lifecycle rules. - */ -static int -xfs_trans_log_finish_rmap_update( - struct xfs_trans *tp, - struct xfs_rud_log_item *rudp, - struct xfs_rmap_intent *ri, - struct xfs_btree_cur **pcur) -{ - int error; - - error = xfs_rmap_finish_one(tp, ri, pcur); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the RUI and frees the RUD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); - - return error; -} - /* Sort rmap intents by AG. */ static int xfs_rmap_update_diff_items( @@ -340,9 +294,6 @@ xfs_rmap_update_log_item( uint next_extent; struct xfs_map_extent *map; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from @@ -372,7 +323,6 @@ xfs_rmap_update_create_intent( ASSERT(count > 0); - xfs_trans_add_item(tp, &ruip->rui_item); if (sort) list_sort(mp, items, xfs_rmap_update_diff_items); list_for_each_entry(ri, items, ri_list) @@ -387,7 +337,16 @@ xfs_rmap_update_create_done( struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item; + struct xfs_rui_log_item *ruip = RUI_ITEM(intent); + struct xfs_rud_log_item *rudp; + + rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, + &xfs_rud_item_ops); + rudp->rud_ruip = ruip; + rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; + + return &rudp->rud_item; } /* Take a passive ref to the AG containing the space we're rmapping. 
*/ @@ -423,8 +382,7 @@ xfs_rmap_update_finish_item( ri = container_of(item, struct xfs_rmap_intent, ri_list); - error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri, - state); + error = xfs_rmap_finish_one(tp, ri, state); xfs_rmap_update_put_group(ri); kmem_cache_free(xfs_rmap_intent_cache, ri); @@ -452,16 +410,6 @@ xfs_rmap_update_cancel_item( kmem_cache_free(xfs_rmap_intent_cache, ri); } -const struct xfs_defer_op_type xfs_rmap_update_defer_type = { - .max_items = XFS_RUI_MAX_FAST_EXTENTS, - .create_intent = xfs_rmap_update_create_intent, - .abort_intent = xfs_rmap_update_abort_intent, - .create_done = xfs_rmap_update_create_done, - .finish_item = xfs_rmap_update_finish_item, - .finish_cleanup = xfs_rmap_finish_one_cleanup, - .cancel_item = xfs_rmap_update_cancel_item, -}; - /* Is this recovered RUI ok? */ static inline bool xfs_rui_validate_map( @@ -498,20 +446,72 @@ xfs_rui_validate_map( return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } +static inline void +xfs_rui_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + const struct xfs_map_extent *map) +{ + struct xfs_rmap_intent *ri; + + ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); + + switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + case XFS_RMAP_EXTENT_MAP: + ri->ri_type = XFS_RMAP_MAP; + break; + case XFS_RMAP_EXTENT_MAP_SHARED: + ri->ri_type = XFS_RMAP_MAP_SHARED; + break; + case XFS_RMAP_EXTENT_UNMAP: + ri->ri_type = XFS_RMAP_UNMAP; + break; + case XFS_RMAP_EXTENT_UNMAP_SHARED: + ri->ri_type = XFS_RMAP_UNMAP_SHARED; + break; + case XFS_RMAP_EXTENT_CONVERT: + ri->ri_type = XFS_RMAP_CONVERT; + break; + case XFS_RMAP_EXTENT_CONVERT_SHARED: + ri->ri_type = XFS_RMAP_CONVERT_SHARED; + break; + case XFS_RMAP_EXTENT_ALLOC: + ri->ri_type = XFS_RMAP_ALLOC; + break; + case XFS_RMAP_EXTENT_FREE: + ri->ri_type = XFS_RMAP_FREE; + break; + default: + ASSERT(0); + return; + } + + ri->ri_owner = map->me_owner; + ri->ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + ri->ri_bmap.br_startblock = map->me_startblock; + ri->ri_bmap.br_startoff = map->me_startoff; + ri->ri_bmap.br_blockcount = map->me_len; + ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + xfs_rmap_update_get_group(mp, ri); + + xfs_defer_add_item(dfp, &ri->ri_list); +} + /* * Process an rmap update intent item that was recovered from the log. * We need to update the rmapbt. 
*/ STATIC int -xfs_rui_item_recover( - struct xfs_log_item *lip, +xfs_rmap_recover_work( + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_rui_log_item *ruip = RUI_ITEM(lip); - struct xfs_rud_log_item *rudp; struct xfs_trans *tp; - struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; int i; int error = 0; @@ -529,6 +529,8 @@ xfs_rui_item_recover( sizeof(ruip->rui_format)); return -EFSCORRUPTED; } + + xfs_rui_recover_work(mp, dfp, &ruip->rui_format.rui_extents[i]); } resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); @@ -536,91 +538,29 @@ xfs_rui_item_recover( XFS_TRANS_RESERVE, &tp); if (error) return error; - rudp = xfs_trans_get_rud(tp, ruip); - for (i = 0; i < ruip->rui_format.rui_nextents; i++) { - struct xfs_rmap_intent fake = { }; - struct xfs_map_extent *map; - - map = &ruip->rui_format.rui_extents[i]; - switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { - case XFS_RMAP_EXTENT_MAP: - fake.ri_type = XFS_RMAP_MAP; - break; - case XFS_RMAP_EXTENT_MAP_SHARED: - fake.ri_type = XFS_RMAP_MAP_SHARED; - break; - case XFS_RMAP_EXTENT_UNMAP: - fake.ri_type = XFS_RMAP_UNMAP; - break; - case XFS_RMAP_EXTENT_UNMAP_SHARED: - fake.ri_type = XFS_RMAP_UNMAP_SHARED; - break; - case XFS_RMAP_EXTENT_CONVERT: - fake.ri_type = XFS_RMAP_CONVERT; - break; - case XFS_RMAP_EXTENT_CONVERT_SHARED: - fake.ri_type = XFS_RMAP_CONVERT_SHARED; - break; - case XFS_RMAP_EXTENT_ALLOC: - fake.ri_type = XFS_RMAP_ALLOC; - break; - case XFS_RMAP_EXTENT_FREE: - fake.ri_type = XFS_RMAP_FREE; - break; - default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &ruip->rui_format, - sizeof(ruip->rui_format)); - error = -EFSCORRUPTED; - goto abort_error; - } - - fake.ri_owner = map->me_owner; - fake.ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - fake.ri_bmap.br_startblock = map->me_startblock; - fake.ri_bmap.br_startoff = map->me_startoff; - fake.ri_bmap.br_blockcount = map->me_len; - fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - - xfs_rmap_update_get_group(mp, &fake); - error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake, - &rcur); - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - map, sizeof(*map)); - xfs_rmap_update_put_group(&fake); - if (error) - goto abort_error; - - } + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &ruip->rui_format, + sizeof(ruip->rui_format)); + if (error) + goto abort_error; - xfs_rmap_finish_one_cleanup(tp, rcur, error); return xfs_defer_ops_capture_and_commit(tp, capture_list); abort_error: - xfs_rmap_finish_one_cleanup(tp, rcur, error); xfs_trans_cancel(tp); return error; } -STATIC bool -xfs_rui_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return RUI_ITEM(lip)->rui_format.rui_id == intent_id; -} - /* Relog an intent item to push the log tail forward. 
*/ static struct xfs_log_item * -xfs_rui_item_relog( +xfs_rmap_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_rud_log_item *rudp; struct xfs_rui_log_item *ruip; struct xfs_map_extent *map; unsigned int count; @@ -628,27 +568,41 @@ xfs_rui_item_relog( count = RUI_ITEM(intent)->rui_format.rui_nextents; map = RUI_ITEM(intent)->rui_format.rui_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); - ruip = xfs_rui_init(tp->t_mountp, count); memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); atomic_set(&ruip->rui_next_extent, count); - xfs_trans_add_item(tp, &ruip->rui_item); - set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); + return &ruip->rui_item; } +const struct xfs_defer_op_type xfs_rmap_update_defer_type = { + .name = "rmap", + .max_items = XFS_RUI_MAX_FAST_EXTENTS, + .create_intent = xfs_rmap_update_create_intent, + .abort_intent = xfs_rmap_update_abort_intent, + .create_done = xfs_rmap_update_create_done, + .finish_item = xfs_rmap_update_finish_item, + .finish_cleanup = xfs_rmap_finish_one_cleanup, + .cancel_item = xfs_rmap_update_cancel_item, + .recover_work = xfs_rmap_recover_work, + .relog_intent = xfs_rmap_relog_intent, +}; + +STATIC bool +xfs_rui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return RUI_ITEM(lip)->rui_format.rui_id == intent_id; +} + static const struct xfs_item_ops xfs_rui_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_rui_item_size, .iop_format = xfs_rui_item_format, .iop_unpin = xfs_rui_item_unpin, .iop_release = xfs_rui_item_release, - .iop_recover = xfs_rui_item_recover, .iop_match = xfs_rui_item_match, - .iop_relog = xfs_rui_item_relog, }; static inline void @@ -702,12 +656,9 @@ xlog_recover_rui_commit_pass2( ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); xfs_rui_copy_format(&ruip->rui_format, rui_formatp); atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. - */ - xfs_trans_ail_insert(log->l_ailp, &ruip->rui_item, lsn); - xfs_rui_release(ruip); + + xlog_recover_intent_item(log, &ruip->rui_item, lsn, + &xfs_rmap_update_defer_type); return 0; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 88c48de5c9c8..8649d981a097 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -14,28 +14,14 @@ #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bmap_btree.h" +#include "xfs_bmap_util.h" #include "xfs_trans.h" #include "xfs_trans_space.h" #include "xfs_icache.h" #include "xfs_rtalloc.h" #include "xfs_sb.h" #include "xfs_rtbitmap.h" - -/* - * Read and return the summary information for a given extent size, - * bitmap block combination. - * Keeps track of a current summary block, so we don't keep reading - * it from the buffer cache. - */ -static int -xfs_rtget_summary( - struct xfs_rtalloc_args *args, - int log, /* log2 of extent size */ - xfs_fileoff_t bbno, /* bitmap block number */ - xfs_suminfo_t *sum) /* out: summary info for this block */ -{ - return xfs_rtmodify_summary_int(args, log, bbno, 0, sum); -} +#include "xfs_quota.h" /* * Return whether there are any free extents in the size range given @@ -154,56 +140,55 @@ xfs_rtallocate_range( * properly update the summary. 
*/ error = xfs_rtfind_back(args, start, 0, &preblock); - if (error) { + if (error) return error; - } + /* * Find the next allocated block (end of free extent). */ error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1, &postblock); - if (error) { + if (error) return error; - } + /* * Decrement the summary information corresponding to the entire * (old) free extent. */ error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(postblock + 1 - preblock), + xfs_highbit64(postblock + 1 - preblock), xfs_rtx_to_rbmblock(mp, preblock), -1); - if (error) { + if (error) return error; - } + /* * If there are blocks not being allocated at the front of the * old extent, add summary data for them to be free. */ if (preblock < start) { error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(start - preblock), + xfs_highbit64(start - preblock), xfs_rtx_to_rbmblock(mp, preblock), 1); - if (error) { + if (error) return error; - } } + /* * If there are blocks not being allocated at the end of the * old extent, add summary data for them to be free. */ if (postblock > end) { error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(postblock - end), + xfs_highbit64(postblock - end), xfs_rtx_to_rbmblock(mp, end + 1), 1); - if (error) { + if (error) return error; - } } + /* * Modify the bitmap to mark this extent allocated. */ - error = xfs_rtmodify_range(args, start, len, 0); - return error; + return xfs_rtmodify_range(args, start, len, 0); } /* @@ -265,21 +250,17 @@ xfs_rtallocate_extent_block( * If it's not so then next will contain the first non-free. */ error = xfs_rtcheck_range(args, i, maxlen, 1, &next, &stat); - if (error) { + if (error) return error; - } if (stat) { /* * i for maxlen is all free, allocate and return that. */ - error = xfs_rtallocate_range(args, i, maxlen); - if (error) { - return error; - } - *len = maxlen; - *rtx = i; - return 0; + bestlen = maxlen; + besti = i; + goto allocate; } + /* * In the case where we have a variable-sized allocation * request, figure out how big this free piece is, @@ -298,45 +279,44 @@ xfs_rtallocate_extent_block( /* * If not done yet, find the start of the next free space. */ - if (next < end) { - error = xfs_rtfind_forw(args, next, end, &i); - if (error) { - return error; - } - } else + if (next >= end) break; + error = xfs_rtfind_forw(args, next, end, &i); + if (error) + return error; } + /* * Searched the whole thing & didn't find a maxlen free extent. */ - if (minlen < maxlen && besti != -1) { - xfs_rtxlen_t p; /* amount to trim length by */ - + if (minlen > maxlen || besti == -1) { /* - * If size should be a multiple of prod, make that so. + * Allocation failed. Set *nextp to the next block to try. */ - if (prod > 1) { - div_u64_rem(bestlen, prod, &p); - if (p) - bestlen -= p; - } + *nextp = next; + return -ENOSPC; + } - /* - * Allocate besti for bestlen & return that. - */ - error = xfs_rtallocate_range(args, besti, bestlen); - if (error) { - return error; - } - *len = bestlen; - *rtx = besti; - return 0; + /* + * If size should be a multiple of prod, make that so. + */ + if (prod > 1) { + xfs_rtxlen_t p; /* amount to trim length by */ + + div_u64_rem(bestlen, prod, &p); + if (p) + bestlen -= p; } + /* - * Allocation failed. Set *nextp to the next block to try. + * Allocate besti for bestlen & return that. 
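xfs_rtallocate_range() above adjusts up to three summary counters when it carves an allocation out of a larger free extent, each indexed by the log2 of an extent length, which is why the code now uses xfs_highbit64() directly. A standalone worked example of which size buckets change, ignoring the per-bitmap-block index for brevity:

/* Standalone worked example (not kernel code): allocating [start, end]
 * out of the free run [preblock, postblock] removes one summary count
 * for the whole old extent and adds one for each leftover piece. */
#include <stdio.h>
#include <stdint.h>

static int log2_len(uint64_t len)
{
	return 63 - __builtin_clzll(len);	/* highest set bit, len > 0 */
}

int main(void)
{
	uint64_t preblock = 100, postblock = 163;	/* free run of 64 rtextents */
	uint64_t start = 120, end = 135;		/* allocating 16 rtextents */

	printf("summary[%d]--  (whole old free extent, len %llu)\n",
	       log2_len(postblock + 1 - preblock),
	       (unsigned long long)(postblock + 1 - preblock));
	if (preblock < start)
		printf("summary[%d]++  (front remainder, len %llu)\n",
		       log2_len(start - preblock),
		       (unsigned long long)(start - preblock));
	if (postblock > end)
		printf("summary[%d]++  (tail remainder, len %llu)\n",
		       log2_len(postblock - end),
		       (unsigned long long)(postblock - end));
	return 0;
}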
*/ - *nextp = next; - *rtx = NULLRTEXTNO; +allocate: + error = xfs_rtallocate_range(args, besti, bestlen); + if (error) + return error; + *len = bestlen; + *rtx = besti; return 0; } @@ -367,52 +347,33 @@ xfs_rtallocate_extent_exact( * Check if the range in question (for maxlen) is free. */ error = xfs_rtcheck_range(args, start, maxlen, 1, &next, &isfree); - if (error) { + if (error) return error; - } - if (isfree) { + + if (!isfree) { /* - * If it is, allocate it and return success. + * If not, allocate what there is, if it's at least minlen. */ - error = xfs_rtallocate_range(args, start, maxlen); - if (error) { - return error; - } - *len = maxlen; - *rtx = start; - return 0; - } - /* - * If not, allocate what there is, if it's at least minlen. - */ - maxlen = next - start; - if (maxlen < minlen) { + maxlen = next - start; + if (maxlen < minlen) + return -ENOSPC; + /* - * Failed, return failure status. + * Trim off tail of extent, if prod is specified. */ - *rtx = NULLRTEXTNO; - return 0; - } - /* - * Trim off tail of extent, if prod is specified. - */ - if (prod > 1 && (i = maxlen % prod)) { - maxlen -= i; - if (maxlen < minlen) { - /* - * Now we can't do it, return failure status. - */ - *rtx = NULLRTEXTNO; - return 0; + if (prod > 1 && (i = maxlen % prod)) { + maxlen -= i; + if (maxlen < minlen) + return -ENOSPC; } } + /* * Allocate what we can and return it. */ error = xfs_rtallocate_range(args, start, maxlen); - if (error) { + if (error) return error; - } *len = maxlen; *rtx = start; return 0; @@ -441,7 +402,6 @@ xfs_rtallocate_extent_near( int j; /* secondary loop control */ int log2len; /* log2 of minlen */ xfs_rtxnum_t n; /* next rtext to try */ - xfs_rtxnum_t r; /* result rtext */ ASSERT(minlen % prod == 0); ASSERT(maxlen % prod == 0); @@ -455,26 +415,18 @@ xfs_rtallocate_extent_near( /* Make sure we don't run off the end of the rt volume. */ maxlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod); - if (maxlen < minlen) { - *rtx = NULLRTEXTNO; - return 0; - } + if (maxlen < minlen) + return -ENOSPC; /* * Try the exact allocation first. */ error = xfs_rtallocate_extent_exact(args, start, minlen, maxlen, len, - prod, &r); - if (error) { + prod, rtx); + if (error != -ENOSPC) return error; - } - /* - * If the exact allocation worked, return that. - */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } + + bbno = xfs_rtx_to_rbmblock(mp, start); i = 0; j = -1; @@ -490,9 +442,9 @@ xfs_rtallocate_extent_near( */ error = xfs_rtany_summary(args, log2len, mp->m_rsumlevels - 1, bbno + i, &maxlog); - if (error) { + if (error) return error; - } + /* * If there are any useful extents starting here, try * allocating one. @@ -511,17 +463,9 @@ xfs_rtallocate_extent_near( */ error = xfs_rtallocate_extent_block(args, bbno + i, minlen, maxavail, len, - &n, prod, &r); - if (error) { + &n, prod, rtx); + if (error != -ENOSPC) return error; - } - /* - * If it worked, return it. - */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } } /* * On the negative side of the starting location. @@ -555,17 +499,9 @@ xfs_rtallocate_extent_near( error = xfs_rtallocate_extent_block(args, bbno + j, minlen, maxavail, len, &n, prod, - &r); - if (error) { + rtx); + if (error != -ENOSPC) return error; - } - /* - * If it works, return the extent. 
- */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } } } } @@ -599,8 +535,53 @@ xfs_rtallocate_extent_near( else break; } - *rtx = NULLRTEXTNO; - return 0; + return -ENOSPC; +} + +static int +xfs_rtalloc_sumlevel( + struct xfs_rtalloc_args *args, + int l, /* level number */ + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxlen_t *len, /* out: actual length allocated */ + xfs_rtxnum_t *rtx) /* out: start rtext allocated */ +{ + xfs_fileoff_t i; /* bitmap block number */ + + for (i = 0; i < args->mp->m_sb.sb_rbmblocks; i++) { + xfs_suminfo_t sum; /* summary information for extents */ + xfs_rtxnum_t n; /* next rtext to be tried */ + int error; + + error = xfs_rtget_summary(args, l, i, &sum); + if (error) + return error; + + /* + * Nothing there, on to the next block. + */ + if (!sum) + continue; + + /* + * Try allocating the extent. + */ + error = xfs_rtallocate_extent_block(args, i, minlen, maxlen, + len, &n, prod, rtx); + if (error != -ENOSPC) + return error; + + /* + * If the "next block to try" returned from the allocator is + * beyond the next bitmap block, skip to that bitmap block. + */ + if (xfs_rtx_to_rbmblock(args->mp, n) > i + 1) + i = xfs_rtx_to_rbmblock(args->mp, n) - 1; + } + + return -ENOSPC; } /* @@ -617,13 +598,8 @@ xfs_rtallocate_extent_size( xfs_rtxlen_t prod, /* extent product factor */ xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { - struct xfs_mount *mp = args->mp; int error; - xfs_fileoff_t i; /* bitmap block number */ int l; /* level number (loop control) */ - xfs_rtxnum_t n; /* next rtext to be tried */ - xfs_rtxnum_t r; /* result rtext number */ - xfs_suminfo_t sum; /* summary information for extents */ ASSERT(minlen % prod == 0); ASSERT(maxlen % prod == 0); @@ -631,119 +607,46 @@ xfs_rtallocate_extent_size( /* * Loop over all the levels starting with maxlen. - * At each level, look at all the bitmap blocks, to see if there - * are extents starting there that are long enough (>= maxlen). - * Note, only on the initial level can the allocation fail if - * the summary says there's an extent. + * + * At each level, look at all the bitmap blocks, to see if there are + * extents starting there that are long enough (>= maxlen). + * + * Note, only on the initial level can the allocation fail if the + * summary says there's an extent. */ - for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) { - /* - * Loop over all the bitmap blocks. - */ - for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { - /* - * Get the summary for this level/block. - */ - error = xfs_rtget_summary(args, l, i, &sum); - if (error) { - return error; - } - /* - * Nothing there, on to the next block. - */ - if (!sum) - continue; - /* - * Try allocating the extent. - */ - error = xfs_rtallocate_extent_block(args, i, maxlen, - maxlen, len, &n, prod, &r); - if (error) { - return error; - } - /* - * If it worked, return that. - */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } - /* - * If the "next block to try" returned from the - * allocator is beyond the next bitmap block, - * skip to that bitmap block. - */ - if (xfs_rtx_to_rbmblock(mp, n) > i + 1) - i = xfs_rtx_to_rbmblock(mp, n) - 1; - } + for (l = xfs_highbit32(maxlen); l < args->mp->m_rsumlevels; l++) { + error = xfs_rtalloc_sumlevel(args, l, minlen, maxlen, prod, len, + rtx); + if (error != -ENOSPC) + return error; } + /* - * Didn't find any maxlen blocks. 
Try smaller ones, unless - * we're asking for a fixed size extent. + * Didn't find any maxlen blocks. Try smaller ones, unless we are + * looking for a fixed size extent. */ - if (minlen > --maxlen) { - *rtx = NULLRTEXTNO; - return 0; - } + if (minlen > --maxlen) + return -ENOSPC; ASSERT(minlen != 0); ASSERT(maxlen != 0); /* * Loop over sizes, from maxlen down to minlen. - * This time, when we do the allocations, allow smaller ones - * to succeed. + * + * This time, when we do the allocations, allow smaller ones to succeed, + * but make sure the specified minlen/maxlen are in the possible range + * for this summary level. */ for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) { - /* - * Loop over all the bitmap blocks, try an allocation - * starting in that block. - */ - for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { - /* - * Get the summary information for this level/block. - */ - error = xfs_rtget_summary(args, l, i, &sum); - if (error) { - return error; - } - /* - * If nothing there, go on to next. - */ - if (!sum) - continue; - /* - * Try the allocation. Make sure the specified - * minlen/maxlen are in the possible range for - * this summary level. - */ - error = xfs_rtallocate_extent_block(args, i, - XFS_RTMAX(minlen, 1 << l), - XFS_RTMIN(maxlen, (1 << (l + 1)) - 1), - len, &n, prod, &r); - if (error) { - return error; - } - /* - * If it worked, return that extent. - */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } - /* - * If the "next block to try" returned from the - * allocator is beyond the next bitmap block, - * skip to that bitmap block. - */ - if (xfs_rtx_to_rbmblock(mp, n) > i + 1) - i = xfs_rtx_to_rbmblock(mp, n) - 1; - } + error = xfs_rtalloc_sumlevel(args, l, + max_t(xfs_rtxlen_t, minlen, 1 << l), + min_t(xfs_rtxlen_t, maxlen, (1 << (l + 1)) - 1), + prod, len, rtx); + if (error != -ENOSPC) + return error; } - /* - * Got nothing, return failure. - */ - *rtx = NULLRTEXTNO; - return 0; + + return -ENOSPC; } /* @@ -963,8 +866,10 @@ xfs_growfs_rt( */ nrextents = nrblocks; do_div(nrextents, in->extsize); + if (!xfs_validate_rtextents(nrextents)) + return -EINVAL; nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents); - nrextslog = xfs_highbit32(nrextents); + nrextslog = xfs_compute_rextslog(nrextents); nrsumlevels = nrextslog + 1; nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nrbmblocks); nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); @@ -1031,11 +936,14 @@ xfs_growfs_rt( nsbp->sb_rblocks = min(nrblocks, nrblocks_step); nsbp->sb_rextents = xfs_rtb_to_rtx(nmp, nsbp->sb_rblocks); ASSERT(nsbp->sb_rextents != 0); - nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); + nsbp->sb_rextslog = xfs_compute_rextslog(nsbp->sb_rextents); nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nsbp->sb_rbmblocks); nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); + /* recompute growfsrt reservation from new rsumsize */ + xfs_trans_resv_calc(nmp, &nmp->m_resv); + /* * Start a transaction, get the log reservation. */ @@ -1122,6 +1030,8 @@ error_cancel: */ mp->m_rsumlevels = nrsumlevels; mp->m_rsumsize = nrsumsize; + /* recompute growfsrt reservation from new rsumsize */ + xfs_trans_resv_calc(mp, &mp->m_resv); error = xfs_trans_commit(tp); if (error) @@ -1160,81 +1070,6 @@ out_free: } /* - * Allocate an extent in the realtime subvolume, with the usual allocation - * parameters. The length units are all in realtime extents, as is the - * result block number. 
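The new xfs_rtalloc_sumlevel() helper above walks one summary level per call; level l tracks free extents of length 2^l to 2^(l+1)-1, which is why the fallback pass clamps minlen and maxlen into that range. A standalone illustration of the level ranges a request would visit, with example numbers only:

/* Standalone illustration (not kernel code) of the summary-level search
 * above: the first pass starts at log2(maxlen), where any recorded
 * extent is big enough, and the fallback pass walks levels down to
 * log2(minlen) with the request clamped into each level's size range. */
#include <stdio.h>

static int highbit32(unsigned int v)
{
	return 31 - __builtin_clz(v);	/* v > 0 */
}

int main(void)
{
	unsigned int minlen = 3, maxlen = 20;	/* rtextents, example values */

	printf("first pass: levels %d and up (any extent there >= maxlen)\n",
	       highbit32(maxlen));

	for (int l = highbit32(maxlen); l >= highbit32(minlen); l--) {
		unsigned int lo = minlen > (1u << l) ? minlen : (1u << l);
		unsigned int hi = maxlen < ((1u << (l + 1)) - 1) ?
				  maxlen : ((1u << (l + 1)) - 1);

		printf("fallback level %d: try lengths %u..%u\n", l, lo, hi);
	}
	return 0;
}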
- */ -int -xfs_rtallocate_extent( - struct xfs_trans *tp, - xfs_rtxnum_t start, /* starting rtext number to allocate */ - xfs_rtxlen_t minlen, /* minimum length to allocate */ - xfs_rtxlen_t maxlen, /* maximum length to allocate */ - xfs_rtxlen_t *len, /* out: actual length allocated */ - int wasdel, /* was a delayed allocation extent */ - xfs_rtxlen_t prod, /* extent product factor */ - xfs_rtxnum_t *rtblock) /* out: start rtext allocated */ -{ - struct xfs_rtalloc_args args = { - .mp = tp->t_mountp, - .tp = tp, - }; - int error; /* error value */ - xfs_rtxnum_t r; /* result allocated rtext */ - - ASSERT(xfs_isilocked(args.mp->m_rbmip, XFS_ILOCK_EXCL)); - ASSERT(minlen > 0 && minlen <= maxlen); - - /* - * If prod is set then figure out what to do to minlen and maxlen. - */ - if (prod > 1) { - xfs_rtxlen_t i; - - if ((i = maxlen % prod)) - maxlen -= i; - if ((i = minlen % prod)) - minlen += prod - i; - if (maxlen < minlen) { - *rtblock = NULLRTEXTNO; - return 0; - } - } - -retry: - if (start == 0) { - error = xfs_rtallocate_extent_size(&args, minlen, - maxlen, len, prod, &r); - } else { - error = xfs_rtallocate_extent_near(&args, start, minlen, - maxlen, len, prod, &r); - } - - xfs_rtbuf_cache_relse(&args); - if (error) - return error; - - /* - * If it worked, update the superblock. - */ - if (r != NULLRTEXTNO) { - long slen = (long)*len; - - ASSERT(*len >= minlen && *len <= maxlen); - if (wasdel) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen); - else - xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen); - } else if (prod > 1) { - prod = 1; - goto retry; - } - - *rtblock = r; - return 0; -} - -/* * Initialize realtime fields in the mount structure. */ int /* error */ @@ -1412,7 +1247,7 @@ xfs_rtunmount_inodes( * of rtextents and the fraction. * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... */ -int /* error */ +static int xfs_rtpick_extent( xfs_mount_t *mp, /* file system mount point */ xfs_trans_t *tp, /* transaction pointer */ @@ -1451,3 +1286,177 @@ xfs_rtpick_extent( *pick = b; return 0; } + +static void +xfs_rtalloc_align_minmax( + xfs_rtxlen_t *raminlen, + xfs_rtxlen_t *ramaxlen, + xfs_rtxlen_t *prod) +{ + xfs_rtxlen_t newmaxlen = *ramaxlen; + xfs_rtxlen_t newminlen = *raminlen; + xfs_rtxlen_t slack; + + slack = newmaxlen % *prod; + if (slack) + newmaxlen -= slack; + slack = newminlen % *prod; + if (slack) + newminlen += *prod - slack; + + /* + * If adjusting for extent size hint alignment produces an invalid + * min/max len combination, go ahead without it. 
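The comment retained above describes xfs_rtpick_extent()'s fraction sequence 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... The sketch below reproduces that sequence in isolation; the closed form is an assumption chosen to match the documented values, and the kernel derives its sequence number from the rt bitmap inode's atime rather than from a loop counter:

/* Standalone illustration of the fraction sequence documented above:
 * 0, 1/2, 1/4, 3/4, 1/8, 3/8, 5/8, 7/8, 1/16, ...  Scaled by the number
 * of rtextents it spreads new files across the rt volume. */
#include <stdio.h>
#include <stdint.h>

static double pick_fraction(uint64_t seq)
{
	int log2seq;

	if (seq == 0)
		return 0.0;
	log2seq = 63 - __builtin_clzll(seq);		/* floor(log2(seq)) */
	return (double)(2 * (seq - (1ULL << log2seq)) + 1) /
	       (double)(1ULL << (log2seq + 1));
}

int main(void)
{
	uint64_t rextents = 1024;	/* hypothetical rt volume size */

	for (uint64_t seq = 0; seq < 9; seq++)
		printf("seq %llu -> start at rtext %4.0f (fraction %g)\n",
		       (unsigned long long)seq,
		       pick_fraction(seq) * rextents, pick_fraction(seq));
	return 0;
}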
+ */ + if (newmaxlen < newminlen) { + *prod = 1; + return; + } + *ramaxlen = newmaxlen; + *raminlen = newminlen; +} + +int +xfs_bmap_rtalloc( + struct xfs_bmalloca *ap) +{ + struct xfs_mount *mp = ap->ip->i_mount; + xfs_fileoff_t orig_offset = ap->offset; + xfs_rtxnum_t start; /* allocation hint rtextent no */ + xfs_rtxnum_t rtx; /* actually allocated rtextent no */ + xfs_rtxlen_t prod = 0; /* product factor for allocators */ + xfs_extlen_t mod = 0; /* product factor for allocators */ + xfs_rtxlen_t ralen = 0; /* realtime allocation length */ + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_extlen_t orig_length = ap->length; + xfs_extlen_t minlen = mp->m_sb.sb_rextsize; + xfs_rtxlen_t raminlen; + bool rtlocked = false; + bool ignore_locality = false; + struct xfs_rtalloc_args args = { + .mp = mp, + .tp = ap->tp, + }; + int error; + + align = xfs_get_extsz_hint(ap->ip); +retry: + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, + align, 1, ap->eof, 0, + ap->conv, &ap->offset, &ap->length); + if (error) + return error; + ASSERT(ap->length); + ASSERT(xfs_extlen_to_rtxmod(mp, ap->length) == 0); + + /* + * If we shifted the file offset downward to satisfy an extent size + * hint, increase minlen by that amount so that the allocator won't + * give us an allocation that's too short to cover at least one of the + * blocks that the caller asked for. + */ + if (ap->offset != orig_offset) + minlen += orig_offset - ap->offset; + + /* + * Set ralen to be the actual requested length in rtextents. + * + * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that + * we rounded up to it, cut it back so it's valid again. + * Note that if it's a really large request (bigger than + * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't + * adjust the starting point to match it. + */ + ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN)); + raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen)); + ASSERT(raminlen > 0); + ASSERT(raminlen <= ralen); + + /* + * Lock out modifications to both the RT bitmap and summary inodes + */ + if (!rtlocked) { + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); + xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); + xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); + rtlocked = true; + } + + if (ignore_locality) { + start = 0; + } else if (xfs_bmap_adjacent(ap)) { + start = xfs_rtb_to_rtx(mp, ap->blkno); + } else if (ap->eof && ap->offset == 0) { + /* + * If it's an allocation to an empty file at offset 0, pick an + * extent that will space things out in the rt area. + */ + error = xfs_rtpick_extent(mp, ap->tp, ralen, &start); + if (error) + return error; + } else { + start = 0; + } + + /* + * Only bother calculating a real prod factor if offset & length are + * perfectly aligned, otherwise it will just get us in trouble. + */ + div_u64_rem(ap->offset, align, &mod); + if (mod || ap->length % align) { + prod = 1; + } else { + prod = xfs_extlen_to_rtxlen(mp, align); + if (prod > 1) + xfs_rtalloc_align_minmax(&raminlen, &ralen, &prod); + } + + if (start) { + error = xfs_rtallocate_extent_near(&args, start, raminlen, + ralen, &ralen, prod, &rtx); + } else { + error = xfs_rtallocate_extent_size(&args, raminlen, + ralen, &ralen, prod, &rtx); + } + xfs_rtbuf_cache_relse(&args); + + if (error == -ENOSPC) { + if (align > mp->m_sb.sb_rextsize) { + /* + * We previously enlarged the request length to try to + * satisfy an extent size hint. 
The allocator didn't + * return anything, so reset the parameters to the + * original values and try again without alignment + * criteria. + */ + ap->offset = orig_offset; + ap->length = orig_length; + minlen = align = mp->m_sb.sb_rextsize; + goto retry; + } + + if (!ignore_locality && start != 0) { + /* + * If we can't allocate near a specific rt extent, try + * again without locality criteria. + */ + ignore_locality = true; + goto retry; + } + + ap->blkno = NULLFSBLOCK; + ap->length = 0; + return 0; + } + if (error) + return error; + + xfs_trans_mod_sb(ap->tp, ap->wasdel ? + XFS_TRANS_SB_RES_FREXTENTS : XFS_TRANS_SB_FREXTENTS, + -(long)ralen); + ap->blkno = xfs_rtx_to_rtb(mp, rtx); + ap->length = xfs_rtxlen_to_extlen(mp, ralen); + xfs_bmap_alloc_account(ap); + return 0; +} diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index f7cb9ffe51ca..a6836da9bebe 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -13,27 +13,6 @@ struct xfs_trans; #ifdef CONFIG_XFS_RT /* - * Function prototypes for exported functions. - */ - -/* - * Allocate an extent in the realtime subvolume, with the usual allocation - * parameters. The length units are all in realtime extents, as is the - * result block number. - */ -int /* error */ -xfs_rtallocate_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_rtxnum_t start, /* starting rtext number to allocate */ - xfs_rtxlen_t minlen, /* minimum length to allocate */ - xfs_rtxlen_t maxlen, /* maximum length to allocate */ - xfs_rtxlen_t *len, /* out: actual length allocated */ - int wasdel, /* was a delayed allocation extent */ - xfs_rtxlen_t prod, /* extent product factor */ - xfs_rtxnum_t *rtblock); /* out: start rtext allocated */ - - -/* * Initialize realtime fields in the mount structure. */ int /* error */ @@ -52,20 +31,6 @@ xfs_rtmount_inodes( struct xfs_mount *mp); /* file system mount structure */ /* - * Pick an extent for allocation at the start of a new realtime file. - * Use the sequence number stored in the atime field of the bitmap inode. - * Translate this to a fraction of the rtextents, and return the product - * of rtextents and the fraction. - * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... - */ -int /* error */ -xfs_rtpick_extent( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_rtxlen_t len, /* allocation length (rtextents) */ - xfs_rtxnum_t *pick); /* result rt extent */ - -/* * Grow the realtime area of the filesystem. 
*/ int @@ -75,8 +40,6 @@ xfs_growfs_rt( int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp); #else -# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (-ENOSYS) -# define xfs_rtpick_extent(m,t,l,rb) (-ENOSYS) # define xfs_growfs_rt(mp,in) (-ENOSYS) # define xfs_rtalloc_reinit_frextents(m) (0) static inline int /* error */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 07857d967ee8..aff20ddd4a9f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -894,10 +894,8 @@ xfs_fs_statfs( STATIC void xfs_save_resvblks(struct xfs_mount *mp) { - uint64_t resblks = 0; - mp->m_resblks_save = mp->m_resblks; - xfs_reserve_blocks(mp, &resblks, NULL); + xfs_reserve_blocks(mp, 0); } STATIC void @@ -911,7 +909,7 @@ xfs_restore_resvblks(struct xfs_mount *mp) } else resblks = xfs_default_resblks(mp); - xfs_reserve_blocks(mp, &resblks, NULL); + xfs_reserve_blocks(mp, resblks); } /* diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 85e433df6a3f..92974a4414c8 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -23,6 +23,7 @@ #include "xfs_trans.h" #include "xfs_ialloc.h" #include "xfs_error.h" +#include "xfs_health.h" /* ----- Kernel only functions below ----- */ int @@ -108,6 +109,8 @@ xfs_readlink( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(ip, XFS_DATA_FORK)) + return -EIO; xfs_ilock(ip, XFS_ILOCK_SHARED); @@ -128,10 +131,10 @@ xfs_readlink( * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED * if if_data is junk. */ - if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_u1.if_data)) + if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_data)) goto out; - memcpy(link, ip->i_df.if_u1.if_data, pathlen + 1); + memcpy(link, ip->i_df.if_data, pathlen + 1); error = 0; } else { error = xfs_readlink_bmap_ilocked(ip, link); diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index f78ad6b10ea5..276696a07040 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -85,6 +85,8 @@ struct xfs_globals { int pwork_threads; /* parallel workqueue threads */ bool larp; /* log attribute replay */ #endif + int bload_leaf_slack; /* btree bulk load leaf slack */ + int bload_node_slack; /* btree bulk load node slack */ int log_recovery_delay; /* log recovery delay (secs) */ int mount_delay; /* mount setup delay (secs) */ bool bug_on_assert; /* BUG() the kernel on assert failure */ diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index a3c6b1548723..17485666b672 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -229,6 +229,15 @@ pwork_threads_show( } XFS_SYSFS_ATTR_RW(pwork_threads); +/* + * The "LARP" (Logged extended Attribute Recovery Persistence) debugging knob + * sets the XFS_DA_OP_LOGGED flag on all xfs_attr_set operations performed on + * V5 filesystems. As a result, the intermediate progress of all setxattr and + * removexattr operations are tracked via the log and can be restarted during + * recovery. This is useful for testing xattr recovery prior to merging of the + * parent pointer feature which requires it to maintain consistency, and may be + * enabled for userspace xattrs in the future. 
+ */ static ssize_t larp_store( struct kobject *kobject, @@ -253,6 +262,58 @@ larp_show( XFS_SYSFS_ATTR_RW(larp); #endif /* DEBUG */ +STATIC ssize_t +bload_leaf_slack_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + xfs_globals.bload_leaf_slack = val; + return count; +} + +STATIC ssize_t +bload_leaf_slack_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_leaf_slack); +} +XFS_SYSFS_ATTR_RW(bload_leaf_slack); + +STATIC ssize_t +bload_node_slack_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + xfs_globals.bload_node_slack = val; + return count; +} + +STATIC ssize_t +bload_node_slack_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_node_slack); +} +XFS_SYSFS_ATTR_RW(bload_node_slack); + static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), @@ -262,6 +323,8 @@ static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(pwork_threads), ATTR_LIST(larp), #endif + ATTR_LIST(bload_leaf_slack), + ATTR_LIST(bload_node_slack), NULL, }; ATTRIBUTE_GROUPS(xfs_dbg); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 3926cf7f2a6e..0984a1c884c7 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -67,6 +67,7 @@ struct xfs_buf_log_format; struct xfs_inode_log_format; struct xfs_bmbt_irec; struct xfs_btree_cur; +struct xfs_defer_op_type; struct xfs_refcount_irec; struct xfs_fsmap; struct xfs_rmap_irec; @@ -145,21 +146,23 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list); DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list); TRACE_EVENT(xlog_intent_recovery_failed, - TP_PROTO(struct xfs_mount *mp, int error, void *function), - TP_ARGS(mp, error, function), + TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops, + int error), + TP_ARGS(mp, ops, error), TP_STRUCT__entry( __field(dev_t, dev) + __string(name, ops->name) __field(int, error) - __field(void *, function) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; + __assign_str(name, ops->name); __entry->error = error; - __entry->function = function; ), - TP_printk("dev %d:%d error %d function %pS", + TP_printk("dev %d:%d optype %s error %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->error, __entry->function) + __get_str(name), + __entry->error) ); DECLARE_EVENT_CLASS(xfs_perag_class, @@ -2549,22 +2552,25 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, TP_ARGS(mp, dfp), TP_STRUCT__entry( __field(dev_t, dev) - __field(int, type) + __string(name, dfp->dfp_ops->name) __field(void *, intent) + __field(unsigned int, flags) __field(char, committed) __field(int, nr) ), TP_fast_assign( __entry->dev = mp ? 
mp->m_super->s_dev : 0; - __entry->type = dfp->dfp_type; + __assign_str(name, dfp->dfp_ops->name); __entry->intent = dfp->dfp_intent; + __entry->flags = dfp->dfp_flags; __entry->committed = dfp->dfp_done != NULL; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p committed %d nr %d", + TP_printk("dev %d:%d optype %s intent %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __get_str(name), __entry->intent, + __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), __entry->committed, __entry->nr) ) @@ -2675,6 +2681,9 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_isolate_paused); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause); #define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer); @@ -2688,25 +2697,28 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, TP_ARGS(mp, dfp, item), TP_STRUCT__entry( __field(dev_t, dev) - __field(int, type) + __string(name, dfp->dfp_ops->name) __field(void *, intent) __field(void *, item) __field(char, committed) + __field(unsigned int, flags) __field(int, nr) ), TP_fast_assign( __entry->dev = mp ? mp->m_super->s_dev : 0; - __entry->type = dfp->dfp_type; + __assign_str(name, dfp->dfp_ops->name); __entry->intent = dfp->dfp_intent; __entry->item = item; __entry->committed = dfp->dfp_done != NULL; + __entry->flags = dfp->dfp_flags; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p item %p committed %d nr %d", + TP_printk("dev %d:%d optype %s intent %p item %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __get_str(name), __entry->intent, __entry->item, + __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), __entry->committed, __entry->nr) ) @@ -4399,8 +4411,6 @@ DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return); DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add); -DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace); -DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove); TRACE_EVENT(xfs_force_shutdown, diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 305c9d07bf1b..12d45e93f07d 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1237,6 +1237,68 @@ out_cancel: } /* + * Try to reserve more blocks for a transaction. + * + * This is for callers that need to attach resources to a transaction, scan + * those resources to determine the space reservation requirements, and then + * modify the attached resources. In other words, online repair. This can + * fail due to ENOSPC, so the caller must be able to cancel the transaction + * without shutting down the fs. + */ +int +xfs_trans_reserve_more( + struct xfs_trans *tp, + unsigned int blocks, + unsigned int rtextents) +{ + struct xfs_trans_res resv = { }; + + return xfs_trans_reserve(tp, &resv, blocks, rtextents); +} + +/* + * Try to reserve more blocks and file quota for a transaction. Same + * conditions of usage as xfs_trans_reserve_more. 
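/*
 * A hedged sketch of the calling pattern described above, not code from the
 * patch: attach and scan the resources first, then ask for the extra block
 * reservation, and treat ENOSPC as a clean failure because nothing has been
 * dirtied yet.  xrep_example() and blocks_needed are made-up names, and the
 * usual fs/xfs includes are assumed; the inode-aware variant,
 * xfs_trans_reserve_more_inode(), follows below.
 */
static int
xrep_example(
	struct xfs_trans	*tp,
	unsigned int		blocks_needed)
{
	int			error;

	/*
	 * Nothing in the transaction is dirty yet, so if this fails with
	 * ENOSPC the caller can xfs_trans_cancel() without shutting down
	 * the filesystem.
	 */
	error = xfs_trans_reserve_more(tp, blocks_needed, 0);
	if (error)
		return error;

	/* ...modify the attached resources under the larger reservation... */
	return 0;
}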
+ */ +int +xfs_trans_reserve_more_inode( + struct xfs_trans *tp, + struct xfs_inode *ip, + unsigned int dblocks, + unsigned int rblocks, + bool force_quota) +{ + struct xfs_trans_res resv = { }; + struct xfs_mount *mp = ip->i_mount; + unsigned int rtx = xfs_extlen_to_rtxlen(mp, rblocks); + int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + error = xfs_trans_reserve(tp, &resv, dblocks, rtx); + if (error) + return error; + + if (!XFS_IS_QUOTA_ON(mp) || xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) + return 0; + + if (tp->t_flags & XFS_TRANS_RESERVE) + force_quota = true; + + error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, + force_quota); + if (!error) + return 0; + + /* Quota failed, give back the new reservation. */ + xfs_mod_fdblocks(mp, dblocks, tp->t_flags & XFS_TRANS_RESERVE); + tp->t_blk_res -= dblocks; + xfs_mod_frextents(mp, rtx); + tp->t_rtx_res -= rtx; + return error; +} + +/* * Allocate an transaction in preparation for inode creation by reserving quota * against the given dquots. Callers are not required to hold any inode locks. */ diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 6e3646d524ce..08ce757c7454 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -78,11 +78,7 @@ struct xfs_item_ops { xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); uint (*iop_push)(struct xfs_log_item *, struct list_head *); void (*iop_release)(struct xfs_log_item *); - int (*iop_recover)(struct xfs_log_item *lip, - struct list_head *capture_list); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); - struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, - struct xfs_trans *tp); struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done); }; @@ -168,6 +164,8 @@ typedef struct xfs_trans { int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp, uint blocks, uint rtextents, uint flags, struct xfs_trans **tpp); +int xfs_trans_reserve_more(struct xfs_trans *tp, + unsigned int blocks, unsigned int rtextents); int xfs_trans_alloc_empty(struct xfs_mount *mp, struct xfs_trans **tpp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); @@ -247,19 +245,13 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, extern struct kmem_cache *xfs_trans_cache; -static inline struct xfs_log_item * -xfs_trans_item_relog( - struct xfs_log_item *lip, - struct xfs_trans *tp) -{ - return lip->li_ops->iop_relog(lip, tp); -} - struct xfs_dquot; int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv, unsigned int dblocks, unsigned int rblocks, bool force, struct xfs_trans **tpp); +int xfs_trans_reserve_more_inode(struct xfs_trans *tp, struct xfs_inode *ip, + unsigned int dblocks, unsigned int rblocks, bool force_quota); int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, unsigned int dblocks, diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 987843f84d03..364104e1b38a 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -136,6 +136,9 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, }; int error; + if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK)) + return -EIO; + error = xfs_attr_get(&args); if (error) return error; @@ -294,6 +297,9 @@ xfs_vn_listxattr( struct inode *inode = d_inode(dentry); int error; + if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK)) + return -EIO; + /* * First read the regular on-disk attributes. 
*/ diff --git a/include/linux/mm.h b/include/linux/mm.h index 896c0079f64f..f5a97dec5169 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3942,6 +3942,7 @@ enum mf_flags { MF_UNPOISON = 1 << 4, MF_SW_SIMULATED = 1 << 5, MF_NO_RETRY = 1 << 6, + MF_MEM_PRE_REMOVE = 1 << 7, }; int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, unsigned long count, int mf_flags); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a0d9b4ac7d54..4f9b61f4a668 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -678,7 +678,7 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p, */ static void collect_procs_fsdax(struct page *page, struct address_space *mapping, pgoff_t pgoff, - struct list_head *to_kill) + struct list_head *to_kill, bool pre_remove) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -686,8 +686,15 @@ static void collect_procs_fsdax(struct page *page, i_mmap_lock_read(mapping); rcu_read_lock(); for_each_process(tsk) { - struct task_struct *t = task_early_kill(tsk, true); + struct task_struct *t = tsk; + /* + * Search for all tasks while MF_MEM_PRE_REMOVE is set, because + * the current may not be the one accessing the fsdax page. + * Otherwise, search for the current task. + */ + if (!pre_remove) + t = task_early_kill(tsk, true); if (!t) continue; vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1793,6 +1800,7 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, dax_entry_t cookie; struct page *page; size_t end = index + count; + bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE; mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; @@ -1804,9 +1812,14 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, if (!page) goto unlock; - SetPageHWPoison(page); + if (!pre_remove) + SetPageHWPoison(page); - collect_procs_fsdax(page, mapping, index, &to_kill); + /* + * The pre_remove case is revoking access, the memory is still + * good and could theoretically be put back into service. + */ + collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove); unmap_and_kill(&to_kill, page_to_pfn(page), mapping, index, mf_flags); unlock: |
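/*
 * A hedged sketch of how a dax holder might forward a pre-removal
 * notification into the machinery above.  Only mf_dax_kill_procs() and
 * MF_MEM_PRE_REMOVE come from this patch; the callback shape follows my
 * reading of dax_holder_operations::notify_failure, and the two example_*
 * range-translation helpers are hypothetical stand-ins for the holder's
 * real offset-to-mapping lookup.
 */
#include <linux/dax.h>
#include <linux/mm.h>

/* hypothetical helpers: resolve a daxdev byte offset to a file mapping */
struct address_space *example_lookup_mapping(u64 offset);
pgoff_t example_offset_to_pgoff(u64 offset);

static int example_notify_failure(struct dax_device *dax_dev,
		u64 offset, u64 len, int mf_flags)
{
	struct address_space	*mapping = example_lookup_mapping(offset);
	pgoff_t			index = example_offset_to_pgoff(offset);

	/*
	 * With MF_MEM_PRE_REMOVE set, mf_dax_kill_procs() notifies every
	 * task mapping the range and revokes access, but does not mark the
	 * pages hwpoisoned: the memory itself is still good.
	 */
	return mf_dax_kill_procs(mapping, index, len >> PAGE_SHIFT, mf_flags);
}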