24 files changed, 770 insertions, 135 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index fc593c869493..584e87e11cb6 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -52,6 +52,7 @@ xfs-y				+= $(addprefix libxfs/, \
 				   xfs_inode_fork.o \
 				   xfs_inode_buf.o \
 				   xfs_log_rlimit.o \
+				   xfs_ag_resv.o \
 				   xfs_rmap.o \
 				   xfs_rmap_btree.o \
 				   xfs_sb.o \
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
new file mode 100644
index 000000000000..e3ae0f2b4294
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ag_resv.h"
+#include "xfs_trans_space.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_btree.h"
+
+/*
+ * Per-AG Block Reservations
+ *
+ * For some kinds of allocation group metadata structures, it is advantageous
+ * to reserve a small number of blocks in each AG so that future expansions of
+ * that data structure do not encounter ENOSPC because errors during a btree
+ * split cause the filesystem to go offline.
+ *
+ * Prior to the introduction of reflink, this wasn't an issue because the free
+ * space btrees maintain a reserve of space (the AGFL) to handle any expansion
+ * that may be necessary; and allocations of other metadata (inodes, BMBT,
+ * dir/attr) aren't restricted to a single AG.  However, with reflink it is
+ * possible to allocate all the space in an AG, have subsequent reflink/CoW
+ * activity expand the refcount btree, and discover that there's no space left
+ * to handle that expansion.  Since we can calculate the maximum size of the
+ * refcount btree, we can reserve space for it and avoid ENOSPC.
+ *
+ * Handling per-AG reservations consists of four changes to the allocator's
+ * behavior:  First, because these reservations are always needed, we decrease
+ * the ag_max_usable counter to reflect the size of the AG after the reserved
+ * blocks are taken.  Second, the reservations must be reflected in the
+ * fdblocks count to maintain proper accounting.  Third, each AG must maintain
+ * its own reserved block counter so that we can calculate the amount of space
+ * that must remain free to maintain the reservations.  Fourth, the "remaining
+ * reserved blocks" count must be used when calculating the length of the
+ * longest free extent in an AG and to clamp maxlen in the per-AG allocation
+ * functions.  In other words, we maintain a virtual allocation via in-core
+ * accounting tricks so that we don't have to clean up after a crash. :)
+ *
+ * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
+ * values via struct xfs_alloc_arg or directly to the xfs_free_extent
+ * function.  It might seem a little funny to maintain a reservoir of blocks
+ * to feed another reservoir, but the AGFL only holds enough blocks to get
+ * through the next transaction.  The per-AG reservation is to ensure (we
+ * hope) that each AG never runs out of blocks.  Each data structure wanting
+ * to use the reservation system should update ask/used in xfs_ag_resv_init.
+ */
+
+/*
+ * Are we critically low on blocks?  For now we'll define that as the number
+ * of blocks we can get our hands on being less than 10% of what we reserved
+ * or less than some arbitrary number (maximum btree height).
+ */
+bool
+xfs_ag_resv_critical(
+	struct xfs_perag		*pag,
+	enum xfs_ag_resv_type		type)
+{
+	xfs_extlen_t			avail;
+	xfs_extlen_t			orig;
+
+	switch (type) {
+	case XFS_AG_RESV_METADATA:
+		avail = pag->pagf_freeblks - pag->pag_agfl_resv.ar_reserved;
+		orig = pag->pag_meta_resv.ar_asked;
+		break;
+	case XFS_AG_RESV_AGFL:
+		avail = pag->pagf_freeblks + pag->pagf_flcount -
+			pag->pag_meta_resv.ar_reserved;
+		orig = pag->pag_agfl_resv.ar_asked;
+		break;
+	default:
+		ASSERT(0);
+		return false;
+	}
+
+	trace_xfs_ag_resv_critical(pag, type, avail);
+
+	/* Critically low if less than 10% or max btree height remains. */
+	return avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS;
+}
+
+/*
+ * How many blocks are reserved but not used, and therefore must not be
+ * allocated away?
+ */
+xfs_extlen_t
+xfs_ag_resv_needed(
+	struct xfs_perag		*pag,
+	enum xfs_ag_resv_type		type)
+{
+	xfs_extlen_t			len;
+
+	len = pag->pag_meta_resv.ar_reserved + pag->pag_agfl_resv.ar_reserved;
+	switch (type) {
+	case XFS_AG_RESV_METADATA:
+	case XFS_AG_RESV_AGFL:
+		len -= xfs_perag_resv(pag, type)->ar_reserved;
+		break;
+	case XFS_AG_RESV_NONE:
+		/* empty */
+		break;
+	default:
+		ASSERT(0);
+	}
+
+	trace_xfs_ag_resv_needed(pag, type, len);
+
+	return len;
+}
+
+/* Clean out a reservation */
+static int
+__xfs_ag_resv_free(
+	struct xfs_perag		*pag,
+	enum xfs_ag_resv_type		type)
+{
+	struct xfs_ag_resv		*resv;
+	xfs_extlen_t			oldresv;
+	int				error;
+
+	trace_xfs_ag_resv_free(pag, type, 0);
+
+	resv = xfs_perag_resv(pag, type);
+	pag->pag_mount->m_ag_max_usable += resv->ar_asked;
+	/*
+	 * AGFL blocks are always considered "free", so whatever
+	 * was reserved at mount time must be given back at umount.
+	 */
+	if (type == XFS_AG_RESV_AGFL)
+		oldresv = resv->ar_orig_reserved;
+	else
+		oldresv = resv->ar_reserved;
+	error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
+	resv->ar_reserved = 0;
+	resv->ar_asked = 0;
+
+	if (error)
+		trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
+				error, _RET_IP_);
+	return error;
+}
+
+/* Free a per-AG reservation. */
+int
+xfs_ag_resv_free(
+	struct xfs_perag		*pag)
+{
+	int				error;
+	int				err2;
+
+	error = __xfs_ag_resv_free(pag, XFS_AG_RESV_AGFL);
+	err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
+	if (err2 && !error)
+		error = err2;
+	return error;
+}
+
+static int
+__xfs_ag_resv_init(
+	struct xfs_perag		*pag,
+	enum xfs_ag_resv_type		type,
+	xfs_extlen_t			ask,
+	xfs_extlen_t			used)
+{
+	struct xfs_mount		*mp = pag->pag_mount;
+	struct xfs_ag_resv		*resv;
+	int				error;
+
+	resv = xfs_perag_resv(pag, type);
+	if (used > ask)
+		ask = used;
+	resv->ar_asked = ask;
+	resv->ar_reserved = resv->ar_orig_reserved = ask - used;
+	mp->m_ag_max_usable -= ask;
+
+	trace_xfs_ag_resv_init(pag, type, ask);
+
+	error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true);
+	if (error)
+		trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
+				error, _RET_IP_);
+
+	return error;
+}
+
+/* Create a per-AG block reservation. */
+int
+xfs_ag_resv_init(
+	struct xfs_perag		*pag)
+{
+	xfs_extlen_t			ask;
+	xfs_extlen_t			used;
+	int				error = 0;
+
+	/* Create the metadata reservation. */
+	if (pag->pag_meta_resv.ar_asked == 0) {
+		ask = used = 0;
+
+		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+				ask, used);
+		if (error)
+			goto out;
+	}
+
+	/* Create the AGFL metadata reservation */
+	if (pag->pag_agfl_resv.ar_asked == 0) {
+		ask = used = 0;
+
+		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used);
+		if (error)
+			goto out;
+	}
+
+out:
+	return error;
+}
+
+/* Allocate a block from the reservation. */
+void
+xfs_ag_resv_alloc_extent(
+	struct xfs_perag		*pag,
+	enum xfs_ag_resv_type		type,
+	struct xfs_alloc_arg		*args)
+{
+	struct xfs_ag_resv		*resv;
+	xfs_extlen_t			len;
+	uint				field;
+
+	trace_xfs_ag_resv_alloc_extent(pag, type, args->len);
+
+	switch (type) {
+	case XFS_AG_RESV_METADATA:
+	case XFS_AG_RESV_AGFL:
+		resv = xfs_perag_resv(pag, type);
+		break;
+	default:
+		ASSERT(0);
+		/* fall through */
+	case XFS_AG_RESV_NONE:
+		field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
+				       XFS_TRANS_SB_FDBLOCKS;
+		xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
+		return;
+	}
+
+	len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
+	resv->ar_reserved -= len;
+	if (type == XFS_AG_RESV_AGFL)
+		return;
+	/* Allocations of reserved blocks only need on-disk sb updates... */
+	xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
+	/* ...but non-reserved blocks need in-core and on-disk updates. */
+	if (args->len > len)
+		xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
+				-((int64_t)args->len - len));
+}
+
+/* Free @len blocks back to reservation @type, refilling it up to ar_asked. */
+void
+xfs_ag_resv_free_extent(
+	struct xfs_perag		*pag,
+	enum xfs_ag_resv_type		type,
+	struct xfs_trans		*tp,
+	xfs_extlen_t			len)
+{
+	xfs_extlen_t			leftover;
+	struct xfs_ag_resv		*resv;
+
+	trace_xfs_ag_resv_free_extent(pag, type, len);
+
+	switch (type) {
+	case XFS_AG_RESV_METADATA:
+	case XFS_AG_RESV_AGFL:
+		resv = xfs_perag_resv(pag, type);
+		break;
+	default:
+		ASSERT(0);
+		/* fall through */
+	case XFS_AG_RESV_NONE:
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
+		return;
+	}
+
+	leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
+	resv->ar_reserved += leftover;
+	if (type == XFS_AG_RESV_AGFL)
+		return;
+	/* Only the blocks that refill the pool are an on-disk-only update... */
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, (int64_t)leftover);
+	/* ...the excess beyond that needs both in-core and on-disk updates. */
+	if (len > leftover)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
+}
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h
new file mode 100644
index 000000000000..8d6c687deef3
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag_resv.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef __XFS_AG_RESV_H__
+#define	__XFS_AG_RESV_H__
+
+int xfs_ag_resv_free(struct xfs_perag *pag);
+int xfs_ag_resv_init(struct xfs_perag *pag);
+
+bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type);
+xfs_extlen_t xfs_ag_resv_needed(struct xfs_perag *pag,
+		enum xfs_ag_resv_type type);
+
+void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
+		struct xfs_alloc_arg *args);
+void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
+		struct xfs_trans *tp, xfs_extlen_t len);
+
+#endif	/* __XFS_AG_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 05b5243d89f6..2620a86a756a 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -37,6 +37,7 @@
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_log.h"
+#include "xfs_ag_resv.h"
 
 struct workqueue_struct *xfs_alloc_wq;
 
@@ -74,14 +75,8 @@ xfs_prealloc_blocks(
  * extents need to be actually allocated. To get around this, we explicitly set
  * aside a few blocks which will not be reserved in delayed allocation.
  *
- * When rmap is disabled, we need to reserve 4 fsbs _per AG_ for the freelist
- * and 4 more to handle a potential split of the file's bmap btree.
- *
- * When rmap is enabled, we must also be able to handle two rmap btree inserts
- * to record both the file data extent and a new bmbt block.  The bmbt block
- * might not be in the same AG as the file data extent.  In the worst case
- * the bmap btree splits multiple levels and all the new blocks come from
- * different AGs, so set aside enough to handle rmap btree splits in all AGs.
+ * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a
+ * potential split of the file's bmap btree.
  */
 unsigned int
 xfs_alloc_set_aside(
@@ -90,8 +85,6 @@ xfs_alloc_set_aside(
 	unsigned int		blocks;
 
 	blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE);
-	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
-		blocks += mp->m_sb.sb_agcount * mp->m_rmap_maxlevels;
 	return blocks;
 }
 
@@ -680,12 +673,29 @@ xfs_alloc_ag_vextent(
 	xfs_alloc_arg_t	*args)	/* argument structure for allocation */
 {
 	int		error=0;
+	xfs_extlen_t	reservation;
+	xfs_extlen_t	oldmax;
 
 	ASSERT(args->minlen > 0);
 	ASSERT(args->maxlen > 0);
 	ASSERT(args->minlen <= args->maxlen);
 	ASSERT(args->mod < args->prod);
 	ASSERT(args->alignment > 0);
+
+	/*
+	 * Clamp maxlen to the amount of free space minus any reservations
+	 * that have been made.
+	 */
+	oldmax = args->maxlen;
+	reservation = xfs_ag_resv_needed(args->pag, args->resv);
+	if (args->maxlen > args->pag->pagf_freeblks - reservation)
+		args->maxlen = args->pag->pagf_freeblks - reservation;
+	if (args->maxlen == 0) {
+		args->agbno = NULLAGBLOCK;
+		args->maxlen = oldmax;
+		return 0;
+	}
+
 	/*
 	 * Branch to correct routine based on the type.
 	 */
@@ -705,12 +715,14 @@ xfs_alloc_ag_vextent(
 		/* NOTREACHED */
 	}
 
+	args->maxlen = oldmax;
+
 	if (error || args->agbno == NULLAGBLOCK)
 		return error;
 
 	ASSERT(args->len >= args->minlen);
 	ASSERT(args->len <= args->maxlen);
-	ASSERT(!args->wasfromfl || !args->isfl);
+	ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
 	ASSERT(args->agbno % args->alignment == 0);
 
 	/* if not file data, insert new block into the reverse map btree */
@@ -732,12 +744,7 @@ xfs_alloc_ag_vextent(
 					      args->agbno, args->len));
 	}
 
-	if (!args->isfl) {
-		xfs_trans_mod_sb(args->tp, args->wasdel ?
-				 XFS_TRANS_SB_RES_FDBLOCKS :
-				 XFS_TRANS_SB_FDBLOCKS,
-				 -((long)(args->len)));
-	}
+	xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
 
 	XFS_STATS_INC(args->mp, xs_allocx);
 	XFS_STATS_ADD(args->mp, xs_allocb, args->len);
@@ -1583,6 +1590,7 @@ xfs_alloc_ag_vextent_small(
 	int		*stat)	/* status: 0-freelist, 1-normal/none */
 {
 	struct xfs_owner_info	oinfo;
+	struct xfs_perag	*pag;
 	int		error;
 	xfs_agblock_t	fbno;
 	xfs_extlen_t	flen;
@@ -1600,7 +1608,8 @@ xfs_alloc_ag_vextent_small(
 	 * to respect minleft even when pulling from the
 	 * freelist.
 	 */
-	else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
+	else if (args->minlen == 1 && args->alignment == 1 &&
+		 args->resv != XFS_AG_RESV_AGFL &&
 		 (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
 		  > args->minleft)) {
 		error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
@@ -1629,13 +1638,18 @@ xfs_alloc_ag_vextent_small(
 			/*
 			 * If we're feeding an AGFL block to something that
 			 * doesn't live in the free space, we need to clear
-			 * out the OWN_AG rmap.
+			 * out the OWN_AG rmap and add the block back to
+			 * the AGFL per-AG reservation.
 			 */
 			xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
 			error = xfs_rmap_free(args->tp, args->agbp, args->agno,
 					fbno, 1, &oinfo);
 			if (error)
 				goto error0;
+			pag = xfs_perag_get(args->mp, args->agno);
+			xfs_ag_resv_free_extent(pag, XFS_AG_RESV_AGFL,
+					args->tp, 1);
+			xfs_perag_put(pag);
 
 			*stat = 0;
 			return 0;
@@ -1683,7 +1697,7 @@ xfs_free_ag_extent(
 	xfs_agblock_t		bno,
 	xfs_extlen_t		len,
 	struct xfs_owner_info	*oinfo,
-	int			isfl)
+	enum xfs_ag_resv_type	type)
 {
 	xfs_btree_cur_t	*bno_cur;	/* cursor for by-block btree */
 	xfs_btree_cur_t	*cnt_cur;	/* cursor for by-size btree */
@@ -1911,21 +1925,22 @@ xfs_free_ag_extent(
 	 */
 	pag = xfs_perag_get(mp, agno);
 	error = xfs_alloc_update_counters(tp, pag, agbp, len);
+	xfs_ag_resv_free_extent(pag, type, tp, len);
 	xfs_perag_put(pag);
 	if (error)
 		goto error0;
 
-	if (!isfl)
-		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
 	XFS_STATS_INC(mp, xs_freex);
 	XFS_STATS_ADD(mp, xs_freeb, len);
 
-	trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
+	trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL,
+			haveleft, haveright);
 
 	return 0;
 
  error0:
-	trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1);
+	trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL,
+			-1, -1);
 	if (bno_cur)
 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
 	if (cnt_cur)
@@ -1950,21 +1965,43 @@ xfs_alloc_compute_maxlevels(
 }
 
 /*
- * Find the length of the longest extent in an AG.
+ * Find the length of the longest extent in an AG.  The 'need' parameter
+ * specifies how much space we're going to need for the AGFL and the
+ * 'reserved' parameter tells us how many blocks in this AG are reserved for
+ * other callers.
  */
 xfs_extlen_t
 xfs_alloc_longest_free_extent(
 	struct xfs_mount	*mp,
 	struct xfs_perag	*pag,
-	xfs_extlen_t		need)
+	xfs_extlen_t		need,
+	xfs_extlen_t		reserved)
 {
 	xfs_extlen_t		delta = 0;
 
+	/*
+	 * If the AGFL needs a recharge, we'll have to subtract that from the
+	 * longest extent.
+	 */
 	if (need > pag->pagf_flcount)
 		delta = need - pag->pagf_flcount;
 
+	/*
+	 * If we cannot maintain others' reservations with space from the
+	 * not-longest freesp extents, we'll have to subtract /that/ from
+	 * the longest extent too.
+	 */
+	if (pag->pagf_freeblks - pag->pagf_longest < reserved)
+		delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
+
+	/*
+	 * If the longest extent is long enough to satisfy all the
+	 * reservations and AGFL rules in place, we can return this extent.
+	 */
 	if (pag->pagf_longest > delta)
 		return pag->pagf_longest - delta;
+
+	/* Otherwise, let the caller try for 1 block if there's space. */
 	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
 }
 
@@ -2004,20 +2041,24 @@ xfs_alloc_space_available(
 {
 	struct xfs_perag	*pag = args->pag;
 	xfs_extlen_t		longest;
+	xfs_extlen_t		reservation; /* blocks that are still reserved */
 	int			available;
 
 	if (flags & XFS_ALLOC_FLAG_FREEING)
 		return true;
 
+	reservation = xfs_ag_resv_needed(pag, args->resv);
+
 	/* do we have enough contiguous free space for the allocation? */
-	longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free);
+	longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free,
+			reservation);
 	if ((args->minlen + args->alignment + args->minalignslop - 1) > longest)
 		return false;
 
-	/* do have enough free space remaining for the allocation? */
+	/* do we have enough free space remaining for the allocation? */
 	available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
-			  min_free - args->total);
-	if (available < (int)args->minleft)
+			  reservation - min_free - args->total);
+	if (available < (int)args->minleft || available <= 0)
 		return false;
 
 	return true;
@@ -2124,7 +2165,7 @@ xfs_alloc_fix_freelist(
 		if (error)
 			goto out_agbp_relse;
 		error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1,
-					   &targs.oinfo, 1);
+					   &targs.oinfo, XFS_AG_RESV_AGFL);
 		if (error)
 			goto out_agbp_relse;
 		bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
@@ -2135,7 +2176,7 @@ xfs_alloc_fix_freelist(
 	targs.mp = mp;
 	targs.agbp = agbp;
 	targs.agno = args->agno;
-	targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
+	targs.alignment = targs.minlen = targs.prod = 1;
 	targs.type = XFS_ALLOCTYPE_THIS_AG;
 	targs.pag = pag;
 	error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
@@ -2146,6 +2187,7 @@ xfs_alloc_fix_freelist(
 	while (pag->pagf_flcount < need) {
 		targs.agbno = 0;
 		targs.maxlen = need - pag->pagf_flcount;
+		targs.resv = XFS_AG_RESV_AGFL;
 
 		/* Allocate as many blocks as possible at once. */
 		error = xfs_alloc_ag_vextent(&targs);
@@ -2825,7 +2867,8 @@ xfs_free_extent(
 	struct xfs_trans	*tp,	/* transaction pointer */
 	xfs_fsblock_t		bno,	/* starting block number of extent */
 	xfs_extlen_t		len,	/* length of extent */
-	struct xfs_owner_info	*oinfo)	/* extent owner */
+	struct xfs_owner_info	*oinfo,	/* extent owner */
+	enum xfs_ag_resv_type	type)	/* block reservation type */
 {
 	struct xfs_mount	*mp = tp->t_mountp;
 	struct xfs_buf		*agbp;
@@ -2834,6 +2877,7 @@ xfs_free_extent(
 	int			error;
 
 	ASSERT(len != 0);
+	ASSERT(type != XFS_AG_RESV_AGFL);
 
 	if (XFS_TEST_ERROR(false, mp,
 			XFS_ERRTAG_FREE_EXTENT,
@@ -2851,7 +2895,7 @@ xfs_free_extent(
 		agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
 				err);
 
-	error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, 0);
+	error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
 	if (error)
 		goto err;
 
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 6fe2d6b7cfe9..f7c520193239 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -87,10 +87,10 @@ typedef struct xfs_alloc_arg {
 	xfs_alloctype_t	otype;		/* original allocation type */
 	char		wasdel;		/* set if allocation was prev delayed */
 	char		wasfromfl;	/* set if allocation is from freelist */
-	char		isfl;		/* set if is freelist blocks - !acctg */
 	char		userdata;	/* mask defining userdata treatment */
 	xfs_fsblock_t	firstblock;	/* io first block allocated */
 	struct xfs_owner_info	oinfo;	/* owner of blocks being allocated */
+	enum xfs_ag_resv_type	resv;	/* block reservation to use */
 } xfs_alloc_arg_t;
 
 /*
@@ -106,7 +106,8 @@ unsigned int xfs_alloc_set_aside(struct xfs_mount *mp);
 unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp);
 
 xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
-		struct xfs_perag *pag, xfs_extlen_t need);
+		struct xfs_perag *pag, xfs_extlen_t need,
+		xfs_extlen_t reserved);
 unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
 		struct xfs_perag *pag);
 
@@ -184,7 +185,8 @@ xfs_free_extent(
 	struct xfs_trans	*tp,	/* transaction pointer */
 	xfs_fsblock_t		bno,	/* starting block number of extent */
 	xfs_extlen_t		len,	/* length of extent */
-	struct xfs_owner_info	*oinfo);/* extent owner */
+	struct xfs_owner_info	*oinfo,	/* extent owner */
+	enum xfs_ag_resv_type	type);	/* block reservation type */
 
 int				/* error */
 xfs_alloc_lookup_ge(
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 614803bc8a9f..6fd458674e56 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -47,6 +47,7 @@
 #include "xfs_attr_leaf.h"
 #include "xfs_filestream.h"
 #include "xfs_rmap.h"
+#include "xfs_ag_resv.h"
 
 
 kmem_zone_t		*xfs_bmap_free_item_zone;
@@ -3501,7 +3502,8 @@ xfs_bmap_longest_free_extent(
 	}
 
 	longest = xfs_alloc_longest_free_extent(mp, pag,
-					xfs_alloc_min_freelist(mp, pag));
+				xfs_alloc_min_freelist(mp, pag),
+				xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
 	if (*blen < longest)
 		*blen = longest;
 
@@ -3781,7 +3783,7 @@ xfs_bmap_btalloc(
 	}
 	args.minleft = ap->minleft;
 	args.wasdel = ap->wasdel;
-	args.isfl = 0;
+	args.resv = XFS_AG_RESV_NONE;
 	args.userdata = ap->userdata;
 	if (ap->userdata & XFS_ALLOC_USERDATA_ZERO)
 		args.ip = ap->ip;
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 08569792fe20..aa1752f918b8 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2070,7 +2070,7 @@ __xfs_btree_updkeys(
 	struct xfs_buf		*bp0,
 	bool			force_all)
 {
-	union xfs_btree_bigkey	key;	/* keys from current level */
+	union xfs_btree_key	key;	/* keys from current level */
 	union xfs_btree_key	*lkey;	/* keys from the next level up */
 	union xfs_btree_key	*hkey;
 	union xfs_btree_key	*nlkey;	/* keys from the next level up */
@@ -2086,7 +2086,7 @@ __xfs_btree_updkeys(
 
 	trace_xfs_btree_updkeys(cur, level, bp0);
 
-	lkey = (union xfs_btree_key *)&key;
+	lkey = &key;
 	hkey = xfs_btree_high_key_from_key(cur, lkey);
 	xfs_btree_get_keys(cur, block, lkey);
 	for (level++; level < cur->bc_nlevels; level++) {
@@ -3226,7 +3226,7 @@ xfs_btree_insrec(
 	struct xfs_buf		*bp;	/* buffer for block */
 	union xfs_btree_ptr	nptr;	/* new block ptr */
 	struct xfs_btree_cur	*ncur;	/* new btree cursor */
-	union xfs_btree_bigkey	nkey;	/* new block key */
+	union xfs_btree_key	nkey;	/* new block key */
 	union xfs_btree_key	*lkey;
 	int			optr;	/* old key/record index */
 	int			ptr;	/* key/record index */
@@ -3241,7 +3241,7 @@ xfs_btree_insrec(
 	XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec);
 
 	ncur = NULL;
-	lkey = (union xfs_btree_key *)&nkey;
+	lkey = &nkey;
 
 	/*
 	 * If we have an external root pointer, and we've made it to the
@@ -3444,14 +3444,14 @@ xfs_btree_insert(
 	union xfs_btree_ptr	nptr;	/* new block number (split result) */
 	struct xfs_btree_cur	*ncur;	/* new cursor (split result) */
 	struct xfs_btree_cur	*pcur;	/* previous level's cursor */
-	union xfs_btree_bigkey	bkey;	/* key of block to insert */
+	union xfs_btree_key	bkey;	/* key of block to insert */
 	union xfs_btree_key	*key;
 	union xfs_btree_rec	rec;	/* record to insert */
 
 	level = 0;
 	ncur = NULL;
 	pcur = cur;
-	key = (union xfs_btree_key *)&bkey;
+	key = &bkey;
 
 	xfs_btree_set_ptr_null(cur, &nptr);
 
@@ -4797,3 +4797,50 @@ xfs_btree_query_range(
 	return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
 			fn, priv);
 }
+
+/*
+ * Calculate the number of blocks needed to store a given number of records
+ * in a short-format (per-AG metadata) btree.
+ */
+xfs_extlen_t
+xfs_btree_calc_size(
+	struct xfs_mount	*mp,
+	uint			*limits,
+	unsigned long long	len)
+{
+	int			level;
+	int			maxrecs;
+	xfs_extlen_t		rval;
+
+	maxrecs = limits[0];
+	for (level = 0, rval = 0; len > 1; level++) {
+		len += maxrecs - 1;
+		do_div(len, maxrecs);
+		maxrecs = limits[1];
+		rval += len;
+	}
+	return rval;
+}
+
+/* Block visitor for xfs_btree_visit_blocks: bump the counter behind @data. */
+static int
+xfs_btree_count_blocks_helper(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	void			*data)
+{
+	xfs_extlen_t		*blocks = data;
+	(*blocks)++;
+	return 0;
+}
+
+/* Count the blocks in a btree and return the result in *blocks. */
+int
+xfs_btree_count_blocks(
+	struct xfs_btree_cur	*cur,
+	xfs_extlen_t		*blocks)
+{
+	*blocks = 0;
+	return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
+			blocks);
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 04d0865e5e6d..3f8556a5c2ad 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -37,30 +37,18 @@ union xfs_btree_ptr {
 	__be64			l;	/* long form ptr */
 };
 
-union xfs_btree_key {
-	struct xfs_bmbt_key		bmbt;
-	xfs_bmdr_key_t			bmbr;	/* bmbt root block */
-	xfs_alloc_key_t			alloc;
-	struct xfs_inobt_key		inobt;
-	struct xfs_rmap_key		rmap;
-};
-
 /*
- * In-core key that holds both low and high keys for overlapped btrees.
- * The two keys are packed next to each other on disk, so do the same
- * in memory.  Preserve the existing xfs_btree_key as a single key to
- * avoid the mental model breakage that would happen if we passed a
- * bigkey into a function that operates on a single key.
+ * The in-core btree key.  Overlapping btrees actually store two keys
+ * per pointer, so we reserve enough memory to hold both.  The __*bigkey
+ * items should never be accessed directly.
  */
-union xfs_btree_bigkey {
+union xfs_btree_key {
 	struct xfs_bmbt_key		bmbt;
 	xfs_bmdr_key_t			bmbr;	/* bmbt root block */
 	xfs_alloc_key_t			alloc;
 	struct xfs_inobt_key		inobt;
-	struct {
-		struct xfs_rmap_key	rmap;
-		struct xfs_rmap_key	rmap_hi;
-	};
+	struct xfs_rmap_key		rmap;
+	struct xfs_rmap_key		__rmap_bigkey[2];
 };
 
 union xfs_btree_rec {
@@ -513,6 +501,8 @@ bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
 bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
 uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,
 				 unsigned long len);
+xfs_extlen_t xfs_btree_calc_size(struct xfs_mount *mp, uint *limits,
+		unsigned long long len);
 
 /* return codes */
 #define XFS_BTREE_QUERY_RANGE_CONTINUE	0	/* keep iterating */
@@ -529,4 +519,6 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
 int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,
 		xfs_btree_visit_blocks_fn fn, void *data);
 
+int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks);
+
 #endif	/* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index c221d0ecd52e..613c5cf19436 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -81,6 +81,10 @@
  *   - For each work item attached to the log intent item,
  *     * Perform the described action.
  *     * Attach the work item to the log done item.
+ *     * If the result of doing the work was -EAGAIN, ->finish_item
+ *       wants a new transaction.  See the "Requesting a Fresh
+ *       Transaction while Finishing Deferred Work" section below for
+ *       details.
  *
  * The key here is that we must log an intent item for all pending
  * work items every time we roll the transaction, and that we must log
@@ -88,6 +92,34 @@
  * we can perform complex remapping operations, chaining intent items
  * as needed.
  *
+ * Requesting a Fresh Transaction while Finishing Deferred Work
+ *
+ * If ->finish_item decides that it needs a fresh transaction to
+ * finish the work, it must ask its caller (xfs_defer_finish) for a
+ * continuation.  The most likely cause of this circumstance is the
+ * refcount adjust functions deciding that they've logged enough items
+ * to be at risk of exceeding the transaction reservation.
+ *
+ * To get a fresh transaction, we want to log the existing log done
+ * item to prevent the log intent item from replaying, immediately log
+ * a new log intent item with the unfinished work items, roll the
+ * transaction, and re-call ->finish_item wherever it left off.  The
+ * log done item and the new log intent item must be in the same
+ * transaction or atomicity cannot be guaranteed; defer_finish ensures
+ * that this happens.
+ *
+ * This requires some coordination between ->finish_item and
+ * defer_finish.  Upon deciding to request a new transaction,
+ * ->finish_item should update the current work item to reflect the
+ * unfinished work.  Next, it should reset the log done item's list
+ * count to the number of items finished, and return -EAGAIN.
+ * defer_finish sees the -EAGAIN, logs the new log intent item
+ * with the remaining work items, and leaves the xfs_defer_pending
+ * item at the head of the dop_work queue.  Then it rolls the
+ * transaction and picks up processing where it left off.  It is
+ * required that ->finish_item must be careful to leave enough
+ * transaction reservation to fit the new log intent item.
+ *
  * This is an example of remapping the extent (E, E+B) into file X at
  * offset A and dealing with the extent (C, C+B) already being mapped
  * there:
@@ -104,21 +136,26 @@
  * | Intent to add rmap (X, E, A, B)                 |
  * +-------------------------------------------------+
  * | Reduce refcount for extent (C, B)               | t2
- * | Done reducing refcount for extent (C, B)        |
+ * | Done reducing refcount for extent (C, 9)        |
+ * | Intent to reduce refcount for extent (C+9, B-9) |
+ * | (ran out of space after 9 refcount updates)     |
+ * +-------------------------------------------------+
+ * | Reduce refcount for extent (C+9, B-9)           | t3
+ * | Done reducing refcount for extent (C+9, B-9)    |
  * | Increase refcount for extent (E, B)             |
  * | Done increasing refcount for extent (E, B)      |
  * | Intent to free extent (C, B)                    |
  * | Intent to free extent (F, 1) (refcountbt block) |
  * | Intent to remove rmap (F, 1, REFC)              |
  * +-------------------------------------------------+
- * | Remove rmap (X, C, A, B)                        | t3
+ * | Remove rmap (X, C, A, B)                        | t4
  * | Done removing rmap (X, C, A, B)                 |
  * | Add rmap (X, E, A, B)                           |
  * | Done adding rmap (X, E, A, B)                   |
  * | Remove rmap (F, 1, REFC)                        |
  * | Done removing rmap (F, 1, REFC)                 |
  * +-------------------------------------------------+
- * | Free extent (C, B)                              | t4
+ * | Free extent (C, B)                              | t5
  * | Done freeing extent (C, B)                      |
  * | Free extent (D, 1)                              |
  * | Done freeing extent (D, 1)                      |
@@ -141,6 +178,9 @@
  * - Intent to free extent (C, B)
  * - Intent to free extent (F, 1) (refcountbt block)
  * - Intent to remove rmap (F, 1, REFC)
+ *
+ * Note that the continuation requested between t2 and t3 is likely to
+ * reoccur.
  */
 
 static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX];
@@ -323,7 +363,16 @@ xfs_defer_finish(
 			dfp->dfp_count--;
 			error = dfp->dfp_type->finish_item(*tp, dop, li,
 					dfp->dfp_done, &state);
-			if (error) {
+			if (error == -EAGAIN) {
+				/*
+				 * Caller wants a fresh transaction;
+				 * put the work item back on the list
+				 * and jump out.
+				 */
+				list_add(li, &dfp->dfp_work);
+				dfp->dfp_count++;
+				break;
+			} else if (error) {
 				/*
 				 * Clean up after ourselves and jump out.
 				 * xfs_defer_cancel will take care of freeing
@@ -335,9 +384,25 @@ xfs_defer_finish(
 				goto out;
 			}
 		}
-		/* Done with the dfp, free it. */
-		list_del(&dfp->dfp_list);
-		kmem_free(dfp);
+		if (error == -EAGAIN) {
+			/*
+			 * Caller wants a fresh transaction, so log a
+			 * new log intent item to replace the old one
+			 * and roll the transaction.  See "Requesting
+			 * a Fresh Transaction while Finishing
+			 * Deferred Work" above.
+			 */
+			dfp->dfp_intent = dfp->dfp_type->create_intent(*tp,
+					dfp->dfp_count);
+			dfp->dfp_done = NULL;
+			list_for_each(li, &dfp->dfp_work)
+				dfp->dfp_type->log_item(*tp, dfp->dfp_intent,
+						li);
+		} else {
+			/* Done with the dfp, free it. */
+			list_del(&dfp->dfp_list);
+			kmem_free(dfp);
+		}
 
 		if (cleanup_fn)
 			cleanup_fn(*tp, state, error);
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 31ca2208c03d..eab68ae2e011 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -132,7 +132,7 @@ xfs_inobt_free_block(
 	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
 	return xfs_free_extent(cur->bc_tp,
 			XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1,
-			&oinfo);
+			&oinfo, XFS_AG_RESV_NONE);
 }
 
 STATIC int
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index a6eed43fa7cd..fc5eef85d61e 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -647,9 +647,17 @@ struct xfs_rui_log_format {
 	__uint16_t		rui_size;	/* size of this item */
 	__uint32_t		rui_nextents;	/* # extents to free */
 	__uint64_t		rui_id;		/* rui identifier */
-	struct xfs_map_extent	rui_extents[1];	/* array of extents to rmap */
+	struct xfs_map_extent	rui_extents[];	/* array of extents to rmap */
 };
 
+static inline size_t
+xfs_rui_log_format_sizeof(
+	unsigned int		nr)
+{
+	return sizeof(struct xfs_rui_log_format) +
+			nr * sizeof(struct xfs_map_extent);
+}
+
 /*
  * This is the structure used to lay out an rud log item in the
  * log.  The rud_extents array is a variable size array whose
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index e455f9098d49..2975cb2319f4 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -865,7 +865,7 @@ xfs_buf_item_log_segment(
 	 */
 	if (bit) {
 		end_bit = MIN(bit + bits_to_set, (uint)NBWORD);
-		mask = ((1 << (end_bit - bit)) - 1) << bit;
+		mask = ((1U << (end_bit - bit)) - 1) << bit;
 		*wordp |= mask;
 		wordp++;
 		bits_set = end_bit - bit;
@@ -888,7 +888,7 @@ xfs_buf_item_log_segment(
 	 */
 	end_bit = bits_to_set - bits_set;
 	if (end_bit) {
-		mask = (1 << end_bit) - 1;
+		mask = (1U << end_bit) - 1;
 		*wordp |= mask;
 	}
 }
@@ -1095,7 +1095,8 @@ xfs_buf_iodone_callback_error(
 	     bp->b_last_error != bp->b_error) {
 		bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
 		bp->b_last_error = bp->b_error;
-		if (cfg->retry_timeout && !bp->b_first_retry_time)
+		if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
+		    !bp->b_first_retry_time)
 			bp->b_first_retry_time = jiffies;
 
 		xfs_buf_ioerror(bp, 0);
@@ -1111,7 +1112,7 @@ xfs_buf_iodone_callback_error(
 	if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
 	    ++bp->b_retries > cfg->max_retries)
 			goto permanent_error;
-	if (cfg->retry_timeout &&
+	if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
 	    time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
 			goto permanent_error;
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f99d7fac5abf..c68517b0f248 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -269,6 +269,8 @@ xfs_file_dio_aio_read(
 		return -EINVAL;
 	}
 
+	file_accessed(iocb->ki_filp);
+
 	/*
 	 * Locking is a bit tricky here. If we take an exclusive lock for direct
 	 * IO, we effectively serialise all new concurrent read IO to this file
@@ -323,7 +325,6 @@ xfs_file_dio_aio_read(
 	}
 	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 
-	file_accessed(iocb->ki_filp);
 	return ret;
 }
 
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 4a33a3304369..c8005fdaaa8a 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -30,6 +30,7 @@
 #include "xfs_mru_cache.h"
 #include "xfs_filestream.h"
 #include "xfs_trace.h"
+#include "xfs_ag_resv.h"
 
 struct xfs_fstrm_item {
 	struct xfs_mru_cache_elem	mru;
@@ -198,7 +199,8 @@ xfs_filestream_pick_ag(
 		}
 
 		longest = xfs_alloc_longest_free_extent(mp, pag,
-					xfs_alloc_min_freelist(mp, pag));
+				xfs_alloc_min_freelist(mp, pag),
+				xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
 		if (((minlen && longest >= minlen) ||
 		     (!minlen && pag->pagf_freeblks >= minfree)) &&
 		    (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 0b7f986745c1..94ac06f3d908 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -553,7 +553,7 @@ xfs_growfs_data_private(
 		error = xfs_free_extent(tp,
 				XFS_AGB_TO_FSB(mp, agno,
 					be32_to_cpu(agf->agf_length) - new),
-				new, &oinfo);
+				new, &oinfo, XFS_AG_RESV_NONE);
 		if (error)
 			goto error0;
 	}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b36676cde103..041d9493e798 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -57,10 +57,16 @@ enum {
 
 #define XFS_ERR_RETRY_FOREVER	-1
 
+/*
+ * Although retry_timeout is in jiffies which is normally an unsigned long,
+ * we limit the retry timeout to 86400 seconds, or one day.  So even a
+ * signed 32-bit long is sufficient for a HZ value up to 24855.  Making it
+ * signed lets us store the special "-1" value, meaning retry forever.
+ */
 struct xfs_error_cfg {
 	struct xfs_kobj	kobj;
 	int		max_retries;
-	unsigned long	retry_timeout;	/* in jiffies, 0 = no timeout */
+	long		retry_timeout;	/* in jiffies, -1 = infinite */
 };
 
 typedef struct xfs_mount {
@@ -325,6 +331,22 @@ xfs_mp_fail_writes(struct xfs_mount *mp)
 }
 #endif
 
+/* per-AG block reservation data structures */
+enum xfs_ag_resv_type {
+	XFS_AG_RESV_NONE = 0,
+	XFS_AG_RESV_METADATA,
+	XFS_AG_RESV_AGFL,
+};
+
+struct xfs_ag_resv {
+	/* number of blocks originally reserved here */
+	xfs_extlen_t			ar_orig_reserved;
+	/* number of blocks reserved here */
+	xfs_extlen_t			ar_reserved;
+	/* number of blocks originally asked for */
+	xfs_extlen_t			ar_asked;
+};
+
 /*
  * Per-ag incore structure, copies of information in agf and agi, to improve the
  * performance of allocation group selection.
@@ -372,8 +394,28 @@ typedef struct xfs_perag {
 	/* for rcu-safe freeing */
 	struct rcu_head	rcu_head;
 	int		pagb_count;	/* pagb slots in use */
+
+	/* Blocks reserved for all kinds of metadata. */
+	struct xfs_ag_resv	pag_meta_resv;
+	/* Blocks reserved for just AGFL-based metadata. */
+	struct xfs_ag_resv	pag_agfl_resv;
 } xfs_perag_t;
 
+static inline struct xfs_ag_resv *
+xfs_perag_resv(
+	struct xfs_perag	*pag,
+	enum xfs_ag_resv_type	type)
+{
+	switch (type) {
+	case XFS_AG_RESV_METADATA:
+		return &pag->pag_meta_resv;
+	case XFS_AG_RESV_AGFL:
+		return &pag->pag_agfl_resv;
+	default:
+		return NULL;
+	}
+}
+
 extern void	xfs_uuid_table_free(void);
 extern int	xfs_log_sbcount(xfs_mount_t *);
 extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 2500f28689d5..0432a459871c 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -51,28 +51,16 @@ xfs_rui_item_free(
 		kmem_zone_free(xfs_rui_zone, ruip);
 }
 
-/*
- * This returns the number of iovecs needed to log the given rui item.
- * We only need 1 iovec for an rui item.  It just logs the rui_log_format
- * structure.
- */
-static inline int
-xfs_rui_item_sizeof(
-	struct xfs_rui_log_item *ruip)
-{
-	return sizeof(struct xfs_rui_log_format) +
-			(ruip->rui_format.rui_nextents - 1) *
-			sizeof(struct xfs_map_extent);
-}
-
 STATIC void
 xfs_rui_item_size(
 	struct xfs_log_item	*lip,
 	int			*nvecs,
 	int			*nbytes)
 {
+	struct xfs_rui_log_item	*ruip = RUI_ITEM(lip);
+
 	*nvecs += 1;
-	*nbytes += xfs_rui_item_sizeof(RUI_ITEM(lip));
+	*nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents);
 }
 
 /*
@@ -97,7 +85,7 @@ xfs_rui_item_format(
 	ruip->rui_format.rui_size = 1;
 
 	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format,
-			xfs_rui_item_sizeof(ruip));
+			xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents));
 }
 
 /*
@@ -205,16 +193,12 @@ xfs_rui_init(
 
 {
 	struct xfs_rui_log_item		*ruip;
-	uint				size;
 
 	ASSERT(nextents > 0);
-	if (nextents > XFS_RUI_MAX_FAST_EXTENTS) {
-		size = (uint)(sizeof(struct xfs_rui_log_item) +
-			((nextents - 1) * sizeof(struct xfs_map_extent)));
-		ruip = kmem_zalloc(size, KM_SLEEP);
-	} else {
+	if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
+		ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP);
+	else
 		ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP);
-	}
 
 	xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
 	ruip->rui_format.rui_nextents = nextents;
@@ -239,14 +223,12 @@ xfs_rui_copy_format(
 	uint				len;
 
 	src_rui_fmt = buf->i_addr;
-	len = sizeof(struct xfs_rui_log_format) +
-			(src_rui_fmt->rui_nextents - 1) *
-			sizeof(struct xfs_map_extent);
+	len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents);
 
 	if (buf->i_len != len)
 		return -EFSCORRUPTED;
 
-	memcpy((char *)dst_rui_fmt, (char *)src_rui_fmt, len);
+	memcpy(dst_rui_fmt, src_rui_fmt, len);
 	return 0;
 }
 
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index aefcc3a318a5..340c968e1f9c 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -70,6 +70,14 @@ struct xfs_rui_log_item {
 	struct xfs_rui_log_format	rui_format;
 };
 
+static inline size_t
+xfs_rui_log_item_sizeof(
+	unsigned int		nr)
+{
+	return offsetof(struct xfs_rui_log_item, rui_format) +
+			xfs_rui_log_format_sizeof(nr);
+}
+
 /*
  * This is the "rmap update done" log item.  It is used to log the fact that
  * some rmapbt updates mentioned in an earlier rui item have been performed.
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index fd6be45b3a1e..340975392e91 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1782,9 +1782,8 @@ xfs_init_zones(void)
 	if (!xfs_rud_zone)
 		goto out_destroy_icreate_zone;
 
-	xfs_rui_zone = kmem_zone_init((sizeof(struct xfs_rui_log_item) +
-			((XFS_RUI_MAX_FAST_EXTENTS - 1) *
-				sizeof(struct xfs_map_extent))),
+	xfs_rui_zone = kmem_zone_init(
+			xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS),
 			"xfs_rui_item");
 	if (!xfs_rui_zone)
 		goto out_destroy_rud_zone;
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 79cfd3fc5324..5f8d55d29a11 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -393,9 +393,15 @@ max_retries_show(
 	struct kobject	*kobject,
 	char		*buf)
 {
+	int		retries;
 	struct xfs_error_cfg *cfg = to_error_cfg(kobject);
 
-	return snprintf(buf, PAGE_SIZE, "%d\n", cfg->max_retries);
+	if (cfg->max_retries == XFS_ERR_RETRY_FOREVER)
+		retries = -1;
+	else
+		retries = cfg->max_retries;
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", retries);
 }
 
 static ssize_t
@@ -415,7 +421,10 @@ max_retries_store(
 	if (val < -1)
 		return -EINVAL;
 
-	cfg->max_retries = val;
+	if (val == -1)
+		cfg->max_retries = XFS_ERR_RETRY_FOREVER;
+	else
+		cfg->max_retries = val;
 	return count;
 }
 XFS_SYSFS_ATTR_RW(max_retries);
@@ -425,10 +434,15 @@ retry_timeout_seconds_show(
 	struct kobject	*kobject,
 	char		*buf)
 {
+	int		timeout;
 	struct xfs_error_cfg *cfg = to_error_cfg(kobject);
 
-	return snprintf(buf, PAGE_SIZE, "%ld\n",
-			jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC);
+	if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER)
+		timeout = -1;
+	else
+		timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC;
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", timeout);
 }
 
 static ssize_t
@@ -445,11 +459,16 @@ retry_timeout_seconds_store(
 	if (ret)
 		return ret;
 
-	/* 1 day timeout maximum */
-	if (val < 0 || val > 86400)
+	/* 1 day timeout maximum, -1 means infinite */
+	if (val < -1 || val > 86400)
 		return -EINVAL;
 
-	cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);
+	if (val == -1)
+		cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
+	else {
+		cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);
+		ASSERT(msecs_to_jiffies(val * MSEC_PER_SEC) < LONG_MAX);
+	}
 	return count;
 }
 XFS_SYSFS_ATTR_RW(retry_timeout_seconds);
@@ -519,18 +538,19 @@ struct xfs_error_init {
 static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = {
 	{ .name = "default",
 	  .max_retries = XFS_ERR_RETRY_FOREVER,
-	  .retry_timeout = 0,
+	  .retry_timeout = XFS_ERR_RETRY_FOREVER,
 	},
 	{ .name = "EIO",
 	  .max_retries = XFS_ERR_RETRY_FOREVER,
-	  .retry_timeout = 0,
+	  .retry_timeout = XFS_ERR_RETRY_FOREVER,
 	},
 	{ .name = "ENOSPC",
 	  .max_retries = XFS_ERR_RETRY_FOREVER,
-	  .retry_timeout = 0,
+	  .retry_timeout = XFS_ERR_RETRY_FOREVER,
 	},
 	{ .name = "ENODEV",
-	  .max_retries = 0,
+	  .max_retries = 0,	/* We can't recover from devices disappearing */
+	  .retry_timeout = 0,
 	},
 };
 
@@ -561,7 +581,10 @@ xfs_error_sysfs_init_class(
 			goto out_error;
 
 		cfg->max_retries = init[i].max_retries;
-		cfg->retry_timeout = msecs_to_jiffies(
+		if (init[i].retry_timeout == XFS_ERR_RETRY_FOREVER)
+			cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
+		else
+			cfg->retry_timeout = msecs_to_jiffies(
 					init[i].retry_timeout * MSEC_PER_SEC);
 	}
 	return 0;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index d303a665dba9..c2a875fcf26e 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1570,14 +1570,15 @@ TRACE_EVENT(xfs_agf,
 
 TRACE_EVENT(xfs_free_extent,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
-		 xfs_extlen_t len, bool isfl, int haveleft, int haveright),
-	TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright),
+		 xfs_extlen_t len, enum xfs_ag_resv_type resv, int haveleft,
+		 int haveright),
+	TP_ARGS(mp, agno, agbno, len, resv, haveleft, haveright),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
 		__field(xfs_agblock_t, agbno)
 		__field(xfs_extlen_t, len)
-		__field(int, isfl)
+		__field(int, resv)
 		__field(int, haveleft)
 		__field(int, haveright)
 	),
@@ -1586,16 +1587,16 @@ TRACE_EVENT(xfs_free_extent,
 		__entry->agno = agno;
 		__entry->agbno = agbno;
 		__entry->len = len;
-		__entry->isfl = isfl;
+		__entry->resv = resv;
 		__entry->haveleft = haveleft;
 		__entry->haveright = haveright;
 	),
-	TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s",
+	TP_printk("dev %d:%d agno %u agbno %u len %u resv %d %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __entry->agbno,
 		  __entry->len,
-		  __entry->isfl,
+		  __entry->resv,
 		  __entry->haveleft ?
 			(__entry->haveright ? "both" : "left") :
 			(__entry->haveright ? "right" : "none"))
@@ -1622,7 +1623,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
 		__field(short, otype)
 		__field(char, wasdel)
 		__field(char, wasfromfl)
-		__field(char, isfl)
+		__field(int, resv)
 		__field(char, userdata)
 		__field(xfs_fsblock_t, firstblock)
 	),
@@ -1643,13 +1644,13 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
 		__entry->otype = args->otype;
 		__entry->wasdel = args->wasdel;
 		__entry->wasfromfl = args->wasfromfl;
-		__entry->isfl = args->isfl;
+		__entry->resv = args->resv;
 		__entry->userdata = args->userdata;
 		__entry->firstblock = args->firstblock;
 	),
 	TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u "
 		  "prod %u minleft %u total %u alignment %u minalignslop %u "
-		  "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d "
+		  "len %u type %s otype %s wasdel %d wasfromfl %d resv %d "
 		  "userdata %d firstblock 0x%llx",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
@@ -1667,7 +1668,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
 		  __print_symbolic(__entry->otype, XFS_ALLOC_TYPES),
 		  __entry->wasdel,
 		  __entry->wasfromfl,
-		  __entry->isfl,
+		  __entry->resv,
 		  __entry->userdata,
 		  (unsigned long long)__entry->firstblock)
 )
@@ -2558,6 +2559,60 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result);
 DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result);
 DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result);
 
+/* per-AG reservation */
+DECLARE_EVENT_CLASS(xfs_ag_resv_class,
+	TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv,
+		 xfs_extlen_t len),
+	TP_ARGS(pag, resv, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(int, resv)
+		__field(xfs_extlen_t, freeblks)
+		__field(xfs_extlen_t, flcount)
+		__field(xfs_extlen_t, reserved)
+		__field(xfs_extlen_t, asked)
+		__field(xfs_extlen_t, len)
+	),
+	TP_fast_assign(
+		struct xfs_ag_resv	*r = xfs_perag_resv(pag, resv);
+
+		__entry->dev = pag->pag_mount->m_super->s_dev;
+		__entry->agno = pag->pag_agno;
+		__entry->resv = resv;
+		__entry->freeblks = pag->pagf_freeblks;
+		__entry->flcount = pag->pagf_flcount;
+		__entry->reserved = r ? r->ar_reserved : 0;
+		__entry->asked = r ? r->ar_asked : 0;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u resv %u ask %u len %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->resv,
+		  __entry->freeblks,
+		  __entry->flcount,
+		  __entry->reserved,
+		  __entry->asked,
+		  __entry->len)
+)
+#define DEFINE_AG_RESV_EVENT(name) \
+DEFINE_EVENT(xfs_ag_resv_class, name, \
+	TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type type, \
+		 xfs_extlen_t len), \
+	TP_ARGS(pag, type, len))
+
+/* per-AG reservation tracepoints */
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_init);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_free);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_alloc_extent);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed);
+
+DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error);
+DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 5f3d33d16e67..70f42ea86dfb 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -217,7 +217,7 @@ undo_log:
 
 undo_blocks:
 	if (blocks > 0) {
-		xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
+		xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
 		tp->t_blk_res = 0;
 	}
 
@@ -318,7 +318,6 @@ xfs_trans_mod_sb(
 		 * in-core superblock's counter.  This should only
 		 * be applied to the on-disk superblock.
 		 */
-		ASSERT(delta < 0);
 		tp->t_res_fdblocks_delta += delta;
 		if (xfs_sb_version_haslazysbcount(&mp->m_sb))
 			flags &= ~XFS_TRANS_SB_DIRTY;
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 459ddec137a4..ab438647592a 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -79,7 +79,8 @@ xfs_trans_free_extent(
 
 	trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
 
-	error = xfs_free_extent(tp, start_block, ext_len, oinfo);
+	error = xfs_free_extent(tp, start_block, ext_len, oinfo,
+			XFS_AG_RESV_NONE);
 
 	/*
 	 * Mark the transaction dirty, even on error. This ensures the
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index ea62245fee26..62900938f26d 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -147,6 +147,7 @@ __xfs_xattr_put_listent(
 	arraytop = context->count + prefix_len + namelen + 1;
 	if (arraytop > context->firstu) {
 		context->count = -1;	/* insufficient space */
+		context->seen_enough = 1;
 		return 0;
 	}
 	offset = (char *)context->alist + context->count;