summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorBrian Foster <bfoster@redhat.com>2019-10-14 02:10:36 +0200
committerDarrick J. Wong <darrick.wong@oracle.com>2019-10-21 18:04:58 +0200
commitdc8e69bd721840bc22ffe5aa8598fd92b44f0334 (patch)
treebf294c0913017078ebcdda70ed4625a1dcf7566a /fs
parentxfs: factor out tree fixup logic into helper (diff)
downloadlinux-dc8e69bd721840bc22ffe5aa8598fd92b44f0334.tar.xz
linux-dc8e69bd721840bc22ffe5aa8598fd92b44f0334.zip
xfs: optimize near mode bnobt scans with concurrent cntbt lookups
The near mode fallback algorithm consists of a left/right scan of the bnobt. This algorithm has very poor breakdown characteristics under worst case free space fragmentation conditions. If a suitable extent is far enough from the locality hint, each allocation may scan most or all of the bnobt before it completes. This causes pathological behavior and extremely high allocation latencies. While locality is important to near mode allocations, it is not so important as to incur pathological allocation latency to provide the asolute best available locality for every allocation. If the allocation is large enough or far enough away, there is a point of diminishing returns. As such, we can bound the overall operation by including an iterative cntbt lookup in the broader search. The cntbt lookup is optimized to immediately find the extent with best locality for the given size on each iteration. Since the cntbt is indexed by extent size, the lookup repeats with a variably aggressive increasing search key size until it runs off the edge of the tree. This approach provides a natural balance between the two algorithms for various situations. For example, the bnobt scan is able to satisfy smaller allocations such as for inode chunks or btree blocks more quickly where the cntbt search may have to search through a large set of extent sizes when the search key starts off small relative to the largest extent in the tree. On the other hand, the cntbt search more deterministically covers the set of suitable extents for larger data extent allocation requests that the bnobt scan may have to search the entire tree to locate. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c154
-rw-r--r--fs/xfs/xfs_trace.h2
2 files changed, 144 insertions, 12 deletions
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 8191301a9039..925eba9489d5 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -716,6 +716,7 @@ struct xfs_alloc_cur {
struct xfs_btree_cur *cnt; /* btree cursors */
struct xfs_btree_cur *bnolt;
struct xfs_btree_cur *bnogt;
+ xfs_extlen_t cur_len;/* current search length */
xfs_agblock_t rec_bno;/* extent startblock */
xfs_extlen_t rec_len;/* extent length */
xfs_agblock_t bno; /* alloc bno */
@@ -740,6 +741,7 @@ xfs_alloc_cur_setup(
ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO);
+ acur->cur_len = args->maxlen;
acur->rec_bno = 0;
acur->rec_len = 0;
acur->bno = 0;
@@ -921,6 +923,76 @@ xfs_alloc_cur_finish(
}
/*
+ * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
+ * bno optimized lookup to search for extents with ideal size and locality.
+ */
+STATIC int
+xfs_alloc_cntbt_iter(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur)
+{
+ struct xfs_btree_cur *cur = acur->cnt;
+ xfs_agblock_t bno;
+ xfs_extlen_t len, cur_len;
+ int error;
+ int i;
+
+ if (!xfs_alloc_cur_active(cur))
+ return 0;
+
+ /* locality optimized lookup */
+ cur_len = acur->cur_len;
+ error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
+ if (error)
+ return error;
+ if (i == 0)
+ return 0;
+ error = xfs_alloc_get_rec(cur, &bno, &len, &i);
+ if (error)
+ return error;
+
+ /* check the current record and update search length from it */
+ error = xfs_alloc_cur_check(args, acur, cur, &i);
+ if (error)
+ return error;
+ ASSERT(len >= acur->cur_len);
+ acur->cur_len = len;
+
+ /*
+ * We looked up the first record >= [agbno, len] above. The agbno is a
+ * secondary key and so the current record may lie just before or after
+ * agbno. If it is past agbno, check the previous record too so long as
+ * the length matches as it may be closer. Don't check a smaller record
+ * because that could deactivate our cursor.
+ */
+ if (bno > args->agbno) {
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (!error && i) {
+ error = xfs_alloc_get_rec(cur, &bno, &len, &i);
+ if (!error && i && len == acur->cur_len)
+ error = xfs_alloc_cur_check(args, acur, cur,
+ &i);
+ }
+ if (error)
+ return error;
+ }
+
+ /*
+ * Increment the search key until we find at least one allocation
+ * candidate or if the extent we found was larger. Otherwise, double the
+ * search key to optimize the search. Efficiency is more important here
+ * than absolute best locality.
+ */
+ cur_len <<= 1;
+ if (!acur->len || acur->cur_len >= cur_len)
+ acur->cur_len++;
+ else
+ acur->cur_len = cur_len;
+
+ return error;
+}
+
+/*
* Deal with the case where only small freespaces remain. Either return the
* contents of the last freespace record, or allocate space from the freelist if
* there is nothing in the tree.
@@ -1261,12 +1333,11 @@ xfs_alloc_walk_iter(
}
/*
- * Search in the by-bno btree to the left and to the right simultaneously until
- * in each case we find a large enough free extent or run into the edge of the
- * tree. When we run into the edge of the tree, we deactivate that cursor.
+ * Search the by-bno and by-size btrees in parallel in search of an extent with
+ * ideal locality based on the NEAR mode ->agbno locality hint.
*/
STATIC int
-xfs_alloc_ag_vextent_bnobt(
+xfs_alloc_ag_vextent_locality(
struct xfs_alloc_arg *args,
struct xfs_alloc_cur *acur,
int *stat)
@@ -1281,6 +1352,9 @@ xfs_alloc_ag_vextent_bnobt(
*stat = 0;
+ error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
+ if (error)
+ return error;
error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
if (error)
return error;
@@ -1289,12 +1363,37 @@ xfs_alloc_ag_vextent_bnobt(
return error;
/*
- * Loop going left with the leftward cursor, right with the rightward
- * cursor, until either both directions give up or we find an entry at
- * least as big as minlen.
+ * Search the bnobt and cntbt in parallel. Search the bnobt left and
+ * right and lookup the closest extent to the locality hint for each
+ * extent size key in the cntbt. The entire search terminates
+ * immediately on a bnobt hit because that means we've found best case
+ * locality. Otherwise the search continues until the cntbt cursor runs
+ * off the end of the tree. If no allocation candidate is found at this
+ * point, give up on locality, walk backwards from the end of the cntbt
+ * and take the first available extent.
+ *
+ * The parallel tree searches balance each other out to provide fairly
+ * consistent performance for various situations. The bnobt search can
+ * have pathological behavior in the worst case scenario of larger
+ * allocation requests and fragmented free space. On the other hand, the
+ * bnobt is able to satisfy most smaller allocation requests much more
+ * quickly than the cntbt. The cntbt search can sift through fragmented
+ * free space and sets of free extents for larger allocation requests
+ * more quickly than the bnobt. Since the locality hint is just a hint
+ * and we don't want to scan the entire bnobt for perfect locality, the
+ * cntbt search essentially bounds the bnobt search such that we can
+ * find good enough locality at reasonable performance in most cases.
*/
while (xfs_alloc_cur_active(acur->bnolt) ||
- xfs_alloc_cur_active(acur->bnogt)) {
+ xfs_alloc_cur_active(acur->bnogt) ||
+ xfs_alloc_cur_active(acur->cnt)) {
+
+ trace_xfs_alloc_cur_lookup(args);
+
+ /*
+ * Search the bnobt left and right. In the case of a hit, finish
+ * the search in the opposite direction and we're done.
+ */
error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
true, 1, &i);
if (error)
@@ -1305,7 +1404,6 @@ xfs_alloc_ag_vextent_bnobt(
fbinc = true;
break;
}
-
error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
1, &i);
if (error)
@@ -1316,9 +1414,40 @@ xfs_alloc_ag_vextent_bnobt(
fbinc = false;
break;
}
+
+ /*
+ * Check the extent with best locality based on the current
+ * extent size search key and keep track of the best candidate.
+ */
+ error = xfs_alloc_cntbt_iter(args, acur);
+ if (error)
+ return error;
+ if (!xfs_alloc_cur_active(acur->cnt)) {
+ trace_xfs_alloc_cur_lookup_done(args);
+ break;
+ }
+ }
+
+ /*
+ * If we failed to find anything due to busy extents, return empty
+ * handed so the caller can flush and retry. If no busy extents were
+ * found, walk backwards from the end of the cntbt as a last resort.
+ */
+ if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
+ error = xfs_btree_decrement(acur->cnt, 0, &i);
+ if (error)
+ return error;
+ if (i) {
+ acur->cnt->bc_private.a.priv.abt.active = true;
+ fbcur = acur->cnt;
+ fbinc = false;
+ }
}
- /* search the opposite direction for a better entry */
+ /*
+ * Search in the opposite direction for a better entry in the case of
+ * a bnobt hit or walk backwards from the end of the cntbt.
+ */
if (fbcur) {
error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
&i);
@@ -1447,9 +1576,10 @@ restart:
}
/*
- * Second algorithm. Search the bnobt left and right.
+ * Second algorithm. Combined cntbt and bnobt search to find ideal
+ * locality.
*/
- error = xfs_alloc_ag_vextent_bnobt(args, &acur, &i);
+ error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
if (error)
goto out;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 2eef14791cc5..926f4d10dc02 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1580,6 +1580,8 @@ DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
DEFINE_ALLOC_EVENT(xfs_alloc_cur);
DEFINE_ALLOC_EVENT(xfs_alloc_cur_right);
DEFINE_ALLOC_EVENT(xfs_alloc_cur_left);
+DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup);
+DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup_done);
DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);