summaryrefslogtreecommitdiffstats
path: root/fs/xfs/libxfs
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2016-08-18 07:17:32 +0200
committerDavid S. Miller <davem@davemloft.net>2016-08-18 07:17:32 +0200
commit60747ef4d173c2747bf7f0377fb22846cb422195 (patch)
treeea0faf33b952495c47909be1400c475a3f3821b0 /fs/xfs/libxfs
parentMerge branch 'strparser' (diff)
parentMerge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (diff)
downloadlinux-60747ef4d173c2747bf7f0377fb22846cb422195.tar.xz
linux-60747ef4d173c2747bf7f0377fb22846cb422195.zip
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
Minor overlapping changes for both merge conflicts. Resolution work done by Stephen Rothwell was used as a reference. Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'fs/xfs/libxfs')
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c149
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h52
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c12
-rw-r--r--fs/xfs/libxfs/xfs_attr.c71
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c4
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c19
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c241
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h54
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c32
-rw-r--r--fs/xfs/libxfs/xfs_btree.c914
-rw-r--r--fs/xfs/libxfs/xfs_btree.h88
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c6
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h4
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h1
-rw-r--r--fs/xfs/libxfs/xfs_defer.c463
-rw-r--r--fs/xfs/libxfs/xfs_defer.h97
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c15
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h8
-rw-r--r--fs/xfs/libxfs/xfs_format.h131
-rw-r--r--fs/xfs/libxfs/xfs_fs.h1
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c23
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h2
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c18
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c1
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h63
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c1399
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h209
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c511
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.h61
-rw-r--r--fs/xfs/libxfs/xfs_sb.c9
-rw-r--r--fs/xfs/libxfs/xfs_shared.h2
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c62
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.h10
-rw-r--r--fs/xfs/libxfs/xfs_types.h4
34 files changed, 4207 insertions, 529 deletions
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 88c26b827a2d..776ae2f325d1 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -24,8 +24,10 @@
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
+#include "xfs_rmap.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
@@ -49,6 +51,81 @@ STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+xfs_extlen_t
+xfs_prealloc_blocks(
+ struct xfs_mount *mp)
+{
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return XFS_RMAP_BLOCK(mp) + 1;
+ if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ return XFS_FIBT_BLOCK(mp) + 1;
+ return XFS_IBT_BLOCK(mp) + 1;
+}
+
+/*
+ * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
+ * AGF buffer (PV 947395), we place constraints on the relationship among
+ * actual allocations for data blocks, freelist blocks, and potential file data
+ * bmap btree blocks. However, these restrictions may result in no actual space
+ * allocated for a delayed extent, for example, a data block in a certain AG is
+ * allocated but there is no additional block for the additional bmap btree
+ * block due to a split of the bmap btree of the file. The result of this may
+ * lead to an infinite loop when the file gets flushed to disk and all delayed
+ * extents need to be actually allocated. To get around this, we explicitly set
+ * aside a few blocks which will not be reserved in delayed allocation.
+ *
+ * When rmap is disabled, we need to reserve 4 fsbs _per AG_ for the freelist
+ * and 4 more to handle a potential split of the file's bmap btree.
+ *
+ * When rmap is enabled, we must also be able to handle two rmap btree inserts
+ * to record both the file data extent and a new bmbt block. The bmbt block
+ * might not be in the same AG as the file data extent. In the worst case
+ * the bmap btree splits multiple levels and all the new blocks come from
+ * different AGs, so set aside enough to handle rmap btree splits in all AGs.
+ */
+unsigned int
+xfs_alloc_set_aside(
+ struct xfs_mount *mp)
+{
+ unsigned int blocks;
+
+ blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE);
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ blocks += mp->m_sb.sb_agcount * mp->m_rmap_maxlevels;
+ return blocks;
+}
+
+/*
+ * When deciding how much space to allocate out of an AG, we limit the
+ * allocation maximum size to the size the AG. However, we cannot use all the
+ * blocks in the AG - some are permanently used by metadata. These
+ * blocks are generally:
+ * - the AG superblock, AGF, AGI and AGFL
+ * - the AGF (bno and cnt) and AGI btree root blocks, and optionally
+ * the AGI free inode and rmap btree root blocks.
+ * - blocks on the AGFL according to xfs_alloc_set_aside() limits
+ * - the rmapbt root block
+ *
+ * The AG headers are sector sized, so the amount of space they take up is
+ * dependent on filesystem geometry. The others are all single blocks.
+ */
+unsigned int
+xfs_alloc_ag_max_usable(
+ struct xfs_mount *mp)
+{
+ unsigned int blocks;
+
+ blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
+ blocks += XFS_ALLOC_AGFL_RESERVE;
+ blocks += 3; /* AGF, AGI btree root blocks */
+ if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ blocks++; /* finobt root block */
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ blocks++; /* rmap root block */
+
+ return mp->m_sb.sb_agblocks - blocks;
+}
+
/*
* Lookup the record equal to [bno, len] in the btree given by cur.
*/
@@ -636,6 +713,14 @@ xfs_alloc_ag_vextent(
ASSERT(!args->wasfromfl || !args->isfl);
ASSERT(args->agbno % args->alignment == 0);
+ /* if not file data, insert new block into the reverse map btree */
+ if (args->oinfo.oi_owner != XFS_RMAP_OWN_UNKNOWN) {
+ error = xfs_rmap_alloc(args->tp, args->agbp, args->agno,
+ args->agbno, args->len, &args->oinfo);
+ if (error)
+ return error;
+ }
+
if (!args->wasfromfl) {
error = xfs_alloc_update_counters(args->tp, args->pag,
args->agbp,
@@ -1577,14 +1662,15 @@ error0:
/*
* Free the extent starting at agno/bno for length.
*/
-STATIC int /* error */
+STATIC int
xfs_free_ag_extent(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_buf_t *agbp, /* buffer for a.g. freelist header */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t bno, /* starting block number */
- xfs_extlen_t len, /* length of extent */
- int isfl) /* set if is freelist blocks - no sb acctg */
+ xfs_trans_t *tp,
+ xfs_buf_t *agbp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ struct xfs_owner_info *oinfo,
+ int isfl)
{
xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */
xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */
@@ -1601,12 +1687,19 @@ xfs_free_ag_extent(
xfs_extlen_t nlen; /* new length of freespace */
xfs_perag_t *pag; /* per allocation group data */
+ bno_cur = cnt_cur = NULL;
mp = tp->t_mountp;
+
+ if (oinfo->oi_owner != XFS_RMAP_OWN_UNKNOWN) {
+ error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo);
+ if (error)
+ goto error0;
+ }
+
/*
* Allocate and initialize a cursor for the by-block btree.
*/
bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
- cnt_cur = NULL;
/*
* Look for a neighboring block on the left (lower block numbers)
* that is contiguous with this space.
@@ -1875,6 +1968,11 @@ xfs_alloc_min_freelist(
/* space needed by-size freespace btree */
min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1,
mp->m_ag_maxlevels);
+ /* space needed reverse mapping used space btree */
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ min_free += min_t(unsigned int,
+ pag->pagf_levels[XFS_BTNUM_RMAPi] + 1,
+ mp->m_rmap_maxlevels);
return min_free;
}
@@ -1992,21 +2090,34 @@ xfs_alloc_fix_freelist(
* anything other than extra overhead when we need to put more blocks
* back on the free list? Maybe we should only do this when space is
* getting low or the AGFL is more than half full?
+ *
+ * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
+ * big; the NORMAP flag prevents AGFL expand/shrink operations from
+ * updating the rmapbt. Both flags are used in xfs_repair while we're
+ * rebuilding the rmapbt, and neither are used by the kernel. They're
+ * both required to ensure that rmaps are correctly recorded for the
+ * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and
+ * repair/rmap.c in xfsprogs for details.
*/
- while (pag->pagf_flcount > need) {
+ memset(&targs, 0, sizeof(targs));
+ if (flags & XFS_ALLOC_FLAG_NORMAP)
+ xfs_rmap_skip_owner_update(&targs.oinfo);
+ else
+ xfs_rmap_ag_owner(&targs.oinfo, XFS_RMAP_OWN_AG);
+ while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
struct xfs_buf *bp;
error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
if (error)
goto out_agbp_relse;
- error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1);
+ error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1,
+ &targs.oinfo, 1);
if (error)
goto out_agbp_relse;
bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
xfs_trans_binval(tp, bp);
}
- memset(&targs, 0, sizeof(targs));
targs.tp = tp;
targs.mp = mp;
targs.agbp = agbp;
@@ -2271,6 +2382,10 @@ xfs_agf_verify(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
return false;
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)
+ return false;
+
/*
* during growfs operations, the perag is not fully initialised,
* so we can't use it for any useful checking. growfs ensures we can't
@@ -2402,6 +2517,8 @@ xfs_alloc_read_agf(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
pag->pagf_levels[XFS_BTNUM_CNTi] =
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
+ pag->pagf_levels[XFS_BTNUM_RMAPi] =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
spin_lock_init(&pag->pagb_lock);
pag->pagb_count = 0;
pag->pagb_tree = RB_ROOT;
@@ -2691,7 +2808,8 @@ int /* error */
xfs_free_extent(
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t bno, /* starting block number of extent */
- xfs_extlen_t len) /* length of extent */
+ xfs_extlen_t len, /* length of extent */
+ struct xfs_owner_info *oinfo) /* extent owner */
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_buf *agbp;
@@ -2701,6 +2819,11 @@ xfs_free_extent(
ASSERT(len != 0);
+ if (XFS_TEST_ERROR(false, mp,
+ XFS_ERRTAG_FREE_EXTENT,
+ XFS_RANDOM_FREE_EXTENT))
+ return -EIO;
+
error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
if (error)
return error;
@@ -2712,7 +2835,7 @@ xfs_free_extent(
agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
err);
- error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, 0);
+ error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, 0);
if (error)
goto err;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index cf268b2d0b6c..6fe2d6b7cfe9 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -54,41 +54,8 @@ typedef unsigned int xfs_alloctype_t;
*/
#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */
#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/
-
-/*
- * In order to avoid ENOSPC-related deadlock caused by
- * out-of-order locking of AGF buffer (PV 947395), we place
- * constraints on the relationship among actual allocations for
- * data blocks, freelist blocks, and potential file data bmap
- * btree blocks. However, these restrictions may result in no
- * actual space allocated for a delayed extent, for example, a data
- * block in a certain AG is allocated but there is no additional
- * block for the additional bmap btree block due to a split of the
- * bmap btree of the file. The result of this may lead to an
- * infinite loop in xfssyncd when the file gets flushed to disk and
- * all delayed extents need to be actually allocated. To get around
- * this, we explicitly set aside a few blocks which will not be
- * reserved in delayed allocation. Considering the minimum number of
- * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap
- * btree requires 1 fsb, so we set the number of set-aside blocks
- * to 4 + 4*agcount.
- */
-#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4))
-
-/*
- * When deciding how much space to allocate out of an AG, we limit the
- * allocation maximum size to the size the AG. However, we cannot use all the
- * blocks in the AG - some are permanently used by metadata. These
- * blocks are generally:
- * - the AG superblock, AGF, AGI and AGFL
- * - the AGF (bno and cnt) and AGI btree root blocks
- * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
- *
- * The AG headers are sector sized, so the amount of space they take up is
- * dependent on filesystem geometry. The others are all single blocks.
- */
-#define XFS_ALLOC_AG_MAX_USABLE(mp) \
- ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
+#define XFS_ALLOC_FLAG_NORMAP 0x00000004 /* don't modify the rmapbt */
+#define XFS_ALLOC_FLAG_NOSHRINK 0x00000008 /* don't shrink the freelist */
/*
@@ -123,6 +90,7 @@ typedef struct xfs_alloc_arg {
char isfl; /* set if is freelist blocks - !acctg */
char userdata; /* mask defining userdata treatment */
xfs_fsblock_t firstblock; /* io first block allocated */
+ struct xfs_owner_info oinfo; /* owner of blocks being allocated */
} xfs_alloc_arg_t;
/*
@@ -132,6 +100,11 @@ typedef struct xfs_alloc_arg {
#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */
#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */
+/* freespace limit calculations */
+#define XFS_ALLOC_AGFL_RESERVE 4
+unsigned int xfs_alloc_set_aside(struct xfs_mount *mp);
+unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp);
+
xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
struct xfs_perag *pag, xfs_extlen_t need);
unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
@@ -208,9 +181,10 @@ xfs_alloc_vextent(
*/
int /* error */
xfs_free_extent(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_fsblock_t bno, /* starting block number of extent */
- xfs_extlen_t len); /* length of extent */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t bno, /* starting block number of extent */
+ xfs_extlen_t len, /* length of extent */
+ struct xfs_owner_info *oinfo);/* extent owner */
int /* error */
xfs_alloc_lookup_ge(
@@ -232,4 +206,6 @@ int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
struct xfs_buf **agbp);
+xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp);
+
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index d9b42425291e..5ba2dac5e67c 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -212,17 +212,6 @@ xfs_allocbt_init_key_from_rec(
}
STATIC void
-xfs_allocbt_init_rec_from_key(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
-{
- ASSERT(key->alloc.ar_startblock != 0);
-
- rec->alloc.ar_startblock = key->alloc.ar_startblock;
- rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
-}
-
-STATIC void
xfs_allocbt_init_rec_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_rec *rec)
@@ -406,7 +395,6 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
.get_minrecs = xfs_allocbt_get_minrecs,
.get_maxrecs = xfs_allocbt_get_maxrecs,
.init_key_from_rec = xfs_allocbt_init_key_from_rec,
- .init_rec_from_key = xfs_allocbt_init_rec_from_key,
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_allocbt_key_diff,
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 4e126f41a0aa..af1ecb19121e 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -23,6 +23,7 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr_sf.h"
@@ -203,7 +204,7 @@ xfs_attr_set(
{
struct xfs_mount *mp = dp->i_mount;
struct xfs_da_args args;
- struct xfs_bmap_free flist;
+ struct xfs_defer_ops dfops;
struct xfs_trans_res tres;
xfs_fsblock_t firstblock;
int rsvd = (flags & ATTR_ROOT) != 0;
@@ -221,7 +222,7 @@ xfs_attr_set(
args.value = value;
args.valuelen = valuelen;
args.firstblock = &firstblock;
- args.flist = &flist;
+ args.dfops = &dfops;
args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
args.total = xfs_attr_calc_size(&args, &local);
@@ -316,13 +317,13 @@ xfs_attr_set(
* It won't fit in the shortform, transform to a leaf block.
* GROT: another possible req'mt for a double-split btree op.
*/
- xfs_bmap_init(args.flist, args.firstblock);
+ xfs_defer_init(args.dfops, args.firstblock);
error = xfs_attr_shortform_to_leaf(&args);
if (!error)
- error = xfs_bmap_finish(&args.trans, args.flist, dp);
+ error = xfs_defer_finish(&args.trans, args.dfops, dp);
if (error) {
args.trans = NULL;
- xfs_bmap_cancel(&flist);
+ xfs_defer_cancel(&dfops);
goto out;
}
@@ -382,7 +383,7 @@ xfs_attr_remove(
{
struct xfs_mount *mp = dp->i_mount;
struct xfs_da_args args;
- struct xfs_bmap_free flist;
+ struct xfs_defer_ops dfops;
xfs_fsblock_t firstblock;
int error;
@@ -399,7 +400,7 @@ xfs_attr_remove(
return error;
args.firstblock = &firstblock;
- args.flist = &flist;
+ args.dfops = &dfops;
/*
* we have no control over the attribute names that userspace passes us
@@ -584,13 +585,13 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* Commit that transaction so that the node_addname() call
* can manage its own transactions.
*/
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
error = xfs_attr3_leaf_to_node(args);
if (!error)
- error = xfs_bmap_finish(&args->trans, args->flist, dp);
+ error = xfs_defer_finish(&args->trans, args->dfops, dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
return error;
}
@@ -674,15 +675,15 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* If the result is small enough, shrink it all into the inode.
*/
if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (!error)
- error = xfs_bmap_finish(&args->trans,
- args->flist, dp);
+ error = xfs_defer_finish(&args->trans,
+ args->dfops, dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
return error;
}
}
@@ -737,14 +738,14 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
* If the result is small enough, shrink it all into the inode.
*/
if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (!error)
- error = xfs_bmap_finish(&args->trans, args->flist, dp);
+ error = xfs_defer_finish(&args->trans, args->dfops, dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
return error;
}
}
@@ -863,14 +864,14 @@ restart:
*/
xfs_da_state_free(state);
state = NULL;
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
error = xfs_attr3_leaf_to_node(args);
if (!error)
- error = xfs_bmap_finish(&args->trans,
- args->flist, dp);
+ error = xfs_defer_finish(&args->trans,
+ args->dfops, dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
goto out;
}
@@ -891,13 +892,13 @@ restart:
* in the index/blkno/rmtblkno/rmtblkcnt fields and
* in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
*/
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
error = xfs_da3_split(state);
if (!error)
- error = xfs_bmap_finish(&args->trans, args->flist, dp);
+ error = xfs_defer_finish(&args->trans, args->dfops, dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
goto out;
}
} else {
@@ -990,14 +991,14 @@ restart:
* Check to see if the tree needs to be collapsed.
*/
if (retval && (state->path.active > 1)) {
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
error = xfs_da3_join(state);
if (!error)
- error = xfs_bmap_finish(&args->trans,
- args->flist, dp);
+ error = xfs_defer_finish(&args->trans,
+ args->dfops, dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
goto out;
}
}
@@ -1113,13 +1114,13 @@ xfs_attr_node_removename(xfs_da_args_t *args)
* Check to see if the tree needs to be collapsed.
*/
if (retval && (state->path.active > 1)) {
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
error = xfs_da3_join(state);
if (!error)
- error = xfs_bmap_finish(&args->trans, args->flist, dp);
+ error = xfs_defer_finish(&args->trans, args->dfops, dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
goto out;
}
/*
@@ -1146,15 +1147,15 @@ xfs_attr_node_removename(xfs_da_args_t *args)
goto out;
if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (!error)
- error = xfs_bmap_finish(&args->trans,
- args->flist, dp);
+ error = xfs_defer_finish(&args->trans,
+ args->dfops, dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
goto out;
}
} else
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 01a5ecfedfcf..8ea91f363093 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -792,7 +792,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
nargs.dp = dp;
nargs.geo = args->geo;
nargs.firstblock = args->firstblock;
- nargs.flist = args->flist;
+ nargs.dfops = args->dfops;
nargs.total = args->total;
nargs.whichfork = XFS_ATTR_FORK;
nargs.trans = args->trans;
@@ -922,7 +922,7 @@ xfs_attr3_leaf_to_shortform(
nargs.geo = args->geo;
nargs.dp = dp;
nargs.firstblock = args->firstblock;
- nargs.flist = args->flist;
+ nargs.dfops = args->dfops;
nargs.total = args->total;
nargs.whichfork = XFS_ATTR_FORK;
nargs.trans = args->trans;
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index a572532a55cd..d52f525f5b2d 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -24,6 +24,7 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
@@ -460,16 +461,16 @@ xfs_attr_rmtval_set(
* extent and then crash then the block may not contain the
* correct metadata after log recovery occurs.
*/
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
nmap = 1;
error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock,
- args->total, &map, &nmap, args->flist);
+ args->total, &map, &nmap, args->dfops);
if (!error)
- error = xfs_bmap_finish(&args->trans, args->flist, dp);
+ error = xfs_defer_finish(&args->trans, args->dfops, dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
return error;
}
@@ -503,7 +504,7 @@ xfs_attr_rmtval_set(
ASSERT(blkcnt > 0);
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
nmap = 1;
error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
blkcnt, &map, &nmap,
@@ -603,16 +604,16 @@ xfs_attr_rmtval_remove(
blkcnt = args->rmtblkcnt;
done = 0;
while (!done) {
- xfs_bmap_init(args->flist, args->firstblock);
+ xfs_defer_init(args->dfops, args->firstblock);
error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
XFS_BMAPI_ATTRFORK, 1, args->firstblock,
- args->flist, &done);
+ args->dfops, &done);
if (!error)
- error = xfs_bmap_finish(&args->trans, args->flist,
+ error = xfs_defer_finish(&args->trans, args->dfops,
args->dp);
if (error) {
args->trans = NULL;
- xfs_bmap_cancel(args->flist);
+ xfs_defer_cancel(args->dfops);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 2f2c85cc8117..b060bca93402 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -24,6 +24,7 @@
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_dir2.h"
@@ -45,6 +46,7 @@
#include "xfs_symlink.h"
#include "xfs_attr_leaf.h"
#include "xfs_filestream.h"
+#include "xfs_rmap.h"
kmem_zone_t *xfs_bmap_free_item_zone;
@@ -570,12 +572,13 @@ xfs_bmap_validate_ret(
*/
void
xfs_bmap_add_free(
- struct xfs_mount *mp, /* mount point structure */
- struct xfs_bmap_free *flist, /* list of extents */
- xfs_fsblock_t bno, /* fs block number of extent */
- xfs_filblks_t len) /* length of extent */
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ struct xfs_owner_info *oinfo)
{
- struct xfs_bmap_free_item *new; /* new element */
+ struct xfs_extent_free_item *new; /* new element */
#ifdef DEBUG
xfs_agnumber_t agno;
xfs_agblock_t agbno;
@@ -592,44 +595,17 @@ xfs_bmap_add_free(
ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
#endif
ASSERT(xfs_bmap_free_item_zone != NULL);
- new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
- new->xbfi_startblock = bno;
- new->xbfi_blockcount = (xfs_extlen_t)len;
- list_add(&new->xbfi_list, &flist->xbf_flist);
- flist->xbf_count++;
-}
-
-/*
- * Remove the entry "free" from the free item list. Prev points to the
- * previous entry, unless "free" is the head of the list.
- */
-void
-xfs_bmap_del_free(
- struct xfs_bmap_free *flist, /* free item list header */
- struct xfs_bmap_free_item *free) /* list item to be freed */
-{
- list_del(&free->xbfi_list);
- flist->xbf_count--;
- kmem_zone_free(xfs_bmap_free_item_zone, free);
-}
-
-/*
- * Free up any items left in the list.
- */
-void
-xfs_bmap_cancel(
- struct xfs_bmap_free *flist) /* list of bmap_free_items */
-{
- struct xfs_bmap_free_item *free; /* free list item */
- if (flist->xbf_count == 0)
- return;
- while (!list_empty(&flist->xbf_flist)) {
- free = list_first_entry(&flist->xbf_flist,
- struct xfs_bmap_free_item, xbfi_list);
- xfs_bmap_del_free(flist, free);
- }
- ASSERT(flist->xbf_count == 0);
+ new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+ new->xefi_startblock = bno;
+ new->xefi_blockcount = (xfs_extlen_t)len;
+ if (oinfo)
+ new->xefi_oinfo = *oinfo;
+ else
+ xfs_rmap_skip_owner_update(&new->xefi_oinfo);
+ trace_xfs_bmap_free_defer(mp, XFS_FSB_TO_AGNO(mp, bno), 0,
+ XFS_FSB_TO_AGBNO(mp, bno), len);
+ xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
}
/*
@@ -659,6 +635,7 @@ xfs_bmap_btree_to_extents(
xfs_mount_t *mp; /* mount point structure */
__be64 *pp; /* ptr to block address */
struct xfs_btree_block *rblock;/* root btree block */
+ struct xfs_owner_info oinfo;
mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -682,7 +659,8 @@ xfs_bmap_btree_to_extents(
cblock = XFS_BUF_TO_BLOCK(cbp);
if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
return error;
- xfs_bmap_add_free(mp, cur->bc_private.b.flist, cbno, 1);
+ xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
+ xfs_bmap_add_free(mp, cur->bc_private.b.dfops, cbno, 1, &oinfo);
ip->i_d.di_nblocks--;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
xfs_trans_binval(tp, cbp);
@@ -705,7 +683,7 @@ xfs_bmap_extents_to_btree(
xfs_trans_t *tp, /* transaction pointer */
xfs_inode_t *ip, /* incore inode pointer */
xfs_fsblock_t *firstblock, /* first-block-allocated */
- xfs_bmap_free_t *flist, /* blocks freed in xaction */
+ struct xfs_defer_ops *dfops, /* blocks freed in xaction */
xfs_btree_cur_t **curp, /* cursor returned to caller */
int wasdel, /* converting a delayed alloc */
int *logflagsp, /* inode logging flags */
@@ -754,7 +732,7 @@ xfs_bmap_extents_to_btree(
*/
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
cur->bc_private.b.firstblock = *firstblock;
- cur->bc_private.b.flist = flist;
+ cur->bc_private.b.dfops = dfops;
cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
/*
* Convert to a btree with two levels, one record in root.
@@ -763,11 +741,12 @@ xfs_bmap_extents_to_btree(
memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = mp;
+ xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, whichfork);
args.firstblock = *firstblock;
if (*firstblock == NULLFSBLOCK) {
args.type = XFS_ALLOCTYPE_START_BNO;
args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
- } else if (flist->xbf_low) {
+ } else if (dfops->dop_low) {
args.type = XFS_ALLOCTYPE_START_BNO;
args.fsbno = *firstblock;
} else {
@@ -788,7 +767,7 @@ xfs_bmap_extents_to_btree(
ASSERT(args.fsbno != NULLFSBLOCK);
ASSERT(*firstblock == NULLFSBLOCK ||
args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
- (flist->xbf_low &&
+ (dfops->dop_low &&
args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
*firstblock = cur->bc_private.b.firstblock = args.fsbno;
cur->bc_private.b.allocated++;
@@ -909,6 +888,7 @@ xfs_bmap_local_to_extents(
memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = ip->i_mount;
+ xfs_rmap_ino_owner(&args.oinfo, ip->i_ino, whichfork, 0);
args.firstblock = *firstblock;
/*
* Allocate a block. We know we need only one, since the
@@ -973,7 +953,7 @@ xfs_bmap_add_attrfork_btree(
xfs_trans_t *tp, /* transaction pointer */
xfs_inode_t *ip, /* incore inode pointer */
xfs_fsblock_t *firstblock, /* first block allocated */
- xfs_bmap_free_t *flist, /* blocks to free at commit */
+ struct xfs_defer_ops *dfops, /* blocks to free at commit */
int *flags) /* inode logging flags */
{
xfs_btree_cur_t *cur; /* btree cursor */
@@ -986,7 +966,7 @@ xfs_bmap_add_attrfork_btree(
*flags |= XFS_ILOG_DBROOT;
else {
cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
- cur->bc_private.b.flist = flist;
+ cur->bc_private.b.dfops = dfops;
cur->bc_private.b.firstblock = *firstblock;
if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
goto error0;
@@ -1016,7 +996,7 @@ xfs_bmap_add_attrfork_extents(
xfs_trans_t *tp, /* transaction pointer */
xfs_inode_t *ip, /* incore inode pointer */
xfs_fsblock_t *firstblock, /* first block allocated */
- xfs_bmap_free_t *flist, /* blocks to free at commit */
+ struct xfs_defer_ops *dfops, /* blocks to free at commit */
int *flags) /* inode logging flags */
{
xfs_btree_cur_t *cur; /* bmap btree cursor */
@@ -1025,7 +1005,7 @@ xfs_bmap_add_attrfork_extents(
if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip))
return 0;
cur = NULL;
- error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0,
+ error = xfs_bmap_extents_to_btree(tp, ip, firstblock, dfops, &cur, 0,
flags, XFS_DATA_FORK);
if (cur) {
cur->bc_private.b.allocated = 0;
@@ -1051,7 +1031,7 @@ xfs_bmap_add_attrfork_local(
xfs_trans_t *tp, /* transaction pointer */
xfs_inode_t *ip, /* incore inode pointer */
xfs_fsblock_t *firstblock, /* first block allocated */
- xfs_bmap_free_t *flist, /* blocks to free at commit */
+ struct xfs_defer_ops *dfops, /* blocks to free at commit */
int *flags) /* inode logging flags */
{
xfs_da_args_t dargs; /* args for dir/attr code */
@@ -1064,7 +1044,7 @@ xfs_bmap_add_attrfork_local(
dargs.geo = ip->i_mount->m_dir_geo;
dargs.dp = ip;
dargs.firstblock = firstblock;
- dargs.flist = flist;
+ dargs.dfops = dfops;
dargs.total = dargs.geo->fsbcount;
dargs.whichfork = XFS_DATA_FORK;
dargs.trans = tp;
@@ -1092,7 +1072,7 @@ xfs_bmap_add_attrfork(
int rsvd) /* xact may use reserved blks */
{
xfs_fsblock_t firstblock; /* 1st block/ag allocated */
- xfs_bmap_free_t flist; /* freed extent records */
+ struct xfs_defer_ops dfops; /* freed extent records */
xfs_mount_t *mp; /* mount structure */
xfs_trans_t *tp; /* transaction pointer */
int blks; /* space reservation */
@@ -1158,18 +1138,18 @@ xfs_bmap_add_attrfork(
ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
ip->i_afp->if_flags = XFS_IFEXTENTS;
logflags = 0;
- xfs_bmap_init(&flist, &firstblock);
+ xfs_defer_init(&dfops, &firstblock);
switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_LOCAL:
- error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
+ error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &dfops,
&logflags);
break;
case XFS_DINODE_FMT_EXTENTS:
error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
- &flist, &logflags);
+ &dfops, &logflags);
break;
case XFS_DINODE_FMT_BTREE:
- error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
+ error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &dfops,
&logflags);
break;
default:
@@ -1198,7 +1178,7 @@ xfs_bmap_add_attrfork(
xfs_log_sb(tp);
}
- error = xfs_bmap_finish(&tp, &flist, NULL);
+ error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)
goto bmap_cancel;
error = xfs_trans_commit(tp);
@@ -1206,7 +1186,7 @@ xfs_bmap_add_attrfork(
return error;
bmap_cancel:
- xfs_bmap_cancel(&flist);
+ xfs_defer_cancel(&dfops);
trans_cancel:
xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -2003,7 +1983,7 @@ xfs_bmap_add_extent_delay_real(
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
- bma->firstblock, bma->flist,
+ bma->firstblock, bma->dfops,
&bma->cur, 1, &tmp_rval, whichfork);
rval |= tmp_rval;
if (error)
@@ -2087,7 +2067,7 @@ xfs_bmap_add_extent_delay_real(
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
- bma->firstblock, bma->flist, &bma->cur, 1,
+ bma->firstblock, bma->dfops, &bma->cur, 1,
&tmp_rval, whichfork);
rval |= tmp_rval;
if (error)
@@ -2156,7 +2136,7 @@ xfs_bmap_add_extent_delay_real(
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
- bma->firstblock, bma->flist, &bma->cur,
+ bma->firstblock, bma->dfops, &bma->cur,
1, &tmp_rval, whichfork);
rval |= tmp_rval;
if (error)
@@ -2199,13 +2179,18 @@ xfs_bmap_add_extent_delay_real(
ASSERT(0);
}
+ /* add reverse mapping */
+ error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new);
+ if (error)
+ goto done;
+
/* convert to a btree if necessary */
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(bma->cur == NULL);
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
- bma->firstblock, bma->flist, &bma->cur,
+ bma->firstblock, bma->dfops, &bma->cur,
da_old > 0, &tmp_logflags, whichfork);
bma->logflags |= tmp_logflags;
if (error)
@@ -2247,7 +2232,7 @@ xfs_bmap_add_extent_unwritten_real(
xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
xfs_fsblock_t *first, /* pointer to firstblock variable */
- xfs_bmap_free_t *flist, /* list of extents to be freed */
+ struct xfs_defer_ops *dfops, /* list of extents to be freed */
int *logflagsp) /* inode logging flags */
{
xfs_btree_cur_t *cur; /* btree cursor */
@@ -2735,12 +2720,17 @@ xfs_bmap_add_extent_unwritten_real(
ASSERT(0);
}
+ /* update reverse mappings */
+ error = xfs_rmap_convert_extent(mp, dfops, ip, XFS_DATA_FORK, new);
+ if (error)
+ goto done;
+
/* convert to a btree if necessary */
if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(cur == NULL);
- error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur,
+ error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, &cur,
0, &tmp_logflags, XFS_DATA_FORK);
*logflagsp |= tmp_logflags;
if (error)
@@ -3127,13 +3117,18 @@ xfs_bmap_add_extent_hole_real(
break;
}
+ /* add reverse mapping */
+ error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new);
+ if (error)
+ goto done;
+
/* convert to a btree if necessary */
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(bma->cur == NULL);
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
- bma->firstblock, bma->flist, &bma->cur,
+ bma->firstblock, bma->dfops, &bma->cur,
0, &tmp_logflags, whichfork);
bma->logflags |= tmp_logflags;
if (error)
@@ -3691,9 +3686,10 @@ xfs_bmap_btalloc(
args.tp = ap->tp;
args.mp = mp;
args.fsbno = ap->blkno;
+ xfs_rmap_skip_owner_update(&args.oinfo);
/* Trim the allocation back to the maximum an AG can fit. */
- args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp));
+ args.maxlen = MIN(ap->length, mp->m_ag_max_usable);
args.firstblock = *ap->firstblock;
blen = 0;
if (nullfb) {
@@ -3708,7 +3704,7 @@ xfs_bmap_btalloc(
error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
if (error)
return error;
- } else if (ap->flist->xbf_low) {
+ } else if (ap->dfops->dop_low) {
if (xfs_inode_is_filestream(ap->ip))
args.type = XFS_ALLOCTYPE_FIRST_AG;
else
@@ -3741,7 +3737,7 @@ xfs_bmap_btalloc(
* is >= the stripe unit and the allocation offset is
* at the end of file.
*/
- if (!ap->flist->xbf_low && ap->aeof) {
+ if (!ap->dfops->dop_low && ap->aeof) {
if (!ap->offset) {
args.alignment = stripe_align;
atype = args.type;
@@ -3834,7 +3830,7 @@ xfs_bmap_btalloc(
args.minleft = 0;
if ((error = xfs_alloc_vextent(&args)))
return error;
- ap->flist->xbf_low = 1;
+ ap->dfops->dop_low = true;
}
if (args.fsbno != NULLFSBLOCK) {
/*
@@ -3844,7 +3840,7 @@ xfs_bmap_btalloc(
ASSERT(*ap->firstblock == NULLFSBLOCK ||
XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
XFS_FSB_TO_AGNO(mp, args.fsbno) ||
- (ap->flist->xbf_low &&
+ (ap->dfops->dop_low &&
XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
XFS_FSB_TO_AGNO(mp, args.fsbno)));
@@ -3852,7 +3848,7 @@ xfs_bmap_btalloc(
if (*ap->firstblock == NULLFSBLOCK)
*ap->firstblock = args.fsbno;
ASSERT(nullfb || fb_agno == args.agno ||
- (ap->flist->xbf_low && fb_agno < args.agno));
+ (ap->dfops->dop_low && fb_agno < args.agno));
ap->length = args.len;
ap->ip->i_d.di_nblocks += args.len;
xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
@@ -4319,7 +4315,7 @@ xfs_bmapi_allocate(
if (error)
return error;
- if (bma->flist->xbf_low)
+ if (bma->dfops->dop_low)
bma->minleft = 0;
if (bma->cur)
bma->cur->bc_private.b.firstblock = *bma->firstblock;
@@ -4328,7 +4324,7 @@ xfs_bmapi_allocate(
if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
bma->cur->bc_private.b.firstblock = *bma->firstblock;
- bma->cur->bc_private.b.flist = bma->flist;
+ bma->cur->bc_private.b.dfops = bma->dfops;
}
/*
* Bump the number of extents we've allocated
@@ -4409,7 +4405,7 @@ xfs_bmapi_convert_unwritten(
bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp,
bma->ip, whichfork);
bma->cur->bc_private.b.firstblock = *bma->firstblock;
- bma->cur->bc_private.b.flist = bma->flist;
+ bma->cur->bc_private.b.dfops = bma->dfops;
}
mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
@@ -4426,7 +4422,7 @@ xfs_bmapi_convert_unwritten(
}
error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
- &bma->cur, mval, bma->firstblock, bma->flist,
+ &bma->cur, mval, bma->firstblock, bma->dfops,
&tmp_logflags);
/*
* Log the inode core unconditionally in the unwritten extent conversion
@@ -4480,7 +4476,7 @@ xfs_bmapi_write(
xfs_extlen_t total, /* total blocks needed */
struct xfs_bmbt_irec *mval, /* output: map values */
int *nmap, /* i/o: mval size/count */
- struct xfs_bmap_free *flist) /* i/o: list extents to free */
+ struct xfs_defer_ops *dfops) /* i/o: list extents to free */
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
@@ -4570,7 +4566,7 @@ xfs_bmapi_write(
bma.ip = ip;
bma.total = total;
bma.userdata = 0;
- bma.flist = flist;
+ bma.dfops = dfops;
bma.firstblock = firstblock;
while (bno < end && n < *nmap) {
@@ -4684,7 +4680,7 @@ error0:
XFS_FSB_TO_AGNO(mp, *firstblock) ==
XFS_FSB_TO_AGNO(mp,
bma.cur->bc_private.b.firstblock) ||
- (flist->xbf_low &&
+ (dfops->dop_low &&
XFS_FSB_TO_AGNO(mp, *firstblock) <
XFS_FSB_TO_AGNO(mp,
bma.cur->bc_private.b.firstblock)));
@@ -4768,7 +4764,7 @@ xfs_bmap_del_extent(
xfs_inode_t *ip, /* incore inode pointer */
xfs_trans_t *tp, /* current transaction pointer */
xfs_extnum_t *idx, /* extent number to update/delete */
- xfs_bmap_free_t *flist, /* list of extents to be freed */
+ struct xfs_defer_ops *dfops, /* list of extents to be freed */
xfs_btree_cur_t *cur, /* if null, not a btree */
xfs_bmbt_irec_t *del, /* data to remove from extents */
int *logflagsp, /* inode logging flags */
@@ -4870,6 +4866,7 @@ xfs_bmap_del_extent(
nblks = 0;
do_fx = 0;
}
+
/*
* Set flag value to use in switch statement.
* Left-contig is 2, right-contig is 1.
@@ -5052,12 +5049,20 @@ xfs_bmap_del_extent(
++*idx;
break;
}
+
+ /* remove reverse mapping */
+ if (!delay) {
+ error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, del);
+ if (error)
+ goto done;
+ }
+
/*
* If we need to, add to list of extents to delete.
*/
if (do_fx)
- xfs_bmap_add_free(mp, flist, del->br_startblock,
- del->br_blockcount);
+ xfs_bmap_add_free(mp, dfops, del->br_startblock,
+ del->br_blockcount, NULL);
/*
* Adjust inode # blocks in the file.
*/
@@ -5097,7 +5102,7 @@ xfs_bunmapi(
xfs_extnum_t nexts, /* number of extents max */
xfs_fsblock_t *firstblock, /* first allocated block
controls a.g. for allocs */
- xfs_bmap_free_t *flist, /* i/o: list extents to free */
+ struct xfs_defer_ops *dfops, /* i/o: list extents to free */
int *done) /* set if not done yet */
{
xfs_btree_cur_t *cur; /* bmap btree cursor */
@@ -5170,7 +5175,7 @@ xfs_bunmapi(
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
cur->bc_private.b.firstblock = *firstblock;
- cur->bc_private.b.flist = flist;
+ cur->bc_private.b.dfops = dfops;
cur->bc_private.b.flags = 0;
} else
cur = NULL;
@@ -5179,8 +5184,10 @@ xfs_bunmapi(
/*
* Synchronize by locking the bitmap inode.
*/
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
+ xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
}
extno = 0;
@@ -5262,7 +5269,7 @@ xfs_bunmapi(
}
del.br_state = XFS_EXT_UNWRITTEN;
error = xfs_bmap_add_extent_unwritten_real(tp, ip,
- &lastx, &cur, &del, firstblock, flist,
+ &lastx, &cur, &del, firstblock, dfops,
&logflags);
if (error)
goto error0;
@@ -5321,7 +5328,7 @@ xfs_bunmapi(
lastx--;
error = xfs_bmap_add_extent_unwritten_real(tp,
ip, &lastx, &cur, &prev,
- firstblock, flist, &logflags);
+ firstblock, dfops, &logflags);
if (error)
goto error0;
goto nodelete;
@@ -5330,7 +5337,7 @@ xfs_bunmapi(
del.br_state = XFS_EXT_UNWRITTEN;
error = xfs_bmap_add_extent_unwritten_real(tp,
ip, &lastx, &cur, &del,
- firstblock, flist, &logflags);
+ firstblock, dfops, &logflags);
if (error)
goto error0;
goto nodelete;
@@ -5388,7 +5395,7 @@ xfs_bunmapi(
} else if (cur)
cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
- error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
+ error = xfs_bmap_del_extent(ip, tp, &lastx, dfops, cur, &del,
&tmp_logflags, whichfork);
logflags |= tmp_logflags;
if (error)
@@ -5422,7 +5429,7 @@ nodelete:
*/
if (xfs_bmap_needs_btree(ip, whichfork)) {
ASSERT(cur == NULL);
- error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
+ error = xfs_bmap_extents_to_btree(tp, ip, firstblock, dfops,
&cur, 0, &tmp_logflags, whichfork);
logflags |= tmp_logflags;
if (error)
@@ -5589,7 +5596,8 @@ xfs_bmse_shift_one(
struct xfs_bmbt_rec_host *gotp,
struct xfs_btree_cur *cur,
int *logflags,
- enum shift_direction direction)
+ enum shift_direction direction,
+ struct xfs_defer_ops *dfops)
{
struct xfs_ifork *ifp;
struct xfs_mount *mp;
@@ -5637,9 +5645,13 @@ xfs_bmse_shift_one(
/* check whether to merge the extent or shift it down */
if (xfs_bmse_can_merge(&adj_irec, &got,
offset_shift_fsb)) {
- return xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
- *current_ext, gotp, adj_irecp,
- cur, logflags);
+ error = xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
+ *current_ext, gotp, adj_irecp,
+ cur, logflags);
+ if (error)
+ return error;
+ adj_irec = got;
+ goto update_rmap;
}
} else {
startoff = got.br_startoff + offset_shift_fsb;
@@ -5676,9 +5688,10 @@ update_current_ext:
(*current_ext)--;
xfs_bmbt_set_startoff(gotp, startoff);
*logflags |= XFS_ILOG_CORE;
+ adj_irec = got;
if (!cur) {
*logflags |= XFS_ILOG_DEXT;
- return 0;
+ goto update_rmap;
}
error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
@@ -5688,8 +5701,18 @@ update_current_ext:
XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
got.br_startoff = startoff;
- return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
- got.br_blockcount, got.br_state);
+ error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
+ got.br_blockcount, got.br_state);
+ if (error)
+ return error;
+
+update_rmap:
+ /* update reverse mapping */
+ error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, &adj_irec);
+ if (error)
+ return error;
+ adj_irec.br_startoff = startoff;
+ return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &adj_irec);
}
/*
@@ -5711,7 +5734,7 @@ xfs_bmap_shift_extents(
int *done,
xfs_fileoff_t stop_fsb,
xfs_fsblock_t *firstblock,
- struct xfs_bmap_free *flist,
+ struct xfs_defer_ops *dfops,
enum shift_direction direction,
int num_exts)
{
@@ -5756,7 +5779,7 @@ xfs_bmap_shift_extents(
if (ifp->if_flags & XFS_IFBROOT) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
cur->bc_private.b.firstblock = *firstblock;
- cur->bc_private.b.flist = flist;
+ cur->bc_private.b.dfops = dfops;
cur->bc_private.b.flags = 0;
}
@@ -5817,7 +5840,7 @@ xfs_bmap_shift_extents(
while (nexts++ < num_exts) {
error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb,
&current_ext, gotp, cur, &logflags,
- direction);
+ direction, dfops);
if (error)
goto del_cursor;
/*
@@ -5865,7 +5888,7 @@ xfs_bmap_split_extent_at(
struct xfs_inode *ip,
xfs_fileoff_t split_fsb,
xfs_fsblock_t *firstfsb,
- struct xfs_bmap_free *free_list)
+ struct xfs_defer_ops *dfops)
{
int whichfork = XFS_DATA_FORK;
struct xfs_btree_cur *cur = NULL;
@@ -5927,7 +5950,7 @@ xfs_bmap_split_extent_at(
if (ifp->if_flags & XFS_IFBROOT) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
cur->bc_private.b.firstblock = *firstfsb;
- cur->bc_private.b.flist = free_list;
+ cur->bc_private.b.dfops = dfops;
cur->bc_private.b.flags = 0;
error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
got.br_startblock,
@@ -5980,7 +6003,7 @@ xfs_bmap_split_extent_at(
int tmp_logflags; /* partial log flag return val */
ASSERT(cur == NULL);
- error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, free_list,
+ error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, dfops,
&cur, 0, &tmp_logflags, whichfork);
logflags |= tmp_logflags;
}
@@ -6004,7 +6027,7 @@ xfs_bmap_split_extent(
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
- struct xfs_bmap_free free_list;
+ struct xfs_defer_ops dfops;
xfs_fsblock_t firstfsb;
int error;
@@ -6016,21 +6039,21 @@ xfs_bmap_split_extent(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_bmap_init(&free_list, &firstfsb);
+ xfs_defer_init(&dfops, &firstfsb);
error = xfs_bmap_split_extent_at(tp, ip, split_fsb,
- &firstfsb, &free_list);
+ &firstfsb, &dfops);
if (error)
goto out;
- error = xfs_bmap_finish(&tp, &free_list, NULL);
+ error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)
goto out;
return xfs_trans_commit(tp);
out:
- xfs_bmap_cancel(&free_list);
+ xfs_defer_cancel(&dfops);
xfs_trans_cancel(tp);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index f1f3ae6c0a3f..254034f96941 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -32,7 +32,7 @@ extern kmem_zone_t *xfs_bmap_free_item_zone;
*/
struct xfs_bmalloca {
xfs_fsblock_t *firstblock; /* i/o first block allocated */
- struct xfs_bmap_free *flist; /* bmap freelist */
+ struct xfs_defer_ops *dfops; /* bmap freelist */
struct xfs_trans *tp; /* transaction pointer */
struct xfs_inode *ip; /* incore inode pointer */
struct xfs_bmbt_irec prev; /* extent before the new one */
@@ -62,34 +62,14 @@ struct xfs_bmalloca {
* List of extents to be free "later".
* The list is kept sorted on xbf_startblock.
*/
-struct xfs_bmap_free_item
+struct xfs_extent_free_item
{
- xfs_fsblock_t xbfi_startblock;/* starting fs block number */
- xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */
- struct list_head xbfi_list;
+ xfs_fsblock_t xefi_startblock;/* starting fs block number */
+ xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
+ struct list_head xefi_list;
+ struct xfs_owner_info xefi_oinfo; /* extent owner */
};
-/*
- * Header for free extent list.
- *
- * xbf_low is used by the allocator to activate the lowspace algorithm -
- * when free space is running low the extent allocator may choose to
- * allocate an extent from an AG without leaving sufficient space for
- * a btree split when inserting the new extent. In this case the allocator
- * will enable the lowspace algorithm which is supposed to allow further
- * allocations (such as btree splits and newroots) to allocate from
- * sequential AGs. In order to avoid locking AGs out of order the lowspace
- * algorithm will start searching for free space from AG 0. If the correct
- * transaction reservations have been made then this algorithm will eventually
- * find all the space it needs.
- */
-typedef struct xfs_bmap_free
-{
- struct list_head xbf_flist; /* list of to-be-free extents */
- int xbf_count; /* count of items on list */
- int xbf_low; /* alloc in low mode */
-} xfs_bmap_free_t;
-
#define XFS_BMAP_MAX_NMAP 4
/*
@@ -139,14 +119,6 @@ static inline int xfs_bmapi_aflag(int w)
#define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL)
#define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL)
-static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
-{
- INIT_LIST_HEAD(&flp->xbf_flist);
- flp->xbf_count = 0;
- flp->xbf_low = 0;
- *fbp = NULLFSBLOCK;
-}
-
/*
* Flags for xfs_bmap_add_extent*.
*/
@@ -193,11 +165,9 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
-void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_bmap_free *flist,
- xfs_fsblock_t bno, xfs_filblks_t len);
-void xfs_bmap_cancel(struct xfs_bmap_free *flist);
-int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
- struct xfs_inode *ip);
+void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+ xfs_fsblock_t bno, xfs_filblks_t len,
+ struct xfs_owner_info *oinfo);
void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
@@ -218,18 +188,18 @@ int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, int flags,
xfs_fsblock_t *firstblock, xfs_extlen_t total,
struct xfs_bmbt_irec *mval, int *nmap,
- struct xfs_bmap_free *flist);
+ struct xfs_defer_ops *dfops);
int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, int flags,
xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
- struct xfs_bmap_free *flist, int *done);
+ struct xfs_defer_ops *dfops, int *done);
int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
xfs_extnum_t num);
uint xfs_default_attroffset(struct xfs_inode *ip);
int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock,
- struct xfs_bmap_free *flist, enum shift_direction direction,
+ struct xfs_defer_ops *dfops, enum shift_direction direction,
int num_exts);
int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index db0c71e470c9..cd85274e810c 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -23,6 +23,7 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
@@ -34,6 +35,7 @@
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
+#include "xfs_rmap.h"
/*
* Determine the extent state.
@@ -406,11 +408,11 @@ xfs_bmbt_dup_cursor(
cur->bc_private.b.ip, cur->bc_private.b.whichfork);
/*
- * Copy the firstblock, flist, and flags values,
+ * Copy the firstblock, dfops, and flags values,
* since init cursor doesn't get them.
*/
new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
- new->bc_private.b.flist = cur->bc_private.b.flist;
+ new->bc_private.b.dfops = cur->bc_private.b.dfops;
new->bc_private.b.flags = cur->bc_private.b.flags;
return new;
@@ -423,7 +425,7 @@ xfs_bmbt_update_cursor(
{
ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
(dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
- ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);
+ ASSERT(dst->bc_private.b.dfops == src->bc_private.b.dfops);
dst->bc_private.b.allocated += src->bc_private.b.allocated;
dst->bc_private.b.firstblock = src->bc_private.b.firstblock;
@@ -446,6 +448,8 @@ xfs_bmbt_alloc_block(
args.mp = cur->bc_mp;
args.fsbno = cur->bc_private.b.firstblock;
args.firstblock = args.fsbno;
+ xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_private.b.ip->i_ino,
+ cur->bc_private.b.whichfork);
if (args.fsbno == NULLFSBLOCK) {
args.fsbno = be64_to_cpu(start->l);
@@ -462,7 +466,7 @@ xfs_bmbt_alloc_block(
* block allocation here and corrupt the filesystem.
*/
args.minleft = args.tp->t_blk_res;
- } else if (cur->bc_private.b.flist->xbf_low) {
+ } else if (cur->bc_private.b.dfops->dop_low) {
args.type = XFS_ALLOCTYPE_START_BNO;
} else {
args.type = XFS_ALLOCTYPE_NEAR_BNO;
@@ -490,7 +494,7 @@ xfs_bmbt_alloc_block(
error = xfs_alloc_vextent(&args);
if (error)
goto error0;
- cur->bc_private.b.flist->xbf_low = 1;
+ cur->bc_private.b.dfops->dop_low = true;
}
if (args.fsbno == NULLFSBLOCK) {
XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
@@ -525,8 +529,10 @@ xfs_bmbt_free_block(
struct xfs_inode *ip = cur->bc_private.b.ip;
struct xfs_trans *tp = cur->bc_tp;
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
+ struct xfs_owner_info oinfo;
- xfs_bmap_add_free(mp, cur->bc_private.b.flist, fsbno, 1);
+ xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_private.b.whichfork);
+ xfs_bmap_add_free(mp, cur->bc_private.b.dfops, fsbno, 1, &oinfo);
ip->i_d.di_nblocks--;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -600,17 +606,6 @@ xfs_bmbt_init_key_from_rec(
}
STATIC void
-xfs_bmbt_init_rec_from_key(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
-{
- ASSERT(key->bmbt.br_startoff != 0);
-
- xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff),
- 0, 0, XFS_EXT_NORM);
-}
-
-STATIC void
xfs_bmbt_init_rec_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_rec *rec)
@@ -760,7 +755,6 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
.get_minrecs = xfs_bmbt_get_minrecs,
.get_dmaxrecs = xfs_bmbt_get_dmaxrecs,
.init_key_from_rec = xfs_bmbt_init_key_from_rec,
- .init_rec_from_key = xfs_bmbt_init_rec_from_key,
.init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
.key_diff = xfs_bmbt_key_diff,
@@ -800,7 +794,7 @@ xfs_bmbt_init_cursor(
cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
cur->bc_private.b.ip = ip;
cur->bc_private.b.firstblock = NULLFSBLOCK;
- cur->bc_private.b.flist = NULL;
+ cur->bc_private.b.dfops = NULL;
cur->bc_private.b.allocated = 0;
cur->bc_private.b.flags = 0;
cur->bc_private.b.whichfork = whichfork;
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 07eeb0b4ca74..b5c213a051cd 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -23,6 +23,7 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
@@ -43,15 +44,14 @@ kmem_zone_t *xfs_btree_cur_zone;
* Btree magic numbers.
*/
static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
- { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
+ { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
XFS_FIBT_MAGIC },
- { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
+ { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
};
#define xfs_btree_magic(cur) \
xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
-
STATIC int /* error (0 or EFSCORRUPTED) */
xfs_btree_check_lblock(
struct xfs_btree_cur *cur, /* btree cursor */
@@ -428,6 +428,50 @@ xfs_btree_dup_cursor(
* into a btree block (xfs_btree_*_offset) or return a pointer to the given
* record, key or pointer (xfs_btree_*_addr). Note that all addressing
* inside the btree block is done using indices starting at one, not zero!
+ *
+ * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
+ * overlapping intervals. In such a tree, records are still sorted lowest to
+ * highest and indexed by the smallest key value that refers to the record.
+ * However, nodes are different: each pointer has two associated keys -- one
+ * indexing the lowest key available in the block(s) below (the same behavior
+ * as the key in a regular btree) and another indexing the highest key
+ * available in the block(s) below. Because records are /not/ sorted by the
+ * highest key, all leaf block updates require us to compute the highest key
+ * that matches any record in the leaf and to recursively update the high keys
+ * in the nodes going further up in the tree, if necessary. Nodes look like
+ * this:
+ *
+ * +--------+-----+-----+-----+-----+-----+-------+-------+-----+
+ * Non-Leaf: | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
+ * +--------+-----+-----+-----+-----+-----+-------+-------+-----+
+ *
+ * To perform an interval query on an overlapped tree, perform the usual
+ * depth-first search and use the low and high keys to decide if we can skip
+ * that particular node. If a leaf node is reached, return the records that
+ * intersect the interval. Note that an interval query may return numerous
+ * entries. For a non-overlapped tree, simply search for the record associated
+ * with the lowest key and iterate forward until a non-matching record is
+ * found. Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
+ * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in
+ * more detail.
+ *
+ * Why do we care about overlapping intervals? Let's say you have a bunch of
+ * reverse mapping records on a reflink filesystem:
+ *
+ * 1: +- file A startblock B offset C length D -----------+
+ * 2: +- file E startblock F offset G length H --------------+
+ * 3: +- file I startblock F offset J length K --+
+ * 4: +- file L... --+
+ *
+ * Now say we want to map block (B+D) into file A at offset (C+D). Ideally,
+ * we'd simply increment the length of record 1. But how do we find the record
+ * that ends at (B+D-1) (i.e. record 1)? A LE lookup of (B+D-1) would return
+ * record 3 because the keys are ordered first by startblock. An interval
+ * query would return records 1 and 2 because they both overlap (B+D-1), and
+ * from that we can pick out record 1 as the appropriate left neighbor.
+ *
+ * In the non-overlapped case you can do a LE lookup and decrement the cursor
+ * because a record's interval must end before the next record.
*/
/*
@@ -479,6 +523,18 @@ xfs_btree_key_offset(
}
/*
+ * Calculate offset of the n-th high key in a btree block.
+ */
+STATIC size_t
+xfs_btree_high_key_offset(
+ struct xfs_btree_cur *cur,
+ int n)
+{
+ return xfs_btree_block_len(cur) +
+ (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
+}
+
+/*
* Calculate offset of the n-th block pointer in a btree block.
*/
STATIC size_t
@@ -519,6 +575,19 @@ xfs_btree_key_addr(
}
/*
+ * Return a pointer to the n-th high key in the btree block.
+ */
+STATIC union xfs_btree_key *
+xfs_btree_high_key_addr(
+ struct xfs_btree_cur *cur,
+ int n,
+ struct xfs_btree_block *block)
+{
+ return (union xfs_btree_key *)
+ ((char *)block + xfs_btree_high_key_offset(cur, n));
+}
+
+/*
* Return a pointer to the n-th block pointer in the btree block.
*/
STATIC union xfs_btree_ptr *
@@ -1144,6 +1213,9 @@ xfs_btree_set_refs(
case XFS_BTNUM_BMAP:
xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
break;
+ case XFS_BTNUM_RMAP:
+ xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
+ break;
default:
ASSERT(0);
}
@@ -1879,32 +1951,214 @@ error0:
return error;
}
+/* Find the high key storage area from a regular key. */
+STATIC union xfs_btree_key *
+xfs_btree_high_key_from_key(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
+{
+ ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
+ return (union xfs_btree_key *)((char *)key +
+ (cur->bc_ops->key_len / 2));
+}
+
+/* Determine the low (and high if overlapped) keys of a leaf block */
+STATIC void
+xfs_btree_get_leaf_keys(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_key *key)
+{
+ union xfs_btree_key max_hkey;
+ union xfs_btree_key hkey;
+ union xfs_btree_rec *rec;
+ union xfs_btree_key *high;
+ int n;
+
+ rec = xfs_btree_rec_addr(cur, 1, block);
+ cur->bc_ops->init_key_from_rec(key, rec);
+
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+
+ cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
+ for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
+ rec = xfs_btree_rec_addr(cur, n, block);
+ cur->bc_ops->init_high_key_from_rec(&hkey, rec);
+ if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
+ > 0)
+ max_hkey = hkey;
+ }
+
+ high = xfs_btree_high_key_from_key(cur, key);
+ memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
+ }
+}
+
+/* Determine the low (and high if overlapped) keys of a node block */
+STATIC void
+xfs_btree_get_node_keys(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_key *key)
+{
+ union xfs_btree_key *hkey;
+ union xfs_btree_key *max_hkey;
+ union xfs_btree_key *high;
+ int n;
+
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ memcpy(key, xfs_btree_key_addr(cur, 1, block),
+ cur->bc_ops->key_len / 2);
+
+ max_hkey = xfs_btree_high_key_addr(cur, 1, block);
+ for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
+ hkey = xfs_btree_high_key_addr(cur, n, block);
+ if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
+ max_hkey = hkey;
+ }
+
+ high = xfs_btree_high_key_from_key(cur, key);
+ memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
+ } else {
+ memcpy(key, xfs_btree_key_addr(cur, 1, block),
+ cur->bc_ops->key_len);
+ }
+}
+
+/* Derive the keys for any btree block. */
+STATIC void
+xfs_btree_get_keys(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_key *key)
+{
+ if (be16_to_cpu(block->bb_level) == 0)
+ xfs_btree_get_leaf_keys(cur, block, key);
+ else
+ xfs_btree_get_node_keys(cur, block, key);
+}
+
/*
- * Update keys at all levels from here to the root along the cursor's path.
+ * Decide if we need to update the parent keys of a btree block. For
+ * a standard btree this is only necessary if we're updating the first
+ * record/key. For an overlapping btree, we must always update the
+ * keys because the highest key can be in any of the records or keys
+ * in the block.
+ */
+static inline bool
+xfs_btree_needs_key_update(
+ struct xfs_btree_cur *cur,
+ int ptr)
+{
+ return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
+}
+
+/*
+ * Update the low and high parent keys of the given level, progressing
+ * towards the root. If force_all is false, stop if the keys for a given
+ * level do not need updating.
*/
STATIC int
-xfs_btree_updkey(
+__xfs_btree_updkeys(
+ struct xfs_btree_cur *cur,
+ int level,
+ struct xfs_btree_block *block,
+ struct xfs_buf *bp0,
+ bool force_all)
+{
+ union xfs_btree_bigkey key; /* keys from current level */
+ union xfs_btree_key *lkey; /* keys from the next level up */
+ union xfs_btree_key *hkey;
+ union xfs_btree_key *nlkey; /* keys from the next level up */
+ union xfs_btree_key *nhkey;
+ struct xfs_buf *bp;
+ int ptr;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
+
+ /* Exit if there aren't any parent levels to update. */
+ if (level + 1 >= cur->bc_nlevels)
+ return 0;
+
+ trace_xfs_btree_updkeys(cur, level, bp0);
+
+ lkey = (union xfs_btree_key *)&key;
+ hkey = xfs_btree_high_key_from_key(cur, lkey);
+ xfs_btree_get_keys(cur, block, lkey);
+ for (level++; level < cur->bc_nlevels; level++) {
+#ifdef DEBUG
+ int error;
+#endif
+ block = xfs_btree_get_block(cur, level, &bp);
+ trace_xfs_btree_updkeys(cur, level, bp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+ }
+#endif
+ ptr = cur->bc_ptrs[level];
+ nlkey = xfs_btree_key_addr(cur, ptr, block);
+ nhkey = xfs_btree_high_key_addr(cur, ptr, block);
+ if (!force_all &&
+ !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
+ cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
+ break;
+ xfs_btree_copy_keys(cur, nlkey, lkey, 1);
+ xfs_btree_log_keys(cur, bp, ptr, ptr);
+ if (level + 1 >= cur->bc_nlevels)
+ break;
+ xfs_btree_get_node_keys(cur, block, lkey);
+ }
+
+ return 0;
+}
+
+/* Update all the keys from some level in cursor back to the root. */
+STATIC int
+xfs_btree_updkeys_force(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ struct xfs_buf *bp;
+ struct xfs_btree_block *block;
+
+ block = xfs_btree_get_block(cur, level, &bp);
+ return __xfs_btree_updkeys(cur, level, block, bp, true);
+}
+
+/*
+ * Update the parent keys of the given level, progressing towards the root.
+ */
+STATIC int
+xfs_btree_update_keys(
struct xfs_btree_cur *cur,
- union xfs_btree_key *keyp,
int level)
{
struct xfs_btree_block *block;
struct xfs_buf *bp;
union xfs_btree_key *kp;
+ union xfs_btree_key key;
int ptr;
+ ASSERT(level >= 0);
+
+ block = xfs_btree_get_block(cur, level, &bp);
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
+ return __xfs_btree_updkeys(cur, level, block, bp, false);
+
XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
- ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
-
/*
* Go up the tree from this level toward the root.
* At each level, update the key value to the value input.
* Stop when we reach a level where the cursor isn't pointing
* at the first entry in the block.
*/
- for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
+ xfs_btree_get_keys(cur, block, &key);
+ for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
#ifdef DEBUG
int error;
#endif
@@ -1918,7 +2172,7 @@ xfs_btree_updkey(
#endif
ptr = cur->bc_ptrs[level];
kp = xfs_btree_key_addr(cur, ptr, block);
- xfs_btree_copy_keys(cur, kp, keyp, 1);
+ xfs_btree_copy_keys(cur, kp, &key, 1);
xfs_btree_log_keys(cur, bp, ptr, ptr);
}
@@ -1970,12 +2224,9 @@ xfs_btree_update(
ptr, LASTREC_UPDATE);
}
- /* Updating first rec in leaf. Pass new key value up to our parent. */
- if (ptr == 1) {
- union xfs_btree_key key;
-
- cur->bc_ops->init_key_from_rec(&key, rec);
- error = xfs_btree_updkey(cur, &key, 1);
+ /* Pass new key value up to our parent. */
+ if (xfs_btree_needs_key_update(cur, ptr)) {
+ error = xfs_btree_update_keys(cur, 0);
if (error)
goto error0;
}
@@ -1998,18 +2249,19 @@ xfs_btree_lshift(
int level,
int *stat) /* success/failure */
{
- union xfs_btree_key key; /* btree key */
struct xfs_buf *lbp; /* left buffer pointer */
struct xfs_btree_block *left; /* left btree block */
int lrecs; /* left record count */
struct xfs_buf *rbp; /* right buffer pointer */
struct xfs_btree_block *right; /* right btree block */
+ struct xfs_btree_cur *tcur; /* temporary btree cursor */
int rrecs; /* right record count */
union xfs_btree_ptr lptr; /* left btree pointer */
union xfs_btree_key *rkp = NULL; /* right btree key */
union xfs_btree_ptr *rpp = NULL; /* right address pointer */
union xfs_btree_rec *rrp = NULL; /* right record pointer */
int error; /* error return value */
+ int i;
XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
XFS_BTREE_TRACE_ARGI(cur, level);
@@ -2139,18 +2391,33 @@ xfs_btree_lshift(
xfs_btree_rec_addr(cur, 2, right),
-1, rrecs);
xfs_btree_log_recs(cur, rbp, 1, rrecs);
+ }
- /*
- * If it's the first record in the block, we'll need a key
- * structure to pass up to the next level (updkey).
- */
- cur->bc_ops->init_key_from_rec(&key,
- xfs_btree_rec_addr(cur, 1, right));
- rkp = &key;
+ /*
+ * Using a temporary cursor, update the parent key values of the
+ * block on the left.
+ */
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+ i = xfs_btree_firstrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
+
+ error = xfs_btree_decrement(tcur, level, &i);
+ if (error)
+ goto error1;
+
+ /* Update the parent high keys of the left block, if needed. */
+ error = xfs_btree_update_keys(tcur, level);
+ if (error)
+ goto error1;
+
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
}
- /* Update the parent key values of right. */
- error = xfs_btree_updkey(cur, rkp, level + 1);
+ /* Update the parent keys of the right block. */
+ error = xfs_btree_update_keys(cur, level);
if (error)
goto error0;
@@ -2169,6 +2436,11 @@ out0:
error0:
XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
+
+error1:
+ XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
+ xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+ return error;
}
/*
@@ -2181,7 +2453,6 @@ xfs_btree_rshift(
int level,
int *stat) /* success/failure */
{
- union xfs_btree_key key; /* btree key */
struct xfs_buf *lbp; /* left buffer pointer */
struct xfs_btree_block *left; /* left btree block */
struct xfs_buf *rbp; /* right buffer pointer */
@@ -2290,12 +2561,6 @@ xfs_btree_rshift(
/* Now put the new data in, and log it. */
xfs_btree_copy_recs(cur, rrp, lrp, 1);
xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
-
- cur->bc_ops->init_key_from_rec(&key, rrp);
- rkp = &key;
-
- ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
- xfs_btree_rec_addr(cur, 2, right)));
}
/*
@@ -2315,13 +2580,21 @@ xfs_btree_rshift(
if (error)
goto error0;
i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
error = xfs_btree_increment(tcur, level, &i);
if (error)
goto error1;
- error = xfs_btree_updkey(tcur, rkp, level + 1);
+ /* Update the parent high keys of the left block, if needed. */
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ error = xfs_btree_update_keys(cur, level);
+ if (error)
+ goto error1;
+ }
+
+ /* Update the parent keys of the right block. */
+ error = xfs_btree_update_keys(tcur, level);
if (error)
goto error1;
@@ -2422,6 +2695,11 @@ __xfs_btree_split(
XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+ /* Adjust numrecs for the later get_*_keys() calls. */
+ lrecs -= rrecs;
+ xfs_btree_set_numrecs(left, lrecs);
+ xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
+
/*
* Copy btree block entries from the left block over to the
* new block, the right. Update the right block and log the
@@ -2447,14 +2725,15 @@ __xfs_btree_split(
}
#endif
+ /* Copy the keys & pointers to the new block. */
xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
xfs_btree_log_keys(cur, rbp, 1, rrecs);
xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
- /* Grab the keys to the entries moved to the right block */
- xfs_btree_copy_keys(cur, key, rkp, 1);
+ /* Stash the keys of the new block for later insertion. */
+ xfs_btree_get_node_keys(cur, right, key);
} else {
/* It's a leaf. Move records. */
union xfs_btree_rec *lrp; /* left record pointer */
@@ -2463,27 +2742,23 @@ __xfs_btree_split(
lrp = xfs_btree_rec_addr(cur, src_index, left);
rrp = xfs_btree_rec_addr(cur, 1, right);
+ /* Copy records to the new block. */
xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
xfs_btree_log_recs(cur, rbp, 1, rrecs);
- cur->bc_ops->init_key_from_rec(key,
- xfs_btree_rec_addr(cur, 1, right));
+ /* Stash the keys of the new block for later insertion. */
+ xfs_btree_get_leaf_keys(cur, right, key);
}
-
/*
* Find the left block number by looking in the buffer.
- * Adjust numrecs, sibling pointers.
+ * Adjust sibling pointers.
*/
xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
- lrecs -= rrecs;
- xfs_btree_set_numrecs(left, lrecs);
- xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
-
xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
@@ -2499,6 +2774,14 @@ __xfs_btree_split(
xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
}
+
+ /* Update the parent high keys of the left block, if needed. */
+ if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ error = xfs_btree_update_keys(cur, level);
+ if (error)
+ goto error0;
+ }
+
/*
* If the cursor is really in the right block, move it there.
* If it's just pointing past the last entry in left, then we'll
@@ -2802,6 +3085,7 @@ xfs_btree_new_root(
bp = lbp;
nptr = 2;
}
+
/* Fill in the new block's btree header and log it. */
xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
@@ -2810,19 +3094,24 @@ xfs_btree_new_root(
/* Fill in the key data in the new root. */
if (xfs_btree_get_level(left) > 0) {
- xfs_btree_copy_keys(cur,
- xfs_btree_key_addr(cur, 1, new),
- xfs_btree_key_addr(cur, 1, left), 1);
- xfs_btree_copy_keys(cur,
- xfs_btree_key_addr(cur, 2, new),
- xfs_btree_key_addr(cur, 1, right), 1);
+ /*
+ * Get the keys for the left block's keys and put them directly
+ * in the parent block. Do the same for the right block.
+ */
+ xfs_btree_get_node_keys(cur, left,
+ xfs_btree_key_addr(cur, 1, new));
+ xfs_btree_get_node_keys(cur, right,
+ xfs_btree_key_addr(cur, 2, new));
} else {
- cur->bc_ops->init_key_from_rec(
- xfs_btree_key_addr(cur, 1, new),
- xfs_btree_rec_addr(cur, 1, left));
- cur->bc_ops->init_key_from_rec(
- xfs_btree_key_addr(cur, 2, new),
- xfs_btree_rec_addr(cur, 1, right));
+ /*
+ * Get the keys for the left block's records and put them
+ * directly in the parent block. Do the same for the right
+ * block.
+ */
+ xfs_btree_get_leaf_keys(cur, left,
+ xfs_btree_key_addr(cur, 1, new));
+ xfs_btree_get_leaf_keys(cur, right,
+ xfs_btree_key_addr(cur, 2, new));
}
xfs_btree_log_keys(cur, nbp, 1, 2);
@@ -2858,10 +3147,9 @@ xfs_btree_make_block_unfull(
int *index, /* new tree index */
union xfs_btree_ptr *nptr, /* new btree ptr */
struct xfs_btree_cur **ncur, /* new btree cursor */
- union xfs_btree_rec *nrec, /* new record */
+ union xfs_btree_key *key, /* key of new block */
int *stat)
{
- union xfs_btree_key key; /* new btree key value */
int error = 0;
if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
@@ -2871,6 +3159,7 @@ xfs_btree_make_block_unfull(
if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
/* A root block that can be made bigger. */
xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
+ *stat = 1;
} else {
/* A root block that needs replacing */
int logflags = 0;
@@ -2906,13 +3195,12 @@ xfs_btree_make_block_unfull(
* If this works we have to re-set our variables because we
* could be in a different block now.
*/
- error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
+ error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
if (error || *stat == 0)
return error;
*index = cur->bc_ptrs[level];
- cur->bc_ops->init_rec_from_key(&key, nrec);
return 0;
}
@@ -2925,16 +3213,17 @@ xfs_btree_insrec(
struct xfs_btree_cur *cur, /* btree cursor */
int level, /* level to insert record at */
union xfs_btree_ptr *ptrp, /* i/o: block number inserted */
- union xfs_btree_rec *recp, /* i/o: record data inserted */
+ union xfs_btree_rec *rec, /* record to insert */
+ union xfs_btree_key *key, /* i/o: block key for ptrp */
struct xfs_btree_cur **curp, /* output: new cursor replacing cur */
int *stat) /* success/failure */
{
struct xfs_btree_block *block; /* btree block */
struct xfs_buf *bp; /* buffer for block */
- union xfs_btree_key key; /* btree key */
union xfs_btree_ptr nptr; /* new block ptr */
struct xfs_btree_cur *ncur; /* new btree cursor */
- union xfs_btree_rec nrec; /* new record count */
+ union xfs_btree_bigkey nkey; /* new block key */
+ union xfs_btree_key *lkey;
int optr; /* old key/record index */
int ptr; /* key/record index */
int numrecs;/* number of records */
@@ -2942,11 +3231,13 @@ xfs_btree_insrec(
#ifdef DEBUG
int i;
#endif
+ xfs_daddr_t old_bn;
XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
- XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
+ XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec);
ncur = NULL;
+ lkey = (union xfs_btree_key *)&nkey;
/*
* If we have an external root pointer, and we've made it to the
@@ -2969,15 +3260,13 @@ xfs_btree_insrec(
return 0;
}
- /* Make a key out of the record data to be inserted, and save it. */
- cur->bc_ops->init_key_from_rec(&key, recp);
-
optr = ptr;
XFS_BTREE_STATS_INC(cur, insrec);
/* Get pointers to the btree buffer and block. */
block = xfs_btree_get_block(cur, level, &bp);
+ old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL;
numrecs = xfs_btree_get_numrecs(block);
#ifdef DEBUG
@@ -2988,10 +3277,10 @@ xfs_btree_insrec(
/* Check that the new entry is being inserted in the right place. */
if (ptr <= numrecs) {
if (level == 0) {
- ASSERT(cur->bc_ops->recs_inorder(cur, recp,
+ ASSERT(cur->bc_ops->recs_inorder(cur, rec,
xfs_btree_rec_addr(cur, ptr, block)));
} else {
- ASSERT(cur->bc_ops->keys_inorder(cur, &key,
+ ASSERT(cur->bc_ops->keys_inorder(cur, key,
xfs_btree_key_addr(cur, ptr, block)));
}
}
@@ -3004,7 +3293,7 @@ xfs_btree_insrec(
xfs_btree_set_ptr_null(cur, &nptr);
if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
error = xfs_btree_make_block_unfull(cur, level, numrecs,
- &optr, &ptr, &nptr, &ncur, &nrec, stat);
+ &optr, &ptr, &nptr, &ncur, lkey, stat);
if (error || *stat == 0)
goto error0;
}
@@ -3054,7 +3343,7 @@ xfs_btree_insrec(
#endif
/* Now put the new data in, bump numrecs and log it. */
- xfs_btree_copy_keys(cur, kp, &key, 1);
+ xfs_btree_copy_keys(cur, kp, key, 1);
xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
numrecs++;
xfs_btree_set_numrecs(block, numrecs);
@@ -3075,7 +3364,7 @@ xfs_btree_insrec(
xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
/* Now put the new data in, bump numrecs and log it. */
- xfs_btree_copy_recs(cur, rp, recp, 1);
+ xfs_btree_copy_recs(cur, rp, rec, 1);
xfs_btree_set_numrecs(block, ++numrecs);
xfs_btree_log_recs(cur, bp, ptr, numrecs);
#ifdef DEBUG
@@ -3089,9 +3378,18 @@ xfs_btree_insrec(
/* Log the new number of records in the btree header. */
xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
- /* If we inserted at the start of a block, update the parents' keys. */
- if (optr == 1) {
- error = xfs_btree_updkey(cur, &key, level + 1);
+ /*
+ * If we just inserted into a new tree block, we have to
+ * recalculate nkey here because nkey is out of date.
+ *
+ * Otherwise we're just updating an existing block (having shoved
+ * some records into the new tree block), so use the regular key
+ * update mechanism.
+ */
+ if (bp && bp->b_bn != old_bn) {
+ xfs_btree_get_keys(cur, block, lkey);
+ } else if (xfs_btree_needs_key_update(cur, optr)) {
+ error = xfs_btree_update_keys(cur, level);
if (error)
goto error0;
}
@@ -3101,7 +3399,7 @@ xfs_btree_insrec(
* we are at the far right edge of the tree, update it.
*/
if (xfs_btree_is_lastrec(cur, block, level)) {
- cur->bc_ops->update_lastrec(cur, block, recp,
+ cur->bc_ops->update_lastrec(cur, block, rec,
ptr, LASTREC_INSREC);
}
@@ -3111,7 +3409,7 @@ xfs_btree_insrec(
*/
*ptrp = nptr;
if (!xfs_btree_ptr_is_null(cur, &nptr)) {
- *recp = nrec;
+ xfs_btree_copy_keys(cur, key, lkey, 1);
*curp = ncur;
}
@@ -3142,14 +3440,20 @@ xfs_btree_insert(
union xfs_btree_ptr nptr; /* new block number (split result) */
struct xfs_btree_cur *ncur; /* new cursor (split result) */
struct xfs_btree_cur *pcur; /* previous level's cursor */
+ union xfs_btree_bigkey bkey; /* key of block to insert */
+ union xfs_btree_key *key;
union xfs_btree_rec rec; /* record to insert */
level = 0;
ncur = NULL;
pcur = cur;
+ key = (union xfs_btree_key *)&bkey;
xfs_btree_set_ptr_null(cur, &nptr);
+
+ /* Make a key out of the record data to be inserted, and save it. */
cur->bc_ops->init_rec_from_cur(cur, &rec);
+ cur->bc_ops->init_key_from_rec(key, &rec);
/*
* Loop going up the tree, starting at the leaf level.
@@ -3161,7 +3465,8 @@ xfs_btree_insert(
* Insert nrec/nptr into this level of the tree.
* Note if we fail, nptr will be null.
*/
- error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
+ error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
+ &ncur, &i);
if (error) {
if (pcur != cur)
xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
@@ -3385,8 +3690,6 @@ xfs_btree_delrec(
struct xfs_buf *bp; /* buffer for block */
int error; /* error return value */
int i; /* loop counter */
- union xfs_btree_key key; /* storage for keyp */
- union xfs_btree_key *keyp = &key; /* passed to the next level */
union xfs_btree_ptr lptr; /* left sibling block ptr */
struct xfs_buf *lbp; /* left buffer pointer */
struct xfs_btree_block *left; /* left btree block */
@@ -3457,13 +3760,6 @@ xfs_btree_delrec(
xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
}
-
- /*
- * If it's the first record in the block, we'll need to pass a
- * key up to the next level (updkey).
- */
- if (ptr == 1)
- keyp = xfs_btree_key_addr(cur, 1, block);
} else {
/* It's a leaf. operate on records */
if (ptr < numrecs) {
@@ -3472,16 +3768,6 @@ xfs_btree_delrec(
-1, numrecs - ptr);
xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
}
-
- /*
- * If it's the first record in the block, we'll need a key
- * structure to pass up to the next level (updkey).
- */
- if (ptr == 1) {
- cur->bc_ops->init_key_from_rec(&key,
- xfs_btree_rec_addr(cur, 1, block));
- keyp = &key;
- }
}
/*
@@ -3548,8 +3834,8 @@ xfs_btree_delrec(
* If we deleted the leftmost entry in the block, update the
* key values above us in the tree.
*/
- if (ptr == 1) {
- error = xfs_btree_updkey(cur, keyp, level + 1);
+ if (xfs_btree_needs_key_update(cur, ptr)) {
+ error = xfs_btree_update_keys(cur, level);
if (error)
goto error0;
}
@@ -3878,6 +4164,16 @@ xfs_btree_delrec(
if (level > 0)
cur->bc_ptrs[level]--;
+ /*
+ * We combined blocks, so we have to update the parent keys if the
+ * btree supports overlapped intervals. However, bc_ptrs[level + 1]
+ * points to the old block so that the caller knows which record to
+ * delete. Therefore, the caller must be savvy enough to call updkeys
+ * for us if we return stat == 2. The other exit points from this
+ * function don't require deletions further up the tree, so they can
+ * call updkeys directly.
+ */
+
XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
/* Return value means the next level up has something to do. */
*stat = 2;
@@ -3903,6 +4199,7 @@ xfs_btree_delete(
int error; /* error return value */
int level;
int i;
+ bool joined = false;
XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
@@ -3916,6 +4213,18 @@ xfs_btree_delete(
error = xfs_btree_delrec(cur, level, &i);
if (error)
goto error0;
+ if (i == 2)
+ joined = true;
+ }
+
+ /*
+ * If we combined blocks as part of deleting the record, delrec won't
+ * have updated the parent high keys so we have to do that here.
+ */
+ if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
+ error = xfs_btree_updkeys_force(cur, 0);
+ if (error)
+ goto error0;
}
if (i == 0) {
@@ -3978,6 +4287,81 @@ xfs_btree_get_rec(
return 0;
}
+/* Visit a block in a btree. */
+STATIC int
+xfs_btree_visit_block(
+ struct xfs_btree_cur *cur,
+ int level,
+ xfs_btree_visit_blocks_fn fn,
+ void *data)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ union xfs_btree_ptr rptr;
+ int error;
+
+ /* do right sibling readahead */
+ xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+ block = xfs_btree_get_block(cur, level, &bp);
+
+ /* process the block */
+ error = fn(cur, level, data);
+ if (error)
+ return error;
+
+ /* now read rh sibling block for next iteration */
+ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+ if (xfs_btree_ptr_is_null(cur, &rptr))
+ return -ENOENT;
+
+ return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
+}
+
+
+/* Visit every block in a btree. */
+int
+xfs_btree_visit_blocks(
+ struct xfs_btree_cur *cur,
+ xfs_btree_visit_blocks_fn fn,
+ void *data)
+{
+ union xfs_btree_ptr lptr;
+ int level;
+ struct xfs_btree_block *block = NULL;
+ int error = 0;
+
+ cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+
+ /* for each level */
+ for (level = cur->bc_nlevels - 1; level >= 0; level--) {
+ /* grab the left hand block */
+ error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
+ if (error)
+ return error;
+
+ /* readahead the left most block for the next level down */
+ if (level > 0) {
+ union xfs_btree_ptr *ptr;
+
+ ptr = xfs_btree_ptr_addr(cur, 1, block);
+ xfs_btree_readahead_ptr(cur, ptr, 1);
+
+ /* save for the next iteration of the loop */
+ lptr = *ptr;
+ }
+
+ /* for each buffer in the level */
+ do {
+ error = xfs_btree_visit_block(cur, level, fn, data);
+ } while (!error);
+
+ if (error != -ENOENT)
+ return error;
+ }
+
+ return 0;
+}
+
/*
* Change the owner of a btree.
*
@@ -4002,26 +4386,27 @@ xfs_btree_get_rec(
* just queue the modified buffer as delayed write buffer so the transaction
* recovery completion writes the changes to disk.
*/
+struct xfs_btree_block_change_owner_info {
+ __uint64_t new_owner;
+ struct list_head *buffer_list;
+};
+
static int
xfs_btree_block_change_owner(
struct xfs_btree_cur *cur,
int level,
- __uint64_t new_owner,
- struct list_head *buffer_list)
+ void *data)
{
+ struct xfs_btree_block_change_owner_info *bbcoi = data;
struct xfs_btree_block *block;
struct xfs_buf *bp;
- union xfs_btree_ptr rptr;
-
- /* do right sibling readahead */
- xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
/* modify the owner */
block = xfs_btree_get_block(cur, level, &bp);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
+ block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
else
- block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
+ block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
/*
* If the block is a root block hosted in an inode, we might not have a
@@ -4035,19 +4420,14 @@ xfs_btree_block_change_owner(
xfs_trans_ordered_buf(cur->bc_tp, bp);
xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
} else {
- xfs_buf_delwri_queue(bp, buffer_list);
+ xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
}
} else {
ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
ASSERT(level == cur->bc_nlevels - 1);
}
- /* now read rh sibling block for next iteration */
- xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
- if (xfs_btree_ptr_is_null(cur, &rptr))
- return -ENOENT;
-
- return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
+ return 0;
}
int
@@ -4056,43 +4436,13 @@ xfs_btree_change_owner(
__uint64_t new_owner,
struct list_head *buffer_list)
{
- union xfs_btree_ptr lptr;
- int level;
- struct xfs_btree_block *block = NULL;
- int error = 0;
-
- cur->bc_ops->init_ptr_from_cur(cur, &lptr);
-
- /* for each level */
- for (level = cur->bc_nlevels - 1; level >= 0; level--) {
- /* grab the left hand block */
- error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
- if (error)
- return error;
-
- /* readahead the left most block for the next level down */
- if (level > 0) {
- union xfs_btree_ptr *ptr;
-
- ptr = xfs_btree_ptr_addr(cur, 1, block);
- xfs_btree_readahead_ptr(cur, ptr, 1);
-
- /* save for the next iteration of the loop */
- lptr = *ptr;
- }
-
- /* for each buffer in the level */
- do {
- error = xfs_btree_block_change_owner(cur, level,
- new_owner,
- buffer_list);
- } while (!error);
+ struct xfs_btree_block_change_owner_info bbcoi;
- if (error != -ENOENT)
- return error;
- }
+ bbcoi.new_owner = new_owner;
+ bbcoi.buffer_list = buffer_list;
- return 0;
+ return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
+ &bbcoi);
}
/**
@@ -4171,3 +4521,267 @@ xfs_btree_compute_maxlevels(
maxblocks = (maxblocks + limits[1] - 1) / limits[1];
return level;
}
+
+/*
+ * Query a regular btree for all records overlapping a given interval.
+ * Start with a LE lookup of the key of low_rec and return all records
+ * until we find a record with a key greater than the key of high_rec.
+ */
+STATIC int
+xfs_btree_simple_query_range(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *low_key,
+ union xfs_btree_key *high_key,
+ xfs_btree_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_rec *recp;
+ union xfs_btree_key rec_key;
+ __int64_t diff;
+ int stat;
+ bool firstrec = true;
+ int error;
+
+ ASSERT(cur->bc_ops->init_high_key_from_rec);
+ ASSERT(cur->bc_ops->diff_two_keys);
+
+ /*
+ * Find the leftmost record. The btree cursor must be set
+ * to the low record used to generate low_key.
+ */
+ stat = 0;
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
+ if (error)
+ goto out;
+
+ while (stat) {
+ /* Find the record. */
+ error = xfs_btree_get_rec(cur, &recp, &stat);
+ if (error || !stat)
+ break;
+ cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
+
+ /* Skip if high_key(rec) < low_key. */
+ if (firstrec) {
+ firstrec = false;
+ diff = cur->bc_ops->diff_two_keys(cur, low_key,
+ &rec_key);
+ if (diff > 0)
+ goto advloop;
+ }
+
+ /* Stop if high_key < low_key(rec). */
+ diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
+ if (diff > 0)
+ break;
+
+ /* Callback */
+ error = fn(cur, recp, priv);
+ if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT)
+ break;
+
+advloop:
+ /* Move on to the next record. */
+ error = xfs_btree_increment(cur, 0, &stat);
+ if (error)
+ break;
+ }
+
+out:
+ return error;
+}
+
+/*
+ * Query an overlapped interval btree for all records overlapping a given
+ * interval. This function roughly follows the algorithm given in
+ * "Interval Trees" of _Introduction to Algorithms_, which is section
+ * 14.3 in the 2nd and 3rd editions.
+ *
+ * First, generate keys for the low and high records passed in.
+ *
+ * For any leaf node, generate the high and low keys for the record.
+ * If the record keys overlap with the query low/high keys, pass the
+ * record to the function iterator.
+ *
+ * For any internal node, compare the low and high keys of each
+ * pointer against the query low/high keys. If there's an overlap,
+ * follow the pointer.
+ *
+ * As an optimization, we stop scanning a block when we find a low key
+ * that is greater than the query's high key.
+ */
+STATIC int
+xfs_btree_overlapped_query_range(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *low_key,
+ union xfs_btree_key *high_key,
+ xfs_btree_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_ptr ptr;
+ union xfs_btree_ptr *pp;
+ union xfs_btree_key rec_key;
+ union xfs_btree_key rec_hkey;
+ union xfs_btree_key *lkp;
+ union xfs_btree_key *hkp;
+ union xfs_btree_rec *recp;
+ struct xfs_btree_block *block;
+ __int64_t ldiff;
+ __int64_t hdiff;
+ int level;
+ struct xfs_buf *bp;
+ int i;
+ int error;
+
+ /* Load the root of the btree. */
+ level = cur->bc_nlevels - 1;
+ cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
+ if (error)
+ return error;
+ xfs_btree_get_block(cur, level, &bp);
+ trace_xfs_btree_overlapped_query_range(cur, level, bp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto out;
+#endif
+ cur->bc_ptrs[level] = 1;
+
+ while (level < cur->bc_nlevels) {
+ block = xfs_btree_get_block(cur, level, &bp);
+
+ /* End of node, pop back towards the root. */
+ if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
+pop_up:
+ if (level < cur->bc_nlevels - 1)
+ cur->bc_ptrs[level + 1]++;
+ level++;
+ continue;
+ }
+
+ if (level == 0) {
+ /* Handle a leaf node. */
+ recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
+
+ cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
+ ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
+ low_key);
+
+ cur->bc_ops->init_key_from_rec(&rec_key, recp);
+ hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
+ &rec_key);
+
+ /*
+ * If (record's high key >= query's low key) and
+ * (query's high key >= record's low key), then
+ * this record overlaps the query range; callback.
+ */
+ if (ldiff >= 0 && hdiff >= 0) {
+ error = fn(cur, recp, priv);
+ if (error < 0 ||
+ error == XFS_BTREE_QUERY_RANGE_ABORT)
+ break;
+ } else if (hdiff < 0) {
+ /* Record is larger than high key; pop. */
+ goto pop_up;
+ }
+ cur->bc_ptrs[level]++;
+ continue;
+ }
+
+ /* Handle an internal node. */
+ lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
+ hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
+ pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
+
+ ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
+ hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
+
+ /*
+ * If (pointer's high key >= query's low key) and
+ * (query's high key >= pointer's low key), then
+ * this record overlaps the query range; follow pointer.
+ */
+ if (ldiff >= 0 && hdiff >= 0) {
+ level--;
+ error = xfs_btree_lookup_get_block(cur, level, pp,
+ &block);
+ if (error)
+ goto out;
+ xfs_btree_get_block(cur, level, &bp);
+ trace_xfs_btree_overlapped_query_range(cur, level, bp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto out;
+#endif
+ cur->bc_ptrs[level] = 1;
+ continue;
+ } else if (hdiff < 0) {
+ /* The low key is larger than the upper range; pop. */
+ goto pop_up;
+ }
+ cur->bc_ptrs[level]++;
+ }
+
+out:
+ /*
+ * If we don't end this function with the cursor pointing at a record
+ * block, a subsequent non-error cursor deletion will not release
+ * node-level buffers, causing a buffer leak. This is quite possible
+ * with a zero-results range query, so release the buffers if we
+ * failed to return any results.
+ */
+ if (cur->bc_bufs[0] == NULL) {
+ for (i = 0; i < cur->bc_nlevels; i++) {
+ if (cur->bc_bufs[i]) {
+ xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
+ cur->bc_bufs[i] = NULL;
+ cur->bc_ptrs[i] = 0;
+ cur->bc_ra[i] = 0;
+ }
+ }
+ }
+
+ return error;
+}
+
+/*
+ * Query a btree for all records overlapping a given interval of keys. The
+ * supplied function will be called with each record found; return one of the
+ * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
+ * code. This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a
+ * negative error code.
+ */
+int
+xfs_btree_query_range(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_irec *low_rec,
+ union xfs_btree_irec *high_rec,
+ xfs_btree_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_rec rec;
+ union xfs_btree_key low_key;
+ union xfs_btree_key high_key;
+
+ /* Find the keys of both ends of the interval. */
+ cur->bc_rec = *high_rec;
+ cur->bc_ops->init_rec_from_cur(cur, &rec);
+ cur->bc_ops->init_key_from_rec(&high_key, &rec);
+
+ cur->bc_rec = *low_rec;
+ cur->bc_ops->init_rec_from_cur(cur, &rec);
+ cur->bc_ops->init_key_from_rec(&low_key, &rec);
+
+ /* Enforce low key < high key. */
+ if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
+ return -EINVAL;
+
+ if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ return xfs_btree_simple_query_range(cur, &low_key,
+ &high_key, fn, priv);
+ return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
+ fn, priv);
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 785a99682159..04d0865e5e6d 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -19,7 +19,7 @@
#define __XFS_BTREE_H__
struct xfs_buf;
-struct xfs_bmap_free;
+struct xfs_defer_ops;
struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
@@ -38,17 +38,37 @@ union xfs_btree_ptr {
};
union xfs_btree_key {
- xfs_bmbt_key_t bmbt;
- xfs_bmdr_key_t bmbr; /* bmbt root block */
- xfs_alloc_key_t alloc;
- xfs_inobt_key_t inobt;
+ struct xfs_bmbt_key bmbt;
+ xfs_bmdr_key_t bmbr; /* bmbt root block */
+ xfs_alloc_key_t alloc;
+ struct xfs_inobt_key inobt;
+ struct xfs_rmap_key rmap;
+};
+
+/*
+ * In-core key that holds both low and high keys for overlapped btrees.
+ * The two keys are packed next to each other on disk, so do the same
+ * in memory. Preserve the existing xfs_btree_key as a single key to
+ * avoid the mental model breakage that would happen if we passed a
+ * bigkey into a function that operates on a single key.
+ */
+union xfs_btree_bigkey {
+ struct xfs_bmbt_key bmbt;
+ xfs_bmdr_key_t bmbr; /* bmbt root block */
+ xfs_alloc_key_t alloc;
+ struct xfs_inobt_key inobt;
+ struct {
+ struct xfs_rmap_key rmap;
+ struct xfs_rmap_key rmap_hi;
+ };
};
union xfs_btree_rec {
- xfs_bmbt_rec_t bmbt;
- xfs_bmdr_rec_t bmbr; /* bmbt root block */
- xfs_alloc_rec_t alloc;
- xfs_inobt_rec_t inobt;
+ struct xfs_bmbt_rec bmbt;
+ xfs_bmdr_rec_t bmbr; /* bmbt root block */
+ struct xfs_alloc_rec alloc;
+ struct xfs_inobt_rec inobt;
+ struct xfs_rmap_rec rmap;
};
/*
@@ -63,6 +83,7 @@ union xfs_btree_rec {
#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi)
#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi)
+#define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi)
/*
* For logging record fields.
@@ -95,6 +116,7 @@ do { \
case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \
case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \
case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \
+ case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \
case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
@@ -115,11 +137,13 @@ do { \
__XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \
case XFS_BTNUM_FINO: \
__XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \
+ case XFS_BTNUM_RMAP: \
+ __XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \
case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
-#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */
+#define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */
struct xfs_btree_ops {
/* size of the key and record structures */
@@ -158,17 +182,25 @@ struct xfs_btree_ops {
/* init values of btree structures */
void (*init_key_from_rec)(union xfs_btree_key *key,
union xfs_btree_rec *rec);
- void (*init_rec_from_key)(union xfs_btree_key *key,
- union xfs_btree_rec *rec);
void (*init_rec_from_cur)(struct xfs_btree_cur *cur,
union xfs_btree_rec *rec);
void (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr);
+ void (*init_high_key_from_rec)(union xfs_btree_key *key,
+ union xfs_btree_rec *rec);
/* difference between key value and cursor value */
__int64_t (*key_diff)(struct xfs_btree_cur *cur,
union xfs_btree_key *key);
+ /*
+ * Difference between key2 and key1 -- positive if key1 > key2,
+ * negative if key1 < key2, and zero if equal.
+ */
+ __int64_t (*diff_two_keys)(struct xfs_btree_cur *cur,
+ union xfs_btree_key *key1,
+ union xfs_btree_key *key2);
+
const struct xfs_buf_ops *buf_ops;
#if defined(DEBUG) || defined(XFS_WARN)
@@ -192,6 +224,13 @@ struct xfs_btree_ops {
#define LASTREC_DELREC 2
+union xfs_btree_irec {
+ struct xfs_alloc_rec_incore a;
+ struct xfs_bmbt_irec b;
+ struct xfs_inobt_rec_incore i;
+ struct xfs_rmap_irec r;
+};
+
/*
* Btree cursor structure.
* This collects all information needed by the btree code in one place.
@@ -202,11 +241,7 @@ typedef struct xfs_btree_cur
struct xfs_mount *bc_mp; /* file system mount struct */
const struct xfs_btree_ops *bc_ops;
uint bc_flags; /* btree features - below */
- union {
- xfs_alloc_rec_incore_t a;
- xfs_bmbt_irec_t b;
- xfs_inobt_rec_incore_t i;
- } bc_rec; /* current insert/search record value */
+ union xfs_btree_irec bc_rec; /* current insert/search record value */
struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */
int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */
__uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */
@@ -218,11 +253,12 @@ typedef struct xfs_btree_cur
union {
struct { /* needed for BNO, CNT, INO */
struct xfs_buf *agbp; /* agf/agi buffer pointer */
+ struct xfs_defer_ops *dfops; /* deferred updates */
xfs_agnumber_t agno; /* ag number */
} a;
struct { /* needed for BMAP */
struct xfs_inode *ip; /* pointer to our inode */
- struct xfs_bmap_free *flist; /* list to free after */
+ struct xfs_defer_ops *dfops; /* deferred updates */
xfs_fsblock_t firstblock; /* 1st blk allocated */
int allocated; /* count of alloced */
short forksize; /* fork's inode space */
@@ -238,6 +274,7 @@ typedef struct xfs_btree_cur
#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */
#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */
+#define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */
#define XFS_BTREE_NOERROR 0
@@ -477,4 +514,19 @@ bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,
unsigned long len);
+/* return codes */
+#define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */
+#define XFS_BTREE_QUERY_RANGE_ABORT 1 /* stop iterating */
+typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec, void *priv);
+
+int xfs_btree_query_range(struct xfs_btree_cur *cur,
+ union xfs_btree_irec *low_rec, union xfs_btree_irec *high_rec,
+ xfs_btree_query_range_fn fn, void *priv);
+
+typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
+ void *data);
+int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,
+ xfs_btree_visit_blocks_fn fn, void *data);
+
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 0f1f165f4048..f2dc1a950c85 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2029,7 +2029,7 @@ xfs_da_grow_inode_int(
error = xfs_bmapi_write(tp, dp, *bno, count,
xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
args->firstblock, args->total, &map, &nmap,
- args->flist);
+ args->dfops);
if (error)
return error;
@@ -2052,7 +2052,7 @@ xfs_da_grow_inode_int(
error = xfs_bmapi_write(tp, dp, b, c,
xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
args->firstblock, args->total,
- &mapp[mapi], &nmap, args->flist);
+ &mapp[mapi], &nmap, args->dfops);
if (error)
goto out_free_map;
if (nmap < 1)
@@ -2362,7 +2362,7 @@ xfs_da_shrink_inode(
*/
error = xfs_bunmapi(tp, dp, dead_blkno, count,
xfs_bmapi_aflag(w), 0, args->firstblock,
- args->flist, &done);
+ args->dfops, &done);
if (error == -ENOSPC) {
if (w != XFS_DATA_FORK)
break;
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 6e153e399a77..98c75cbe6ac2 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -19,7 +19,7 @@
#ifndef __XFS_DA_BTREE_H__
#define __XFS_DA_BTREE_H__
-struct xfs_bmap_free;
+struct xfs_defer_ops;
struct xfs_inode;
struct xfs_trans;
struct zone;
@@ -70,7 +70,7 @@ typedef struct xfs_da_args {
xfs_ino_t inumber; /* input/output inode number */
struct xfs_inode *dp; /* directory inode to manipulate */
xfs_fsblock_t *firstblock; /* ptr to firstblock for bmap calls */
- struct xfs_bmap_free *flist; /* ptr to freelist for bmap_finish */
+ struct xfs_defer_ops *dfops; /* ptr to freelist for bmap_finish */
struct xfs_trans *trans; /* current trans (changes over time) */
xfs_extlen_t total; /* total blocks needed, for 1st bmap */
int whichfork; /* data or attribute fork */
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 685f23b67056..9a492a9e19bd 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -629,6 +629,7 @@ typedef struct xfs_attr_shortform {
struct xfs_attr_sf_hdr { /* constant-structure header block */
__be16 totsize; /* total bytes in shortform list */
__u8 count; /* count of active entries */
+ __u8 padding;
} hdr;
struct xfs_attr_sf_entry {
__uint8_t namelen; /* actual length of name (no NULL) */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
new file mode 100644
index 000000000000..054a2032fdb3
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -0,0 +1,463 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trace.h"
+
+/*
+ * Deferred Operations in XFS
+ *
+ * Due to the way locking rules work in XFS, certain transactions (block
+ * mapping and unmapping, typically) have permanent reservations so that
+ * we can roll the transaction to adhere to AG locking order rules and
+ * to unlock buffers between metadata updates. Prior to rmap/reflink,
+ * the mapping code had a mechanism to perform these deferrals for
+ * extents that were going to be freed; this code makes that facility
+ * more generic.
+ *
+ * When adding the reverse mapping and reflink features, it became
+ * necessary to perform complex remapping multi-transactions to comply
+ * with AG locking order rules, and to be able to spread a single
+ * refcount update operation (an operation on an n-block extent can
+ * update as many as n records!) among multiple transactions. XFS can
+ * roll a transaction to facilitate this, but using this facility
+ * requires us to log "intent" items in case log recovery needs to
+ * redo the operation, and to log "done" items to indicate that redo
+ * is not necessary.
+ *
+ * Deferred work is tracked in xfs_defer_pending items. Each pending
+ * item tracks one type of deferred work. Incoming work items (which
+ * have not yet had an intent logged) are attached to a pending item
+ * on the dop_intake list, where they wait for the caller to finish
+ * the deferred operations.
+ *
+ * Finishing a set of deferred operations is an involved process. To
+ * start, we define "rolling a deferred-op transaction" as follows:
+ *
+ * > For each xfs_defer_pending item on the dop_intake list,
+ * - Sort the work items in AG order. XFS locking
+ * order rules require us to lock buffers in AG order.
+ * - Create a log intent item for that type.
+ * - Attach it to the pending item.
+ * - Move the pending item from the dop_intake list to the
+ * dop_pending list.
+ * > Roll the transaction.
+ *
+ * NOTE: To avoid exceeding the transaction reservation, we limit the
+ * number of items that we attach to a given xfs_defer_pending.
+ *
+ * The actual finishing process looks like this:
+ *
+ * > For each xfs_defer_pending in the dop_pending list,
+ * - Roll the deferred-op transaction as above.
+ * - Create a log done item for that type, and attach it to the
+ * log intent item.
+ * - For each work item attached to the log intent item,
+ * * Perform the described action.
+ * * Attach the work item to the log done item.
+ *
+ * The key here is that we must log an intent item for all pending
+ * work items every time we roll the transaction, and that we must log
+ * a done item as soon as the work is completed. With this mechanism
+ * we can perform complex remapping operations, chaining intent items
+ * as needed.
+ *
+ * This is an example of remapping the extent (E, E+B) into file X at
+ * offset A and dealing with the extent (C, C+B) already being mapped
+ * there:
+ * +-------------------------------------------------+
+ * | Unmap file X startblock C offset A length B | t0
+ * | Intent to reduce refcount for extent (C, B) |
+ * | Intent to remove rmap (X, C, A, B) |
+ * | Intent to free extent (D, 1) (bmbt block) |
+ * | Intent to map (X, A, B) at startblock E |
+ * +-------------------------------------------------+
+ * | Map file X startblock E offset A length B | t1
+ * | Done mapping (X, E, A, B) |
+ * | Intent to increase refcount for extent (E, B) |
+ * | Intent to add rmap (X, E, A, B) |
+ * +-------------------------------------------------+
+ * | Reduce refcount for extent (C, B) | t2
+ * | Done reducing refcount for extent (C, B) |
+ * | Increase refcount for extent (E, B) |
+ * | Done increasing refcount for extent (E, B) |
+ * | Intent to free extent (C, B) |
+ * | Intent to free extent (F, 1) (refcountbt block) |
+ * | Intent to remove rmap (F, 1, REFC) |
+ * +-------------------------------------------------+
+ * | Remove rmap (X, C, A, B) | t3
+ * | Done removing rmap (X, C, A, B) |
+ * | Add rmap (X, E, A, B) |
+ * | Done adding rmap (X, E, A, B) |
+ * | Remove rmap (F, 1, REFC) |
+ * | Done removing rmap (F, 1, REFC) |
+ * +-------------------------------------------------+
+ * | Free extent (C, B) | t4
+ * | Done freeing extent (C, B) |
+ * | Free extent (D, 1) |
+ * | Done freeing extent (D, 1) |
+ * | Free extent (F, 1) |
+ * | Done freeing extent (F, 1) |
+ * +-------------------------------------------------+
+ *
+ * If we should crash before t2 commits, log recovery replays
+ * the following intent items:
+ *
+ * - Intent to reduce refcount for extent (C, B)
+ * - Intent to remove rmap (X, C, A, B)
+ * - Intent to free extent (D, 1) (bmbt block)
+ * - Intent to increase refcount for extent (E, B)
+ * - Intent to add rmap (X, E, A, B)
+ *
+ * In the process of recovering, it should also generate and take care
+ * of these intent items:
+ *
+ * - Intent to free extent (C, B)
+ * - Intent to free extent (F, 1) (refcountbt block)
+ * - Intent to remove rmap (F, 1, REFC)
+ */
+
+static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX];
+
+/*
+ * For each pending item in the intake list, log its intent item and the
+ * associated extents, then add the entire intake list to the end of
+ * the pending list.
+ */
+STATIC void
+xfs_defer_intake_work(
+ struct xfs_trans *tp,
+ struct xfs_defer_ops *dop)
+{
+ struct list_head *li;
+ struct xfs_defer_pending *dfp;
+
+ list_for_each_entry(dfp, &dop->dop_intake, dfp_list) {
+ trace_xfs_defer_intake_work(tp->t_mountp, dfp);
+ dfp->dfp_intent = dfp->dfp_type->create_intent(tp,
+ dfp->dfp_count);
+ list_sort(tp->t_mountp, &dfp->dfp_work,
+ dfp->dfp_type->diff_items);
+ list_for_each(li, &dfp->dfp_work)
+ dfp->dfp_type->log_item(tp, dfp->dfp_intent, li);
+ }
+
+ list_splice_tail_init(&dop->dop_intake, &dop->dop_pending);
+}
+
+/* Abort all the intents that were committed. */
+STATIC void
+xfs_defer_trans_abort(
+ struct xfs_trans *tp,
+ struct xfs_defer_ops *dop,
+ int error)
+{
+ struct xfs_defer_pending *dfp;
+
+ trace_xfs_defer_trans_abort(tp->t_mountp, dop);
+ /*
+ * If the transaction was committed, drop the intent reference
+ * since we're bailing out of here. The other reference is
+ * dropped when the intent hits the AIL. If the transaction
+ * was not committed, the intent is freed by the intent item
+ * unlock handler on abort.
+ */
+ if (!dop->dop_committed)
+ return;
+
+ /* Abort intent items. */
+ list_for_each_entry(dfp, &dop->dop_pending, dfp_list) {
+ trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
+ if (dfp->dfp_committed)
+ dfp->dfp_type->abort_intent(dfp->dfp_intent);
+ }
+
+ /* Shut down FS. */
+ xfs_force_shutdown(tp->t_mountp, (error == -EFSCORRUPTED) ?
+ SHUTDOWN_CORRUPT_INCORE : SHUTDOWN_META_IO_ERROR);
+}
+
+/* Roll a transaction so we can do some deferred op processing. */
+STATIC int
+xfs_defer_trans_roll(
+ struct xfs_trans **tp,
+ struct xfs_defer_ops *dop,
+ struct xfs_inode *ip)
+{
+ int i;
+ int error;
+
+ /* Log all the joined inodes except the one we passed in. */
+ for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) {
+ if (dop->dop_inodes[i] == ip)
+ continue;
+ xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE);
+ }
+
+ trace_xfs_defer_trans_roll((*tp)->t_mountp, dop);
+
+ /* Roll the transaction. */
+ error = xfs_trans_roll(tp, ip);
+ if (error) {
+ trace_xfs_defer_trans_roll_error((*tp)->t_mountp, dop, error);
+ xfs_defer_trans_abort(*tp, dop, error);
+ return error;
+ }
+ dop->dop_committed = true;
+
+ /* Rejoin the joined inodes except the one we passed in. */
+ for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) {
+ if (dop->dop_inodes[i] == ip)
+ continue;
+ xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0);
+ }
+
+ return error;
+}
+
+/* Do we have any work items to finish? */
+bool
+xfs_defer_has_unfinished_work(
+ struct xfs_defer_ops *dop)
+{
+ return !list_empty(&dop->dop_pending) || !list_empty(&dop->dop_intake);
+}
+
+/*
+ * Add this inode to the deferred op. Each joined inode is relogged
+ * each time we roll the transaction, in addition to any inode passed
+ * to xfs_defer_finish().
+ */
+int
+xfs_defer_join(
+ struct xfs_defer_ops *dop,
+ struct xfs_inode *ip)
+{
+ int i;
+
+ for (i = 0; i < XFS_DEFER_OPS_NR_INODES; i++) {
+ if (dop->dop_inodes[i] == ip)
+ return 0;
+ else if (dop->dop_inodes[i] == NULL) {
+ dop->dop_inodes[i] = ip;
+ return 0;
+ }
+ }
+
+ return -EFSCORRUPTED;
+}
+
+/*
+ * Finish all the pending work. This involves logging intent items for
+ * any work items that wandered in since the last transaction roll (if
+ * one has even happened), rolling the transaction, and finishing the
+ * work items in the first item on the logged-and-pending list.
+ *
+ * If an inode is provided, relog it to the new transaction.
+ */
+int
+xfs_defer_finish(
+ struct xfs_trans **tp,
+ struct xfs_defer_ops *dop,
+ struct xfs_inode *ip)
+{
+ struct xfs_defer_pending *dfp;
+ struct list_head *li;
+ struct list_head *n;
+ void *done_item = NULL;
+ void *state;
+ int error = 0;
+ void (*cleanup_fn)(struct xfs_trans *, void *, int);
+
+ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+ trace_xfs_defer_finish((*tp)->t_mountp, dop);
+
+ /* Until we run out of pending work to finish... */
+ while (xfs_defer_has_unfinished_work(dop)) {
+ /* Log intents for work items sitting in the intake. */
+ xfs_defer_intake_work(*tp, dop);
+
+ /* Roll the transaction. */
+ error = xfs_defer_trans_roll(tp, dop, ip);
+ if (error)
+ goto out;
+
+ /* Mark all pending intents as committed. */
+ list_for_each_entry_reverse(dfp, &dop->dop_pending, dfp_list) {
+ if (dfp->dfp_committed)
+ break;
+ trace_xfs_defer_pending_commit((*tp)->t_mountp, dfp);
+ dfp->dfp_committed = true;
+ }
+
+ /* Log an intent-done item for the first pending item. */
+ dfp = list_first_entry(&dop->dop_pending,
+ struct xfs_defer_pending, dfp_list);
+ trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp);
+ done_item = dfp->dfp_type->create_done(*tp, dfp->dfp_intent,
+ dfp->dfp_count);
+ cleanup_fn = dfp->dfp_type->finish_cleanup;
+
+ /* Finish the work items. */
+ state = NULL;
+ list_for_each_safe(li, n, &dfp->dfp_work) {
+ list_del(li);
+ dfp->dfp_count--;
+ error = dfp->dfp_type->finish_item(*tp, dop, li,
+ done_item, &state);
+ if (error) {
+ /*
+ * Clean up after ourselves and jump out.
+ * xfs_defer_cancel will take care of freeing
+ * all these lists and stuff.
+ */
+ if (cleanup_fn)
+ cleanup_fn(*tp, state, error);
+ xfs_defer_trans_abort(*tp, dop, error);
+ goto out;
+ }
+ }
+ /* Done with the dfp, free it. */
+ list_del(&dfp->dfp_list);
+ kmem_free(dfp);
+
+ if (cleanup_fn)
+ cleanup_fn(*tp, state, error);
+ }
+
+out:
+ if (error)
+ trace_xfs_defer_finish_error((*tp)->t_mountp, dop, error);
+ else
+ trace_xfs_defer_finish_done((*tp)->t_mountp, dop);
+ return error;
+}
+
+/*
+ * Free up any items left in the list.
+ */
+void
+xfs_defer_cancel(
+ struct xfs_defer_ops *dop)
+{
+ struct xfs_defer_pending *dfp;
+ struct xfs_defer_pending *pli;
+ struct list_head *pwi;
+ struct list_head *n;
+
+ trace_xfs_defer_cancel(NULL, dop);
+
+ /*
+ * Free the pending items. Caller should already have arranged
+ * for the intent items to be released.
+ */
+ list_for_each_entry_safe(dfp, pli, &dop->dop_intake, dfp_list) {
+ trace_xfs_defer_intake_cancel(NULL, dfp);
+ list_del(&dfp->dfp_list);
+ list_for_each_safe(pwi, n, &dfp->dfp_work) {
+ list_del(pwi);
+ dfp->dfp_count--;
+ dfp->dfp_type->cancel_item(pwi);
+ }
+ ASSERT(dfp->dfp_count == 0);
+ kmem_free(dfp);
+ }
+ list_for_each_entry_safe(dfp, pli, &dop->dop_pending, dfp_list) {
+ trace_xfs_defer_pending_cancel(NULL, dfp);
+ list_del(&dfp->dfp_list);
+ list_for_each_safe(pwi, n, &dfp->dfp_work) {
+ list_del(pwi);
+ dfp->dfp_count--;
+ dfp->dfp_type->cancel_item(pwi);
+ }
+ ASSERT(dfp->dfp_count == 0);
+ kmem_free(dfp);
+ }
+}
+
+/* Add an item for later deferred processing. */
+void
+xfs_defer_add(
+ struct xfs_defer_ops *dop,
+ enum xfs_defer_ops_type type,
+ struct list_head *li)
+{
+ struct xfs_defer_pending *dfp = NULL;
+
+ /*
+ * Add the item to a pending item at the end of the intake list.
+ * If the last pending item has the same type, reuse it. Else,
+ * create a new pending item at the end of the intake list.
+ */
+ if (!list_empty(&dop->dop_intake)) {
+ dfp = list_last_entry(&dop->dop_intake,
+ struct xfs_defer_pending, dfp_list);
+ if (dfp->dfp_type->type != type ||
+ (dfp->dfp_type->max_items &&
+ dfp->dfp_count >= dfp->dfp_type->max_items))
+ dfp = NULL;
+ }
+ if (!dfp) {
+ dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
+ KM_SLEEP | KM_NOFS);
+ dfp->dfp_type = defer_op_types[type];
+ dfp->dfp_committed = false;
+ dfp->dfp_intent = NULL;
+ dfp->dfp_count = 0;
+ INIT_LIST_HEAD(&dfp->dfp_work);
+ list_add_tail(&dfp->dfp_list, &dop->dop_intake);
+ }
+
+ list_add_tail(li, &dfp->dfp_work);
+ dfp->dfp_count++;
+}
+
+/* Initialize a deferred operation list. */
+void
+xfs_defer_init_op_type(
+ const struct xfs_defer_op_type *type)
+{
+ defer_op_types[type->type] = type;
+}
+
+/* Initialize a deferred operation. */
+void
+xfs_defer_init(
+ struct xfs_defer_ops *dop,
+ xfs_fsblock_t *fbp)
+{
+ dop->dop_committed = false;
+ dop->dop_low = false;
+ memset(&dop->dop_inodes, 0, sizeof(dop->dop_inodes));
+ *fbp = NULLFSBLOCK;
+ INIT_LIST_HEAD(&dop->dop_intake);
+ INIT_LIST_HEAD(&dop->dop_pending);
+ trace_xfs_defer_init(NULL, dop);
+}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
new file mode 100644
index 000000000000..cc3981c48296
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_DEFER_H__
+#define __XFS_DEFER_H__
+
+struct xfs_defer_op_type;
+
+/*
+ * Save a log intent item and a list of extents, so that we can replay
+ * whatever action had to happen to the extent list and file the log done
+ * item.
+ */
+struct xfs_defer_pending {
+ const struct xfs_defer_op_type *dfp_type; /* function pointers */
+ struct list_head dfp_list; /* pending items */
+ bool dfp_committed; /* committed trans? */
+ void *dfp_intent; /* log intent item */
+ struct list_head dfp_work; /* work items */
+ unsigned int dfp_count; /* # extent items */
+};
+
+/*
+ * Header for deferred operation list.
+ *
+ * dop_low is used by the allocator to activate the lowspace algorithm -
+ * when free space is running low the extent allocator may choose to
+ * allocate an extent from an AG without leaving sufficient space for
+ * a btree split when inserting the new extent. In this case the allocator
+ * will enable the lowspace algorithm which is supposed to allow further
+ * allocations (such as btree splits and newroots) to allocate from
+ * sequential AGs. In order to avoid locking AGs out of order the lowspace
+ * algorithm will start searching for free space from AG 0. If the correct
+ * transaction reservations have been made then this algorithm will eventually
+ * find all the space it needs.
+ */
+enum xfs_defer_ops_type {
+ XFS_DEFER_OPS_TYPE_RMAP,
+ XFS_DEFER_OPS_TYPE_FREE,
+ XFS_DEFER_OPS_TYPE_MAX,
+};
+
+#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */
+
+struct xfs_defer_ops {
+ bool dop_committed; /* did any trans commit? */
+ bool dop_low; /* alloc in low mode */
+ struct list_head dop_intake; /* unlogged pending work */
+ struct list_head dop_pending; /* logged pending work */
+
+ /* relog these inodes with each roll */
+ struct xfs_inode *dop_inodes[XFS_DEFER_OPS_NR_INODES];
+};
+
+void xfs_defer_add(struct xfs_defer_ops *dop, enum xfs_defer_ops_type type,
+ struct list_head *h);
+int xfs_defer_finish(struct xfs_trans **tp, struct xfs_defer_ops *dop,
+ struct xfs_inode *ip);
+void xfs_defer_cancel(struct xfs_defer_ops *dop);
+void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp);
+bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop);
+int xfs_defer_join(struct xfs_defer_ops *dop, struct xfs_inode *ip);
+
+/* Description of a deferred type. */
+struct xfs_defer_op_type {
+ enum xfs_defer_ops_type type;
+ unsigned int max_items;
+ void (*abort_intent)(void *);
+ void *(*create_done)(struct xfs_trans *, void *, unsigned int);
+ int (*finish_item)(struct xfs_trans *, struct xfs_defer_ops *,
+ struct list_head *, void *, void **);
+ void (*finish_cleanup)(struct xfs_trans *, void *, int);
+ void (*cancel_item)(struct list_head *);
+ int (*diff_items)(void *, struct list_head *, struct list_head *);
+ void *(*create_intent)(struct xfs_trans *, uint);
+ void (*log_item)(struct xfs_trans *, void *, struct list_head *);
+};
+
+void xfs_defer_init_op_type(const struct xfs_defer_op_type *type);
+
+#endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index af0f9d171f8a..20a96dd5af7e 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -21,6 +21,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
@@ -259,7 +260,7 @@ xfs_dir_createname(
struct xfs_name *name,
xfs_ino_t inum, /* new entry inode number */
xfs_fsblock_t *first, /* bmap's firstblock */
- xfs_bmap_free_t *flist, /* bmap's freeblock list */
+ struct xfs_defer_ops *dfops, /* bmap's freeblock list */
xfs_extlen_t total) /* bmap's total block count */
{
struct xfs_da_args *args;
@@ -286,7 +287,7 @@ xfs_dir_createname(
args->inumber = inum;
args->dp = dp;
args->firstblock = first;
- args->flist = flist;
+ args->dfops = dfops;
args->total = total;
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
@@ -436,7 +437,7 @@ xfs_dir_removename(
struct xfs_name *name,
xfs_ino_t ino,
xfs_fsblock_t *first, /* bmap's firstblock */
- xfs_bmap_free_t *flist, /* bmap's freeblock list */
+ struct xfs_defer_ops *dfops, /* bmap's freeblock list */
xfs_extlen_t total) /* bmap's total block count */
{
struct xfs_da_args *args;
@@ -458,7 +459,7 @@ xfs_dir_removename(
args->inumber = ino;
args->dp = dp;
args->firstblock = first;
- args->flist = flist;
+ args->dfops = dfops;
args->total = total;
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
@@ -498,7 +499,7 @@ xfs_dir_replace(
struct xfs_name *name, /* name of entry to replace */
xfs_ino_t inum, /* new inode number */
xfs_fsblock_t *first, /* bmap's firstblock */
- xfs_bmap_free_t *flist, /* bmap's freeblock list */
+ struct xfs_defer_ops *dfops, /* bmap's freeblock list */
xfs_extlen_t total) /* bmap's total block count */
{
struct xfs_da_args *args;
@@ -523,7 +524,7 @@ xfs_dir_replace(
args->inumber = inum;
args->dp = dp;
args->firstblock = first;
- args->flist = flist;
+ args->dfops = dfops;
args->total = total;
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
@@ -680,7 +681,7 @@ xfs_dir2_shrink_inode(
/* Unmap the fsblock(s). */
error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, 0, 0,
- args->firstblock, args->flist, &done);
+ args->firstblock, args->dfops, &done);
if (error) {
/*
* ENOSPC actually can happen if we're in a removename with no
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index e55353651f5b..becc926c3e3d 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -18,7 +18,7 @@
#ifndef __XFS_DIR2_H__
#define __XFS_DIR2_H__
-struct xfs_bmap_free;
+struct xfs_defer_ops;
struct xfs_da_args;
struct xfs_inode;
struct xfs_mount;
@@ -129,18 +129,18 @@ extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t inum,
xfs_fsblock_t *first,
- struct xfs_bmap_free *flist, xfs_extlen_t tot);
+ struct xfs_defer_ops *dfops, xfs_extlen_t tot);
extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t *inum,
struct xfs_name *ci_name);
extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t ino,
xfs_fsblock_t *first,
- struct xfs_bmap_free *flist, xfs_extlen_t tot);
+ struct xfs_defer_ops *dfops, xfs_extlen_t tot);
extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t inum,
xfs_fsblock_t *first,
- struct xfs_bmap_free *flist, xfs_extlen_t tot);
+ struct xfs_defer_ops *dfops, xfs_extlen_t tot);
extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name);
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index adb204d40f22..f814d42c73b2 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -455,8 +455,10 @@ xfs_sb_has_compat_feature(
}
#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */
+#define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */
#define XFS_SB_FEAT_RO_COMPAT_ALL \
- (XFS_SB_FEAT_RO_COMPAT_FINOBT)
+ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \
+ XFS_SB_FEAT_RO_COMPAT_RMAPBT)
#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
static inline bool
xfs_sb_has_ro_compat_feature(
@@ -538,6 +540,12 @@ static inline bool xfs_sb_version_hasmetauuid(struct xfs_sb *sbp)
(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID);
}
+static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp)
+{
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
+ (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT);
+}
+
/*
* end of superblock version macros
*/
@@ -598,10 +606,10 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
/*
- * Btree number 0 is bno, 1 is cnt. This value gives the size of the
+ * Btree number 0 is bno, 1 is cnt, 2 is rmap. This value gives the size of the
* arrays below.
*/
-#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1)
+#define XFS_BTNUM_AGF ((int)XFS_BTNUM_RMAPi + 1)
/*
* The second word of agf_levels in the first a.g. overlaps the EFS
@@ -618,12 +626,10 @@ typedef struct xfs_agf {
__be32 agf_seqno; /* sequence # starting from 0 */
__be32 agf_length; /* size in blocks of a.g. */
/*
- * Freespace information
+ * Freespace and rmap information
*/
__be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */
- __be32 agf_spare0; /* spare field */
__be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */
- __be32 agf_spare1; /* spare field */
__be32 agf_flfirst; /* first freelist block's index */
__be32 agf_fllast; /* last freelist block's index */
@@ -1308,17 +1314,118 @@ typedef __be32 xfs_inobt_ptr_t;
#define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
/*
- * The first data block of an AG depends on whether the filesystem was formatted
- * with the finobt feature. If so, account for the finobt reserved root btree
- * block.
+ * Reverse mapping btree format definitions
+ *
+ * There is a btree for the reverse map per allocation group
+ */
+#define XFS_RMAP_CRC_MAGIC 0x524d4233 /* 'RMB3' */
+
+/*
+ * Ownership info for an extent. This is used to create reverse-mapping
+ * entries.
*/
-#define XFS_PREALLOC_BLOCKS(mp) \
+#define XFS_OWNER_INFO_ATTR_FORK (1 << 0)
+#define XFS_OWNER_INFO_BMBT_BLOCK (1 << 1)
+struct xfs_owner_info {
+ uint64_t oi_owner;
+ xfs_fileoff_t oi_offset;
+ unsigned int oi_flags;
+};
+
+/*
+ * Special owner types.
+ *
+ * Seeing as we only support up to 8EB, we have the upper bit of the owner field
+ * to tell us we have a special owner value. We use these for static metadata
+ * allocated at mkfs/growfs time, as well as for freespace management metadata.
+ */
+#define XFS_RMAP_OWN_NULL (-1ULL) /* No owner, for growfs */
+#define XFS_RMAP_OWN_UNKNOWN (-2ULL) /* Unknown owner, for EFI recovery */
+#define XFS_RMAP_OWN_FS (-3ULL) /* static fs metadata */
+#define XFS_RMAP_OWN_LOG (-4ULL) /* static fs metadata */
+#define XFS_RMAP_OWN_AG (-5ULL) /* AG freespace btree blocks */
+#define XFS_RMAP_OWN_INOBT (-6ULL) /* Inode btree blocks */
+#define XFS_RMAP_OWN_INODES (-7ULL) /* Inode chunk */
+#define XFS_RMAP_OWN_MIN (-8ULL) /* guard */
+
+#define XFS_RMAP_NON_INODE_OWNER(owner) (!!((owner) & (1ULL << 63)))
+
+/*
+ * Data record structure
+ */
+struct xfs_rmap_rec {
+ __be32 rm_startblock; /* extent start block */
+ __be32 rm_blockcount; /* extent length */
+ __be64 rm_owner; /* extent owner */
+ __be64 rm_offset; /* offset within the owner */
+};
+
+/*
+ * rmap btree record
+ * rm_offset:63 is the attribute fork flag
+ * rm_offset:62 is the bmbt block flag
+ * rm_offset:61 is the unwritten extent flag (same as l0:63 in bmbt)
+ * rm_offset:54-60 aren't used and should be zero
+ * rm_offset:0-53 is the block offset within the inode
+ */
+#define XFS_RMAP_OFF_ATTR_FORK ((__uint64_t)1ULL << 63)
+#define XFS_RMAP_OFF_BMBT_BLOCK ((__uint64_t)1ULL << 62)
+#define XFS_RMAP_OFF_UNWRITTEN ((__uint64_t)1ULL << 61)
+
+#define XFS_RMAP_LEN_MAX ((__uint32_t)~0U)
+#define XFS_RMAP_OFF_FLAGS (XFS_RMAP_OFF_ATTR_FORK | \
+ XFS_RMAP_OFF_BMBT_BLOCK | \
+ XFS_RMAP_OFF_UNWRITTEN)
+#define XFS_RMAP_OFF_MASK ((__uint64_t)0x3FFFFFFFFFFFFFULL)
+
+#define XFS_RMAP_OFF(off) ((off) & XFS_RMAP_OFF_MASK)
+
+#define XFS_RMAP_IS_BMBT_BLOCK(off) (!!((off) & XFS_RMAP_OFF_BMBT_BLOCK))
+#define XFS_RMAP_IS_ATTR_FORK(off) (!!((off) & XFS_RMAP_OFF_ATTR_FORK))
+#define XFS_RMAP_IS_UNWRITTEN(len) (!!((off) & XFS_RMAP_OFF_UNWRITTEN))
+
+#define RMAPBT_STARTBLOCK_BITLEN 32
+#define RMAPBT_BLOCKCOUNT_BITLEN 32
+#define RMAPBT_OWNER_BITLEN 64
+#define RMAPBT_ATTRFLAG_BITLEN 1
+#define RMAPBT_BMBTFLAG_BITLEN 1
+#define RMAPBT_EXNTFLAG_BITLEN 1
+#define RMAPBT_UNUSED_OFFSET_BITLEN 7
+#define RMAPBT_OFFSET_BITLEN 54
+
+#define XFS_RMAP_ATTR_FORK (1 << 0)
+#define XFS_RMAP_BMBT_BLOCK (1 << 1)
+#define XFS_RMAP_UNWRITTEN (1 << 2)
+#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \
+ XFS_RMAP_BMBT_BLOCK)
+#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN)
+struct xfs_rmap_irec {
+ xfs_agblock_t rm_startblock; /* extent start block */
+ xfs_extlen_t rm_blockcount; /* extent length */
+ __uint64_t rm_owner; /* extent owner */
+ __uint64_t rm_offset; /* offset within the owner */
+ unsigned int rm_flags; /* state flags */
+};
+
+/*
+ * Key structure
+ *
+ * We don't use the length for lookups
+ */
+struct xfs_rmap_key {
+ __be32 rm_startblock; /* extent start block */
+ __be64 rm_owner; /* extent owner */
+ __be64 rm_offset; /* offset within the owner */
+} __attribute__((packed));
+
+/* btree pointer type */
+typedef __be32 xfs_rmap_ptr_t;
+
+#define XFS_RMAP_BLOCK(mp) \
(xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \
XFS_FIBT_BLOCK(mp) + 1 : \
XFS_IBT_BLOCK(mp) + 1)
-
-
/*
* BMAP Btree format definitions
*
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index f5ec9c5ccae6..79455058b752 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -206,6 +206,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */
#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */
#define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */
+#define XFS_FSOP_GEOM_FLAGS_RMAPBT 0x80000 /* Reverse mapping btree */
/*
* Minimum and maximum sizes need for growth checks.
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 4b1e408169a8..51b4e0de1fdc 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -24,6 +24,7 @@
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
@@ -39,6 +40,7 @@
#include "xfs_icache.h"
#include "xfs_trace.h"
#include "xfs_log.h"
+#include "xfs_rmap.h"
/*
@@ -614,6 +616,7 @@ xfs_ialloc_ag_alloc(
args.tp = tp;
args.mp = tp->t_mountp;
args.fsbno = NULLFSBLOCK;
+ xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INODES);
#ifdef DEBUG
/* randomly do sparse inode allocations */
@@ -1817,19 +1820,21 @@ xfs_difree_inode_chunk(
struct xfs_mount *mp,
xfs_agnumber_t agno,
struct xfs_inobt_rec_incore *rec,
- struct xfs_bmap_free *flist)
+ struct xfs_defer_ops *dfops)
{
xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino);
int startidx, endidx;
int nextbit;
xfs_agblock_t agbno;
int contigblk;
+ struct xfs_owner_info oinfo;
DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
if (!xfs_inobt_issparse(rec->ir_holemask)) {
/* not sparse, calculate extent info directly */
- xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, sagbno),
- mp->m_ialloc_blks);
+ xfs_bmap_add_free(mp, dfops, XFS_AGB_TO_FSB(mp, agno, sagbno),
+ mp->m_ialloc_blks, &oinfo);
return;
}
@@ -1872,8 +1877,8 @@ xfs_difree_inode_chunk(
ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
- xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, agbno),
- contigblk);
+ xfs_bmap_add_free(mp, dfops, XFS_AGB_TO_FSB(mp, agno, agbno),
+ contigblk, &oinfo);
/* reset range to current bit and carry on... */
startidx = endidx = nextbit;
@@ -1889,7 +1894,7 @@ xfs_difree_inobt(
struct xfs_trans *tp,
struct xfs_buf *agbp,
xfs_agino_t agino,
- struct xfs_bmap_free *flist,
+ struct xfs_defer_ops *dfops,
struct xfs_icluster *xic,
struct xfs_inobt_rec_incore *orec)
{
@@ -1976,7 +1981,7 @@ xfs_difree_inobt(
goto error0;
}
- xfs_difree_inode_chunk(mp, agno, &rec, flist);
+ xfs_difree_inode_chunk(mp, agno, &rec, dfops);
} else {
xic->deleted = 0;
@@ -2121,7 +2126,7 @@ int
xfs_difree(
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t inode, /* inode to be freed */
- struct xfs_bmap_free *flist, /* extents to free */
+ struct xfs_defer_ops *dfops, /* extents to free */
struct xfs_icluster *xic) /* cluster info if deleted */
{
/* REFERENCED */
@@ -2173,7 +2178,7 @@ xfs_difree(
/*
* Fix up the inode allocation btree.
*/
- error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec);
+ error = xfs_difree_inobt(mp, tp, agbp, agino, dfops, xic, &rec);
if (error)
goto error0;
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 6e450df2979b..0bb89669fc07 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -95,7 +95,7 @@ int /* error */
xfs_difree(
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t inode, /* inode to be freed */
- struct xfs_bmap_free *flist, /* extents to free */
+ struct xfs_defer_ops *dfops, /* extents to free */
struct xfs_icluster *ifree); /* cluster info if deleted */
/*
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 89c21d771e35..31ca2208c03d 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -32,6 +32,7 @@
#include "xfs_trace.h"
#include "xfs_cksum.h"
#include "xfs_trans.h"
+#include "xfs_rmap.h"
STATIC int
@@ -96,6 +97,7 @@ xfs_inobt_alloc_block(
memset(&args, 0, sizeof(args));
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
+ xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INOBT);
args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
args.minlen = 1;
args.maxlen = 1;
@@ -125,8 +127,12 @@ xfs_inobt_free_block(
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
+ struct xfs_owner_info oinfo;
+
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
return xfs_free_extent(cur->bc_tp,
- XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1);
+ XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1,
+ &oinfo);
}
STATIC int
@@ -146,14 +152,6 @@ xfs_inobt_init_key_from_rec(
}
STATIC void
-xfs_inobt_init_rec_from_key(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
-{
- rec->inobt.ir_startino = key->inobt.ir_startino;
-}
-
-STATIC void
xfs_inobt_init_rec_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_rec *rec)
@@ -314,7 +312,6 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
.get_minrecs = xfs_inobt_get_minrecs,
.get_maxrecs = xfs_inobt_get_maxrecs,
.init_key_from_rec = xfs_inobt_init_key_from_rec,
- .init_rec_from_key = xfs_inobt_init_rec_from_key,
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
.key_diff = xfs_inobt_key_diff,
@@ -336,7 +333,6 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
.get_minrecs = xfs_inobt_get_minrecs,
.get_maxrecs = xfs_inobt_get_maxrecs,
.init_key_from_rec = xfs_inobt_init_key_from_rec,
- .init_rec_from_key = xfs_inobt_init_rec_from_key,
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_finobt_init_ptr_from_cur,
.key_diff = xfs_inobt_key_diff,
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 9d9559eb2835..4b9769e23c83 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -22,6 +22,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_error.h"
#include "xfs_cksum.h"
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index e8f49c029ff0..a6eed43fa7cd 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -110,7 +110,9 @@ static inline uint xlog_get_cycle(char *ptr)
#define XLOG_REG_TYPE_COMMIT 18
#define XLOG_REG_TYPE_TRANSHDR 19
#define XLOG_REG_TYPE_ICREATE 20
-#define XLOG_REG_TYPE_MAX 20
+#define XLOG_REG_TYPE_RUI_FORMAT 21
+#define XLOG_REG_TYPE_RUD_FORMAT 22
+#define XLOG_REG_TYPE_MAX 22
/*
* Flags to log operation header
@@ -227,6 +229,8 @@ typedef struct xfs_trans_header {
#define XFS_LI_DQUOT 0x123d
#define XFS_LI_QUOTAOFF 0x123e
#define XFS_LI_ICREATE 0x123f
+#define XFS_LI_RUI 0x1240 /* rmap update intent */
+#define XFS_LI_RUD 0x1241
#define XFS_LI_TYPE_DESC \
{ XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -236,7 +240,9 @@ typedef struct xfs_trans_header {
{ XFS_LI_BUF, "XFS_LI_BUF" }, \
{ XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \
{ XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \
- { XFS_LI_ICREATE, "XFS_LI_ICREATE" }
+ { XFS_LI_ICREATE, "XFS_LI_ICREATE" }, \
+ { XFS_LI_RUI, "XFS_LI_RUI" }, \
+ { XFS_LI_RUD, "XFS_LI_RUD" }
/*
* Inode Log Item Format definitions.
@@ -604,6 +610,59 @@ typedef struct xfs_efd_log_format_64 {
} xfs_efd_log_format_64_t;
/*
+ * RUI/RUD (reverse mapping) log format definitions
+ */
+struct xfs_map_extent {
+ __uint64_t me_owner;
+ __uint64_t me_startblock;
+ __uint64_t me_startoff;
+ __uint32_t me_len;
+ __uint32_t me_flags;
+};
+
+/* rmap me_flags: upper bits are flags, lower byte is type code */
+#define XFS_RMAP_EXTENT_MAP 1
+#define XFS_RMAP_EXTENT_UNMAP 3
+#define XFS_RMAP_EXTENT_CONVERT 5
+#define XFS_RMAP_EXTENT_ALLOC 7
+#define XFS_RMAP_EXTENT_FREE 8
+#define XFS_RMAP_EXTENT_TYPE_MASK 0xFF
+
+#define XFS_RMAP_EXTENT_ATTR_FORK (1U << 31)
+#define XFS_RMAP_EXTENT_BMBT_BLOCK (1U << 30)
+#define XFS_RMAP_EXTENT_UNWRITTEN (1U << 29)
+
+#define XFS_RMAP_EXTENT_FLAGS (XFS_RMAP_EXTENT_TYPE_MASK | \
+ XFS_RMAP_EXTENT_ATTR_FORK | \
+ XFS_RMAP_EXTENT_BMBT_BLOCK | \
+ XFS_RMAP_EXTENT_UNWRITTEN)
+
+/*
+ * This is the structure used to lay out an rui log item in the
+ * log. The rui_extents field is a variable size array whose
+ * size is given by rui_nextents.
+ */
+struct xfs_rui_log_format {
+ __uint16_t rui_type; /* rui log item type */
+ __uint16_t rui_size; /* size of this item */
+ __uint32_t rui_nextents; /* # extents to free */
+ __uint64_t rui_id; /* rui identifier */
+ struct xfs_map_extent rui_extents[1]; /* array of extents to rmap */
+};
+
+/*
+ * This is the structure used to lay out an rud log item in the
+ * log. The rud_extents array is a variable size array whose
+ * size is given by rud_nextents;
+ */
+struct xfs_rud_log_format {
+ __uint16_t rud_type; /* rud log item type */
+ __uint16_t rud_size; /* size of this item */
+ __uint32_t __pad;
+ __uint64_t rud_rui_id; /* id of corresponding rui */
+};
+
+/*
* Dquot Log format definitions.
*
* The first two fields must be the type and size fitting into
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
new file mode 100644
index 000000000000..73d05407d663
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -0,0 +1,1399 @@
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_error.h"
+#include "xfs_extent_busy.h"
+#include "xfs_bmap.h"
+#include "xfs_inode.h"
+
+/*
+ * Lookup the first record less than or equal to [bno, len, owner, offset]
+ * in the btree given by cur.
+ */
+int
+xfs_rmap_lookup_le(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags,
+ int *stat)
+{
+ cur->bc_rec.r.rm_startblock = bno;
+ cur->bc_rec.r.rm_blockcount = len;
+ cur->bc_rec.r.rm_owner = owner;
+ cur->bc_rec.r.rm_offset = offset;
+ cur->bc_rec.r.rm_flags = flags;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Lookup the record exactly matching [bno, len, owner, offset]
+ * in the btree given by cur.
+ */
+int
+xfs_rmap_lookup_eq(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags,
+ int *stat)
+{
+ cur->bc_rec.r.rm_startblock = bno;
+ cur->bc_rec.r.rm_blockcount = len;
+ cur->bc_rec.r.rm_owner = owner;
+ cur->bc_rec.r.rm_offset = offset;
+ cur->bc_rec.r.rm_flags = flags;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len, owner, offset].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_rmap_update(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *irec)
+{
+ union xfs_btree_rec rec;
+ int error;
+
+ trace_xfs_rmap_update(cur->bc_mp, cur->bc_private.a.agno,
+ irec->rm_startblock, irec->rm_blockcount,
+ irec->rm_owner, irec->rm_offset, irec->rm_flags);
+
+ rec.rmap.rm_startblock = cpu_to_be32(irec->rm_startblock);
+ rec.rmap.rm_blockcount = cpu_to_be32(irec->rm_blockcount);
+ rec.rmap.rm_owner = cpu_to_be64(irec->rm_owner);
+ rec.rmap.rm_offset = cpu_to_be64(
+ xfs_rmap_irec_offset_pack(irec));
+ error = xfs_btree_update(cur, &rec);
+ if (error)
+ trace_xfs_rmap_update_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+int
+xfs_rmap_insert(
+ struct xfs_btree_cur *rcur,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ int i;
+ int error;
+
+ trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_private.a.agno, agbno,
+ len, owner, offset, flags);
+
+ error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 0, done);
+
+ rcur->bc_rec.r.rm_startblock = agbno;
+ rcur->bc_rec.r.rm_blockcount = len;
+ rcur->bc_rec.r.rm_owner = owner;
+ rcur->bc_rec.r.rm_offset = offset;
+ rcur->bc_rec.r.rm_flags = flags;
+ error = xfs_btree_insert(rcur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done);
+done:
+ if (error)
+ trace_xfs_rmap_insert_error(rcur->bc_mp,
+ rcur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+static int
+xfs_rmap_btrec_to_irec(
+ union xfs_btree_rec *rec,
+ struct xfs_rmap_irec *irec)
+{
+ irec->rm_flags = 0;
+ irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock);
+ irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount);
+ irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner);
+ return xfs_rmap_irec_offset_unpack(be64_to_cpu(rec->rmap.rm_offset),
+ irec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_rmap_get_rec(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *irec,
+ int *stat)
+{
+ union xfs_btree_rec *rec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (error || !*stat)
+ return error;
+
+ return xfs_rmap_btrec_to_irec(rec, irec);
+}
+
+/*
+ * Find the extent in the rmap btree and remove it.
+ *
+ * The record we find should always be an exact match for the extent that we're
+ * looking for, since we insert them into the btree without modification.
+ *
+ * Special Case #1: when growing the filesystem, we "free" an extent when
+ * growing the last AG. This extent is new space and so it is not tracked as
+ * used space in the btree. The growfs code will pass in an owner of
+ * XFS_RMAP_OWN_NULL to indicate that it expected that there is no owner of this
+ * extent. We verify that - the extent lookup result in a record that does not
+ * overlap.
+ *
+ * Special Case #2: EFIs do not record the owner of the extent, so when
+ * recovering EFIs from the log we pass in XFS_RMAP_OWN_UNKNOWN to tell the rmap
+ * btree to ignore the owner (i.e. wildcard match) so we don't trigger
+ * corruption checks during log recovery.
+ */
+STATIC int
+xfs_rmap_unmap(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ uint64_t ltoff;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
+ bool ignore_off;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ ignore_off = XFS_RMAP_NON_INODE_OWNER(owner) ||
+ (flags & XFS_RMAP_BMBT_BLOCK);
+ if (unwritten)
+ flags |= XFS_RMAP_UNWRITTEN;
+ trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+
+ /*
+ * We should always have a left record because there's a static record
+ * for the AG headers at rm_startblock == 0 created by mkfs/growfs that
+ * will not ever be removed from the tree.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+
+ error = xfs_rmap_get_rec(cur, &ltrec, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_private.a.agno, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+ ltoff = ltrec.rm_offset;
+
+ /*
+ * For growfs, the incoming extent must be beyond the left record we
+ * just found as it is new space and won't be used by anyone. This is
+ * just a corruption check as we don't actually do anything with this
+ * extent. Note that we need to use >= instead of > because it might
+ * be the case that the "left" extent goes all the way to EOFS.
+ */
+ if (owner == XFS_RMAP_OWN_NULL) {
+ XFS_WANT_CORRUPTED_GOTO(mp, bno >= ltrec.rm_startblock +
+ ltrec.rm_blockcount, out_error);
+ goto out_done;
+ }
+
+ /* Make sure the unwritten flag matches. */
+ XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) ==
+ (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error);
+
+ /* Make sure the extent we found covers the entire freeing range. */
+ XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno &&
+ ltrec.rm_startblock + ltrec.rm_blockcount >=
+ bno + len, out_error);
+
+ /* Make sure the owner matches what we expect to find in the tree. */
+ XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner ||
+ XFS_RMAP_NON_INODE_OWNER(owner), out_error);
+
+ /* Check the offset, if necessary. */
+ if (!XFS_RMAP_NON_INODE_OWNER(owner)) {
+ if (flags & XFS_RMAP_BMBT_BLOCK) {
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ ltrec.rm_flags & XFS_RMAP_BMBT_BLOCK,
+ out_error);
+ } else {
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ ltrec.rm_offset <= offset, out_error);
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ ltoff + ltrec.rm_blockcount >= offset + len,
+ out_error);
+ }
+ }
+
+ if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
+ /* exact match, simply remove the record from rmap tree */
+ trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ ltrec.rm_startblock, ltrec.rm_blockcount,
+ ltrec.rm_owner, ltrec.rm_offset,
+ ltrec.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ } else if (ltrec.rm_startblock == bno) {
+ /*
+ * overlap left hand side of extent: move the start, trim the
+ * length and update the current record.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrrrrrrr|
+ * bno len
+ */
+ ltrec.rm_startblock += len;
+ ltrec.rm_blockcount -= len;
+ if (!ignore_off)
+ ltrec.rm_offset += len;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+ } else if (ltrec.rm_startblock + ltrec.rm_blockcount == bno + len) {
+ /*
+ * overlap right hand side of extent: trim the length and update
+ * the current record.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrrrrrrr|
+ * bno len
+ */
+ ltrec.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+ } else {
+
+ /*
+ * overlap middle of extent: trim the length of the existing
+ * record to the length of the new left-extent size, increment
+ * the insertion position so we can insert a new record
+ * containing the remaining right-extent space.
+ *
+ * ltbno ltlen
+ * Orig: |oooooooooooooooooooo|
+ * Freeing: |fffffffff|
+ * Result: |rrrrr| |rrrr|
+ * bno len
+ */
+ xfs_extlen_t orig_len = ltrec.rm_blockcount;
+
+ ltrec.rm_blockcount = bno - ltrec.rm_startblock;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto out_error;
+
+ cur->bc_rec.r.rm_startblock = bno + len;
+ cur->bc_rec.r.rm_blockcount = orig_len - len -
+ ltrec.rm_blockcount;
+ cur->bc_rec.r.rm_owner = ltrec.rm_owner;
+ if (ignore_off)
+ cur->bc_rec.r.rm_offset = 0;
+ else
+ cur->bc_rec.r.rm_offset = offset + len;
+ cur->bc_rec.r.rm_flags = flags;
+ trace_xfs_rmap_insert(mp, cur->bc_private.a.agno,
+ cur->bc_rec.r.rm_startblock,
+ cur->bc_rec.r.rm_blockcount,
+ cur->bc_rec.r.rm_owner,
+ cur->bc_rec.r.rm_offset,
+ cur->bc_rec.r.rm_flags);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto out_error;
+ }
+
+out_done:
+ trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+out_error:
+ if (error)
+ trace_xfs_rmap_unmap_error(mp, cur->bc_private.a.agno,
+ error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Remove a reference to an extent in the rmap btree.
+ */
+int
+xfs_rmap_free(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return 0;
+
+ cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno);
+
+ error = xfs_rmap_unmap(cur, bno, len, false, oinfo);
+ if (error)
+ goto out_error;
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+
+out_error:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * A mergeable rmap must have the same owner and the same values for
+ * the unwritten, attr_fork, and bmbt flags. The startblock and
+ * offset are checked separately.
+ */
+static bool
+xfs_rmap_is_mergeable(
+ struct xfs_rmap_irec *irec,
+ uint64_t owner,
+ unsigned int flags)
+{
+ if (irec->rm_owner == XFS_RMAP_OWN_NULL)
+ return false;
+ if (irec->rm_owner != owner)
+ return false;
+ if ((flags & XFS_RMAP_UNWRITTEN) ^
+ (irec->rm_flags & XFS_RMAP_UNWRITTEN))
+ return false;
+ if ((flags & XFS_RMAP_ATTR_FORK) ^
+ (irec->rm_flags & XFS_RMAP_ATTR_FORK))
+ return false;
+ if ((flags & XFS_RMAP_BMBT_BLOCK) ^
+ (irec->rm_flags & XFS_RMAP_BMBT_BLOCK))
+ return false;
+ return true;
+}
+
+/*
+ * When we allocate a new block, the first thing we do is add a reference to
+ * the extent in the rmap btree. This takes the form of a [agbno, length,
+ * owner, offset] record. Flags are encoded in the high bits of the offset
+ * field.
+ */
+STATIC int
+xfs_rmap_map(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ struct xfs_rmap_irec gtrec;
+ int have_gt;
+ int have_lt;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags = 0;
+ bool ignore_off;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ ASSERT(owner != 0);
+ ignore_off = XFS_RMAP_NON_INODE_OWNER(owner) ||
+ (flags & XFS_RMAP_BMBT_BLOCK);
+ if (unwritten)
+ flags |= XFS_RMAP_UNWRITTEN;
+ trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+
+ /*
+ * For the initial lookup, look for an exact match or the left-adjacent
+ * record for our insertion point. This will also give us the record for
+ * start block contiguity tests.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags,
+ &have_lt);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, have_lt == 1, out_error);
+
+ error = xfs_rmap_get_rec(cur, &ltrec, &have_lt);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, have_lt == 1, out_error);
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_private.a.agno, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
+
+ if (!xfs_rmap_is_mergeable(&ltrec, owner, flags))
+ have_lt = 0;
+
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ have_lt == 0 ||
+ ltrec.rm_startblock + ltrec.rm_blockcount <= bno, out_error);
+
+ /*
+ * Increment the cursor to see if we have a right-adjacent record to our
+ * insertion point. This will give us the record for end block
+ * contiguity tests.
+ */
+ error = xfs_btree_increment(cur, 0, &have_gt);
+ if (error)
+ goto out_error;
+ if (have_gt) {
+ error = xfs_rmap_get_rec(cur, &gtrec, &have_gt);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error);
+ XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= gtrec.rm_startblock,
+ out_error);
+ trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+ cur->bc_private.a.agno, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
+ if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
+ have_gt = 0;
+ }
+
+ /*
+ * Note: cursor currently points one record to the right of ltrec, even
+ * if there is no record in the tree to the right.
+ */
+ if (have_lt &&
+ ltrec.rm_startblock + ltrec.rm_blockcount == bno &&
+ (ignore_off || ltrec.rm_offset + ltrec.rm_blockcount == offset)) {
+ /*
+ * left edge contiguous, merge into left record.
+ *
+ * ltbno ltlen
+ * orig: |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * result: |rrrrrrrrrrrrrrrrrrr|
+ * bno len
+ */
+ ltrec.rm_blockcount += len;
+ if (have_gt &&
+ bno + len == gtrec.rm_startblock &&
+ (ignore_off || offset + len == gtrec.rm_offset) &&
+ (unsigned long)ltrec.rm_blockcount + len +
+ gtrec.rm_blockcount <= XFS_RMAP_LEN_MAX) {
+ /*
+ * right edge also contiguous, delete right record
+ * and merge into left record.
+ *
+ * ltbno ltlen gtbno gtlen
+ * orig: |ooooooooo| |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr|
+ */
+ ltrec.rm_blockcount += gtrec.rm_blockcount;
+ trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ gtrec.rm_startblock,
+ gtrec.rm_blockcount,
+ gtrec.rm_owner,
+ gtrec.rm_offset,
+ gtrec.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ }
+
+ /* point the cursor back to the left record and update */
+ error = xfs_btree_decrement(cur, 0, &have_gt);
+ if (error)
+ goto out_error;
+ error = xfs_rmap_update(cur, &ltrec);
+ if (error)
+ goto out_error;
+ } else if (have_gt &&
+ bno + len == gtrec.rm_startblock &&
+ (ignore_off || offset + len == gtrec.rm_offset)) {
+ /*
+ * right edge contiguous, merge into right record.
+ *
+ * gtbno gtlen
+ * Orig: |ooooooooo|
+ * adding: |aaaaaaaaa|
+ * Result: |rrrrrrrrrrrrrrrrrrr|
+ * bno len
+ */
+ gtrec.rm_startblock = bno;
+ gtrec.rm_blockcount += len;
+ if (!ignore_off)
+ gtrec.rm_offset = offset;
+ error = xfs_rmap_update(cur, &gtrec);
+ if (error)
+ goto out_error;
+ } else {
+ /*
+ * no contiguous edge with identical owner, insert
+ * new record at current cursor position.
+ */
+ cur->bc_rec.r.rm_startblock = bno;
+ cur->bc_rec.r.rm_blockcount = len;
+ cur->bc_rec.r.rm_owner = owner;
+ cur->bc_rec.r.rm_offset = offset;
+ cur->bc_rec.r.rm_flags = flags;
+ trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len,
+ owner, offset, flags);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ }
+
+ trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+out_error:
+ if (error)
+ trace_xfs_rmap_map_error(mp, cur->bc_private.a.agno,
+ error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Add a reference to an extent in the rmap btree.
+ */
+int
+xfs_rmap_alloc(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return 0;
+
+ cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno);
+ error = xfs_rmap_map(cur, bno, len, false, oinfo);
+ if (error)
+ goto out_error;
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+
+out_error:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+#define RMAP_LEFT_CONTIG (1 << 0)
+#define RMAP_RIGHT_CONTIG (1 << 1)
+#define RMAP_LEFT_FILLING (1 << 2)
+#define RMAP_RIGHT_FILLING (1 << 3)
+#define RMAP_LEFT_VALID (1 << 6)
+#define RMAP_RIGHT_VALID (1 << 7)
+
+#define LEFT r[0]
+#define RIGHT r[1]
+#define PREV r[2]
+#define NEW r[3]
+
+/*
+ * Convert an unwritten extent to a real extent or vice versa.
+ * Does not handle overlapping extents.
+ */
+STATIC int
+xfs_rmap_convert(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec r[4]; /* neighbor extent entries */
+ /* left is 0, right is 1, prev is 2 */
+ /* new is 3 */
+ uint64_t owner;
+ uint64_t offset;
+ uint64_t new_endoff;
+ unsigned int oldext;
+ unsigned int newext;
+ unsigned int flags = 0;
+ int i;
+ int state = 0;
+ int error;
+
+ xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+ ASSERT(!(XFS_RMAP_NON_INODE_OWNER(owner) ||
+ (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
+ oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
+ new_endoff = offset + len;
+ trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+
+ /*
+ * For the initial lookup, look for an exact match or the left-adjacent
+ * record for our insertion point. This will also give us the record for
+ * start block contiguity tests.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+
+ error = xfs_rmap_get_rec(cur, &PREV, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_private.a.agno, PREV.rm_startblock,
+ PREV.rm_blockcount, PREV.rm_owner,
+ PREV.rm_offset, PREV.rm_flags);
+
+ ASSERT(PREV.rm_offset <= offset);
+ ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff);
+ ASSERT((PREV.rm_flags & XFS_RMAP_UNWRITTEN) == oldext);
+ newext = ~oldext & XFS_RMAP_UNWRITTEN;
+
+ /*
+ * Set flags determining what part of the previous oldext allocation
+ * extent is being replaced by a newext allocation.
+ */
+ if (PREV.rm_offset == offset)
+ state |= RMAP_LEFT_FILLING;
+ if (PREV.rm_offset + PREV.rm_blockcount == new_endoff)
+ state |= RMAP_RIGHT_FILLING;
+
+ /*
+ * Decrement the cursor to see if we have a left-adjacent record to our
+ * insertion point. This will give us the record for end block
+ * contiguity tests.
+ */
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ if (i) {
+ state |= RMAP_LEFT_VALID;
+ error = xfs_rmap_get_rec(cur, &LEFT, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ LEFT.rm_startblock + LEFT.rm_blockcount <= bno,
+ done);
+ trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
+ cur->bc_private.a.agno, LEFT.rm_startblock,
+ LEFT.rm_blockcount, LEFT.rm_owner,
+ LEFT.rm_offset, LEFT.rm_flags);
+ if (LEFT.rm_startblock + LEFT.rm_blockcount == bno &&
+ LEFT.rm_offset + LEFT.rm_blockcount == offset &&
+ xfs_rmap_is_mergeable(&LEFT, owner, newext))
+ state |= RMAP_LEFT_CONTIG;
+ }
+
+ /*
+ * Increment the cursor to see if we have a right-adjacent record to our
+ * insertion point. This will give us the record for end block
+ * contiguity tests.
+ */
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ if (i) {
+ state |= RMAP_RIGHT_VALID;
+ error = xfs_rmap_get_rec(cur, &RIGHT, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock,
+ done);
+ trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+ cur->bc_private.a.agno, RIGHT.rm_startblock,
+ RIGHT.rm_blockcount, RIGHT.rm_owner,
+ RIGHT.rm_offset, RIGHT.rm_flags);
+ if (bno + len == RIGHT.rm_startblock &&
+ offset + len == RIGHT.rm_offset &&
+ xfs_rmap_is_mergeable(&RIGHT, owner, newext))
+ state |= RMAP_RIGHT_CONTIG;
+ }
+
+ /* check that left + prev + right is not too long */
+ if ((state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) ==
+ (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG) &&
+ (unsigned long)LEFT.rm_blockcount + len +
+ RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
+ state &= ~RMAP_RIGHT_CONTIG;
+
+ trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state,
+ _RET_IP_);
+
+ /* reset the cursor back to PREV */
+ error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+
+ /*
+ * Switch out based on the FILLING and CONTIG state bits.
+ */
+ switch (state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) {
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+ RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left and right neighbors are both contiguous with new.
+ */
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ RIGHT.rm_startblock, RIGHT.rm_blockcount,
+ RIGHT.rm_owner, RIGHT.rm_offset,
+ RIGHT.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ PREV.rm_startblock, PREV.rm_blockcount,
+ PREV.rm_owner, PREV.rm_offset,
+ PREV.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW = LEFT;
+ NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left neighbor is contiguous, the right is not.
+ */
+ trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ PREV.rm_startblock, PREV.rm_blockcount,
+ PREV.rm_owner, PREV.rm_offset,
+ PREV.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW = LEFT;
+ NEW.rm_blockcount += PREV.rm_blockcount;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The right neighbor is contiguous, the left is not.
+ */
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ RIGHT.rm_startblock, RIGHT.rm_blockcount,
+ RIGHT.rm_owner, RIGHT.rm_offset,
+ RIGHT.rm_flags);
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ NEW = PREV;
+ NEW.rm_blockcount = len + RIGHT.rm_blockcount;
+ NEW.rm_flags = newext;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * Neither the left nor right neighbors are contiguous with
+ * the new one.
+ */
+ NEW = PREV;
+ NEW.rm_flags = newext;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is contiguous.
+ */
+ NEW = PREV;
+ NEW.rm_offset += len;
+ NEW.rm_startblock += len;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto done;
+ NEW = LEFT;
+ NEW.rm_blockcount += len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_LEFT_FILLING:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is not contiguous.
+ */
+ NEW = PREV;
+ NEW.rm_startblock += len;
+ NEW.rm_offset += len;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ NEW.rm_startblock = bno;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = offset;
+ NEW.rm_blockcount = len;
+ NEW.rm_flags = newext;
+ cur->bc_rec.r = NEW;
+ trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno,
+ len, owner, offset, newext);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ break;
+
+ case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is contiguous with the new allocation.
+ */
+ NEW = PREV;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ NEW = RIGHT;
+ NEW.rm_offset = offset;
+ NEW.rm_startblock = bno;
+ NEW.rm_blockcount += len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ break;
+
+ case RMAP_RIGHT_FILLING:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is not contiguous.
+ */
+ NEW = PREV;
+ NEW.rm_blockcount -= len;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ error = xfs_rmap_lookup_eq(cur, bno, len, owner, offset,
+ oldext, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ NEW.rm_startblock = bno;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = offset;
+ NEW.rm_blockcount = len;
+ NEW.rm_flags = newext;
+ cur->bc_rec.r = NEW;
+ trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno,
+ len, owner, offset, newext);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ break;
+
+ case 0:
+ /*
+ * Setting the middle part of a previous oldext extent to
+ * newext. Contiguity is impossible here.
+ * One extent becomes three extents.
+ */
+ /* new right extent - oldext */
+ NEW.rm_startblock = bno + len;
+ NEW.rm_owner = owner;
+ NEW.rm_offset = new_endoff;
+ NEW.rm_blockcount = PREV.rm_offset + PREV.rm_blockcount -
+ new_endoff;
+ NEW.rm_flags = PREV.rm_flags;
+ error = xfs_rmap_update(cur, &NEW);
+ if (error)
+ goto done;
+ /* new left extent - oldext */
+ NEW = PREV;
+ NEW.rm_blockcount = offset - PREV.rm_offset;
+ cur->bc_rec.r = NEW;
+ trace_xfs_rmap_insert(mp, cur->bc_private.a.agno,
+ NEW.rm_startblock, NEW.rm_blockcount,
+ NEW.rm_owner, NEW.rm_offset,
+ NEW.rm_flags);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ /*
+ * Reset the cursor to the position of the new extent
+ * we are about to insert as we can't trust it after
+ * the previous insert.
+ */
+ error = xfs_rmap_lookup_eq(cur, bno, len, owner, offset,
+ oldext, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ /* new middle extent - newext */
+ cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN;
+ cur->bc_rec.r.rm_flags |= newext;
+ trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len,
+ owner, offset, newext);
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ break;
+
+ case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_LEFT_FILLING | RMAP_RIGHT_CONTIG:
+ case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+ case RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+ case RMAP_LEFT_CONTIG:
+ case RMAP_RIGHT_CONTIG:
+ /*
+ * These cases are all impossible.
+ */
+ ASSERT(0);
+ }
+
+ trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len,
+ unwritten, oinfo);
+done:
+ if (error)
+ trace_xfs_rmap_convert_error(cur->bc_mp,
+ cur->bc_private.a.agno, error, _RET_IP_);
+ return error;
+}
+
+#undef NEW
+#undef LEFT
+#undef RIGHT
+#undef PREV
+
+struct xfs_rmap_query_range_info {
+ xfs_rmap_query_range_fn fn;
+ void *priv;
+};
+
+/* Format btree record and pass to our callback. */
+STATIC int
+xfs_rmap_query_range_helper(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_rmap_query_range_info *query = priv;
+ struct xfs_rmap_irec irec;
+ int error;
+
+ error = xfs_rmap_btrec_to_irec(rec, &irec);
+ if (error)
+ return error;
+ return query->fn(cur, &irec, query->priv);
+}
+
+/* Find all rmaps between two keys. */
+int
+xfs_rmap_query_range(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *low_rec,
+ struct xfs_rmap_irec *high_rec,
+ xfs_rmap_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_irec low_brec;
+ union xfs_btree_irec high_brec;
+ struct xfs_rmap_query_range_info query;
+
+ low_brec.r = *low_rec;
+ high_brec.r = *high_rec;
+ query.priv = priv;
+ query.fn = fn;
+ return xfs_btree_query_range(cur, &low_brec, &high_brec,
+ xfs_rmap_query_range_helper, &query);
+}
+
+/* Clean up after calling xfs_rmap_finish_one. */
+void
+xfs_rmap_finish_one_cleanup(
+ struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur,
+ int error)
+{
+ struct xfs_buf *agbp;
+
+ if (rcur == NULL)
+ return;
+ agbp = rcur->bc_private.a.agbp;
+ xfs_btree_del_cursor(rcur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ if (error)
+ xfs_trans_brelse(tp, agbp);
+}
+
+/*
+ * Process one of the deferred rmap operations. We pass back the
+ * btree cursor to maintain our lock on the rmapbt between calls.
+ * This saves time and eliminates a buffer deadlock between the
+ * superblock and the AGF because we'll always grab them in the same
+ * order.
+ */
+int
+xfs_rmap_finish_one(
+ struct xfs_trans *tp,
+ enum xfs_rmap_intent_type type,
+ __uint64_t owner,
+ int whichfork,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount,
+ xfs_exntst_t state,
+ struct xfs_btree_cur **pcur)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *rcur;
+ struct xfs_buf *agbp = NULL;
+ int error = 0;
+ xfs_agnumber_t agno;
+ struct xfs_owner_info oinfo;
+ xfs_agblock_t bno;
+ bool unwritten;
+
+ agno = XFS_FSB_TO_AGNO(mp, startblock);
+ ASSERT(agno != NULLAGNUMBER);
+ bno = XFS_FSB_TO_AGBNO(mp, startblock);
+
+ trace_xfs_rmap_deferred(mp, agno, type, bno, owner, whichfork,
+ startoff, blockcount, state);
+
+ if (XFS_TEST_ERROR(false, mp,
+ XFS_ERRTAG_RMAP_FINISH_ONE,
+ XFS_RANDOM_RMAP_FINISH_ONE))
+ return -EIO;
+
+ /*
+ * If we haven't gotten a cursor or the cursor AG doesn't match
+ * the startblock, get one now.
+ */
+ rcur = *pcur;
+ if (rcur != NULL && rcur->bc_private.a.agno != agno) {
+ xfs_rmap_finish_one_cleanup(tp, rcur, 0);
+ rcur = NULL;
+ *pcur = NULL;
+ }
+ if (rcur == NULL) {
+ /*
+ * Refresh the freelist before we start changing the
+ * rmapbt, because a shape change could cause us to
+ * allocate blocks.
+ */
+ error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
+ if (error)
+ return error;
+ if (!agbp)
+ return -EFSCORRUPTED;
+
+ rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno);
+ if (!rcur) {
+ error = -ENOMEM;
+ goto out_cur;
+ }
+ }
+ *pcur = rcur;
+
+ xfs_rmap_ino_owner(&oinfo, owner, whichfork, startoff);
+ unwritten = state == XFS_EXT_UNWRITTEN;
+ bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, startblock);
+
+ switch (type) {
+ case XFS_RMAP_ALLOC:
+ case XFS_RMAP_MAP:
+ error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo);
+ break;
+ case XFS_RMAP_FREE:
+ case XFS_RMAP_UNMAP:
+ error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten,
+ &oinfo);
+ break;
+ case XFS_RMAP_CONVERT:
+ error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten,
+ &oinfo);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ }
+ return error;
+
+out_cur:
+ xfs_trans_brelse(tp, agbp);
+
+ return error;
+}
+
+/*
+ * Don't defer an rmap if we aren't an rmap filesystem.
+ */
+static bool
+xfs_rmap_update_is_needed(
+ struct xfs_mount *mp)
+{
+ return xfs_sb_version_hasrmapbt(&mp->m_sb);
+}
+
+/*
+ * Record a rmap intent; the list is kept sorted first by AG and then by
+ * increasing age.
+ */
+static int
+__xfs_rmap_add(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ enum xfs_rmap_intent_type type,
+ __uint64_t owner,
+ int whichfork,
+ struct xfs_bmbt_irec *bmap)
+{
+ struct xfs_rmap_intent *ri;
+
+ trace_xfs_rmap_defer(mp, XFS_FSB_TO_AGNO(mp, bmap->br_startblock),
+ type,
+ XFS_FSB_TO_AGBNO(mp, bmap->br_startblock),
+ owner, whichfork,
+ bmap->br_startoff,
+ bmap->br_blockcount,
+ bmap->br_state);
+
+ ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_SLEEP | KM_NOFS);
+ INIT_LIST_HEAD(&ri->ri_list);
+ ri->ri_type = type;
+ ri->ri_owner = owner;
+ ri->ri_whichfork = whichfork;
+ ri->ri_bmap = *bmap;
+
+ xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list);
+ return 0;
+}
+
+/* Map an extent into a file. */
+int
+xfs_rmap_map_extent(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *PREV)
+{
+ if (!xfs_rmap_update_is_needed(mp))
+ return 0;
+
+ return __xfs_rmap_add(mp, dfops, XFS_RMAP_MAP, ip->i_ino,
+ whichfork, PREV);
+}
+
+/* Unmap an extent out of a file. */
+int
+xfs_rmap_unmap_extent(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *PREV)
+{
+ if (!xfs_rmap_update_is_needed(mp))
+ return 0;
+
+ return __xfs_rmap_add(mp, dfops, XFS_RMAP_UNMAP, ip->i_ino,
+ whichfork, PREV);
+}
+
+/* Convert a data fork extent from unwritten to real or vice versa. */
+int
+xfs_rmap_convert_extent(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *PREV)
+{
+ if (!xfs_rmap_update_is_needed(mp))
+ return 0;
+
+ return __xfs_rmap_add(mp, dfops, XFS_RMAP_CONVERT, ip->i_ino,
+ whichfork, PREV);
+}
+
+/* Schedule the creation of an rmap for non-file data. */
+int
+xfs_rmap_alloc_extent(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ __uint64_t owner)
+{
+ struct xfs_bmbt_irec bmap;
+
+ if (!xfs_rmap_update_is_needed(mp))
+ return 0;
+
+ bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno);
+ bmap.br_blockcount = len;
+ bmap.br_startoff = 0;
+ bmap.br_state = XFS_EXT_NORM;
+
+ return __xfs_rmap_add(mp, dfops, XFS_RMAP_ALLOC, owner,
+ XFS_DATA_FORK, &bmap);
+}
+
+/* Schedule the deletion of an rmap for non-file data. */
+int
+xfs_rmap_free_extent(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ __uint64_t owner)
+{
+ struct xfs_bmbt_irec bmap;
+
+ if (!xfs_rmap_update_is_needed(mp))
+ return 0;
+
+ bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno);
+ bmap.br_blockcount = len;
+ bmap.br_startoff = 0;
+ bmap.br_state = XFS_EXT_NORM;
+
+ return __xfs_rmap_add(mp, dfops, XFS_RMAP_FREE, owner,
+ XFS_DATA_FORK, &bmap);
+}
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
new file mode 100644
index 000000000000..71cf99a4acba
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright (C) 2016 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_RMAP_H__
+#define __XFS_RMAP_H__
+
+static inline void
+xfs_rmap_ag_owner(
+ struct xfs_owner_info *oi,
+ uint64_t owner)
+{
+ oi->oi_owner = owner;
+ oi->oi_offset = 0;
+ oi->oi_flags = 0;
+}
+
+static inline void
+xfs_rmap_ino_bmbt_owner(
+ struct xfs_owner_info *oi,
+ xfs_ino_t ino,
+ int whichfork)
+{
+ oi->oi_owner = ino;
+ oi->oi_offset = 0;
+ oi->oi_flags = XFS_OWNER_INFO_BMBT_BLOCK;
+ if (whichfork == XFS_ATTR_FORK)
+ oi->oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+}
+
+static inline void
+xfs_rmap_ino_owner(
+ struct xfs_owner_info *oi,
+ xfs_ino_t ino,
+ int whichfork,
+ xfs_fileoff_t offset)
+{
+ oi->oi_owner = ino;
+ oi->oi_offset = offset;
+ oi->oi_flags = 0;
+ if (whichfork == XFS_ATTR_FORK)
+ oi->oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+}
+
+static inline void
+xfs_rmap_skip_owner_update(
+ struct xfs_owner_info *oi)
+{
+ oi->oi_owner = XFS_RMAP_OWN_UNKNOWN;
+}
+
+/* Reverse mapping functions. */
+
+struct xfs_buf;
+
+static inline __u64
+xfs_rmap_irec_offset_pack(
+ const struct xfs_rmap_irec *irec)
+{
+ __u64 x;
+
+ x = XFS_RMAP_OFF(irec->rm_offset);
+ if (irec->rm_flags & XFS_RMAP_ATTR_FORK)
+ x |= XFS_RMAP_OFF_ATTR_FORK;
+ if (irec->rm_flags & XFS_RMAP_BMBT_BLOCK)
+ x |= XFS_RMAP_OFF_BMBT_BLOCK;
+ if (irec->rm_flags & XFS_RMAP_UNWRITTEN)
+ x |= XFS_RMAP_OFF_UNWRITTEN;
+ return x;
+}
+
+static inline int
+xfs_rmap_irec_offset_unpack(
+ __u64 offset,
+ struct xfs_rmap_irec *irec)
+{
+ if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS))
+ return -EFSCORRUPTED;
+ irec->rm_offset = XFS_RMAP_OFF(offset);
+ if (offset & XFS_RMAP_OFF_ATTR_FORK)
+ irec->rm_flags |= XFS_RMAP_ATTR_FORK;
+ if (offset & XFS_RMAP_OFF_BMBT_BLOCK)
+ irec->rm_flags |= XFS_RMAP_BMBT_BLOCK;
+ if (offset & XFS_RMAP_OFF_UNWRITTEN)
+ irec->rm_flags |= XFS_RMAP_UNWRITTEN;
+ return 0;
+}
+
+static inline void
+xfs_owner_info_unpack(
+ struct xfs_owner_info *oinfo,
+ uint64_t *owner,
+ uint64_t *offset,
+ unsigned int *flags)
+{
+ unsigned int r = 0;
+
+ *owner = oinfo->oi_owner;
+ *offset = oinfo->oi_offset;
+ if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+ r |= XFS_RMAP_ATTR_FORK;
+ if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+ r |= XFS_RMAP_BMBT_BLOCK;
+ *flags = r;
+}
+
+static inline void
+xfs_owner_info_pack(
+ struct xfs_owner_info *oinfo,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ oinfo->oi_owner = owner;
+ oinfo->oi_offset = XFS_RMAP_OFF(offset);
+ oinfo->oi_flags = 0;
+ if (flags & XFS_RMAP_ATTR_FORK)
+ oinfo->oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+ if (flags & XFS_RMAP_BMBT_BLOCK)
+ oinfo->oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
+}
+
+int xfs_rmap_alloc(struct xfs_trans *tp, struct xfs_buf *agbp,
+ xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
+ struct xfs_owner_info *oinfo);
+int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp,
+ xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
+ struct xfs_owner_info *oinfo);
+
+int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ xfs_extlen_t len, uint64_t owner, uint64_t offset,
+ unsigned int flags, int *stat);
+int xfs_rmap_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ xfs_extlen_t len, uint64_t owner, uint64_t offset,
+ unsigned int flags, int *stat);
+int xfs_rmap_insert(struct xfs_btree_cur *rcur, xfs_agblock_t agbno,
+ xfs_extlen_t len, uint64_t owner, uint64_t offset,
+ unsigned int flags);
+int xfs_rmap_get_rec(struct xfs_btree_cur *cur, struct xfs_rmap_irec *irec,
+ int *stat);
+
+typedef int (*xfs_rmap_query_range_fn)(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv);
+
+int xfs_rmap_query_range(struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *low_rec, struct xfs_rmap_irec *high_rec,
+ xfs_rmap_query_range_fn fn, void *priv);
+
+enum xfs_rmap_intent_type {
+ XFS_RMAP_MAP,
+ XFS_RMAP_MAP_SHARED,
+ XFS_RMAP_UNMAP,
+ XFS_RMAP_UNMAP_SHARED,
+ XFS_RMAP_CONVERT,
+ XFS_RMAP_CONVERT_SHARED,
+ XFS_RMAP_ALLOC,
+ XFS_RMAP_FREE,
+};
+
+struct xfs_rmap_intent {
+ struct list_head ri_list;
+ enum xfs_rmap_intent_type ri_type;
+ __uint64_t ri_owner;
+ int ri_whichfork;
+ struct xfs_bmbt_irec ri_bmap;
+};
+
+/* functions for updating the rmapbt based on bmbt map/unmap operations */
+int xfs_rmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip, int whichfork,
+ struct xfs_bmbt_irec *imap);
+int xfs_rmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip, int whichfork,
+ struct xfs_bmbt_irec *imap);
+int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip, int whichfork,
+ struct xfs_bmbt_irec *imap);
+int xfs_rmap_alloc_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+ xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
+ __uint64_t owner);
+int xfs_rmap_free_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+ xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
+ __uint64_t owner);
+
+void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur, int error);
+int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type,
+ __uint64_t owner, int whichfork, xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock, xfs_filblks_t blockcount,
+ xfs_exntst_t state, struct xfs_btree_cur **pcur);
+
+#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
new file mode 100644
index 000000000000..bc1faebc84ec
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_error.h"
+#include "xfs_extent_busy.h"
+
+/*
+ * Reverse map btree.
+ *
+ * This is a per-ag tree used to track the owner(s) of a given extent. With
+ * reflink it is possible for there to be multiple owners, which is a departure
+ * from classic XFS. Owner records for data extents are inserted when the
+ * extent is mapped and removed when an extent is unmapped. Owner records for
+ * all other block types (i.e. metadata) are inserted when an extent is
+ * allocated and removed when an extent is freed. There can only be one owner
+ * of a metadata extent, usually an inode or some other metadata structure like
+ * an AG btree.
+ *
+ * The rmap btree is part of the free space management, so blocks for the tree
+ * are sourced from the agfl. Hence we need transaction reservation support for
+ * this tree so that the freelist is always large enough. This also impacts on
+ * the minimum space we need to leave free in the AG.
+ *
+ * The tree is ordered by [ag block, owner, offset]. This is a large key size,
+ * but it is the only way to enforce unique keys when a block can be owned by
+ * multiple files at any offset. There's no need to order/search by extent
+ * size for online updating/management of the tree. It is intended that most
+ * reverse lookups will be to find the owner(s) of a particular block, or to
+ * try to recover tree and file data from corrupt primary metadata.
+ */
+
+static struct xfs_btree_cur *
+xfs_rmapbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_private.a.agbp, cur->bc_private.a.agno);
+}
+
+STATIC void
+xfs_rmapbt_set_root(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int inc)
+{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
+ int btnum = cur->bc_btnum;
+ struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
+
+ ASSERT(ptr->s != 0);
+
+ agf->agf_roots[btnum] = ptr->s;
+ be32_add_cpu(&agf->agf_levels[btnum], inc);
+ pag->pagf_levels[btnum] += inc;
+ xfs_perag_put(pag);
+
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+}
+
+STATIC int
+xfs_rmapbt_alloc_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ int error;
+ xfs_agblock_t bno;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+ /* Allocate the new block from the freelist. If we can't, give up. */
+ error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+ &bno, 1);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+ }
+
+ trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
+ bno, 1);
+ if (bno == NULLAGBLOCK) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+
+ xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1,
+ false);
+
+ xfs_trans_agbtree_delta(cur->bc_tp, 1);
+ new->s = cpu_to_be32(bno);
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+}
+
+STATIC int
+xfs_rmapbt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ xfs_agblock_t bno;
+ int error;
+
+ bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
+ trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
+ bno, 1);
+ error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
+ if (error)
+ return error;
+
+ xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
+ XFS_EXTENT_BUSY_SKIP_DISCARD);
+ xfs_trans_agbtree_delta(cur->bc_tp, -1);
+
+ return 0;
+}
+
+STATIC int
+xfs_rmapbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_rmap_mnr[level != 0];
+}
+
+STATIC int
+xfs_rmapbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_rmap_mxr[level != 0];
+}
+
+STATIC void
+xfs_rmapbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ key->rmap.rm_startblock = rec->rmap.rm_startblock;
+ key->rmap.rm_owner = rec->rmap.rm_owner;
+ key->rmap.rm_offset = rec->rmap.rm_offset;
+}
+
+/*
+ * The high key for a reverse mapping record can be computed by shifting
+ * the startblock and offset to the highest value that would still map
+ * to that record. In practice this means that we add blockcount-1 to
+ * the startblock for all records, and if the record is for a data/attr
+ * fork mapping, we add blockcount-1 to the offset too.
+ */
+STATIC void
+xfs_rmapbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ __uint64_t off;
+ int adj;
+
+ adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
+
+ key->rmap.rm_startblock = rec->rmap.rm_startblock;
+ be32_add_cpu(&key->rmap.rm_startblock, adj);
+ key->rmap.rm_owner = rec->rmap.rm_owner;
+ key->rmap.rm_offset = rec->rmap.rm_offset;
+ if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
+ XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
+ return;
+ off = be64_to_cpu(key->rmap.rm_offset);
+ off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
+ key->rmap.rm_offset = cpu_to_be64(off);
+}
+
+STATIC void
+xfs_rmapbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
+ rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
+ rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
+ rec->rmap.rm_offset = cpu_to_be64(
+ xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
+}
+
+STATIC void
+xfs_rmapbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+
+ ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(agf->agf_roots[cur->bc_btnum] != 0);
+
+ ptr->s = agf->agf_roots[cur->bc_btnum];
+}
+
+STATIC __int64_t
+xfs_rmapbt_key_diff(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
+{
+ struct xfs_rmap_irec *rec = &cur->bc_rec.r;
+ struct xfs_rmap_key *kp = &key->rmap;
+ __u64 x, y;
+ __int64_t d;
+
+ d = (__int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
+ if (d)
+ return d;
+
+ x = be64_to_cpu(kp->rm_owner);
+ y = rec->rm_owner;
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+
+ x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset));
+ y = rec->rm_offset;
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+ return 0;
+}
+
+STATIC __int64_t
+xfs_rmapbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ struct xfs_rmap_key *kp1 = &k1->rmap;
+ struct xfs_rmap_key *kp2 = &k2->rmap;
+ __int64_t d;
+ __u64 x, y;
+
+ d = (__int64_t)be32_to_cpu(kp1->rm_startblock) -
+ be32_to_cpu(kp2->rm_startblock);
+ if (d)
+ return d;
+
+ x = be64_to_cpu(kp1->rm_owner);
+ y = be64_to_cpu(kp2->rm_owner);
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+
+ x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset));
+ y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset));
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+ return 0;
+}
+
+static bool
+xfs_rmapbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+ unsigned int level;
+
+ /*
+ * magic number and level verification
+ *
+ * During growfs operations, we can't verify the exact level or owner as
+ * the perag is not fully initialised and hence not attached to the
+ * buffer. In this case, check against the maximum tree depth.
+ *
+ * Similarly, during log recovery we will have a perag structure
+ * attached, but the agf information will not yet have been initialised
+ * from the on disk AGF. Again, we can only check against maximum limits
+ * in this case.
+ */
+ if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC))
+ return false;
+
+ if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return false;
+ if (!xfs_btree_sblock_v5hdr_verify(bp))
+ return false;
+
+ level = be16_to_cpu(block->bb_level);
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi])
+ return false;
+ } else if (level >= mp->m_rmap_maxlevels)
+ return false;
+
+ return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]);
+}
+
+static void
+xfs_rmapbt_read_verify(
+ struct xfs_buf *bp)
+{
+ if (!xfs_btree_sblock_verify_crc(bp))
+ xfs_buf_ioerror(bp, -EFSBADCRC);
+ else if (!xfs_rmapbt_verify(bp))
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+ if (bp->b_error) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp);
+ }
+}
+
+static void
+xfs_rmapbt_write_verify(
+ struct xfs_buf *bp)
+{
+ if (!xfs_rmapbt_verify(bp)) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+ xfs_btree_sblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
+ .name = "xfs_rmapbt",
+ .verify_read = xfs_rmapbt_read_verify,
+ .verify_write = xfs_rmapbt_write_verify,
+};
+
+#if defined(DEBUG) || defined(XFS_WARN)
+STATIC int
+xfs_rmapbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ __uint32_t x;
+ __uint32_t y;
+ __uint64_t a;
+ __uint64_t b;
+
+ x = be32_to_cpu(k1->rmap.rm_startblock);
+ y = be32_to_cpu(k2->rmap.rm_startblock);
+ if (x < y)
+ return 1;
+ else if (x > y)
+ return 0;
+ a = be64_to_cpu(k1->rmap.rm_owner);
+ b = be64_to_cpu(k2->rmap.rm_owner);
+ if (a < b)
+ return 1;
+ else if (a > b)
+ return 0;
+ a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset));
+ b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset));
+ if (a <= b)
+ return 1;
+ return 0;
+}
+
+STATIC int
+xfs_rmapbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *r1,
+ union xfs_btree_rec *r2)
+{
+ __uint32_t x;
+ __uint32_t y;
+ __uint64_t a;
+ __uint64_t b;
+
+ x = be32_to_cpu(r1->rmap.rm_startblock);
+ y = be32_to_cpu(r2->rmap.rm_startblock);
+ if (x < y)
+ return 1;
+ else if (x > y)
+ return 0;
+ a = be64_to_cpu(r1->rmap.rm_owner);
+ b = be64_to_cpu(r2->rmap.rm_owner);
+ if (a < b)
+ return 1;
+ else if (a > b)
+ return 0;
+ a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset));
+ b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset));
+ if (a <= b)
+ return 1;
+ return 0;
+}
+#endif /* DEBUG */
+
+static const struct xfs_btree_ops xfs_rmapbt_ops = {
+ .rec_len = sizeof(struct xfs_rmap_rec),
+ .key_len = 2 * sizeof(struct xfs_rmap_key),
+
+ .dup_cursor = xfs_rmapbt_dup_cursor,
+ .set_root = xfs_rmapbt_set_root,
+ .alloc_block = xfs_rmapbt_alloc_block,
+ .free_block = xfs_rmapbt_free_block,
+ .get_minrecs = xfs_rmapbt_get_minrecs,
+ .get_maxrecs = xfs_rmapbt_get_maxrecs,
+ .init_key_from_rec = xfs_rmapbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur,
+ .key_diff = xfs_rmapbt_key_diff,
+ .buf_ops = &xfs_rmapbt_buf_ops,
+ .diff_two_keys = xfs_rmapbt_diff_two_keys,
+#if defined(DEBUG) || defined(XFS_WARN)
+ .keys_inorder = xfs_rmapbt_keys_inorder,
+ .recs_inorder = xfs_rmapbt_recs_inorder,
+#endif
+};
+
+/*
+ * Allocate a new allocation btree cursor.
+ */
+struct xfs_btree_cur *
+xfs_rmapbt_init_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_btree_cur *cur;
+
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ /* Overlapping btree; 2 keys per pointer. */
+ cur->bc_btnum = XFS_BTNUM_RMAP;
+ cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
+ cur->bc_blocklog = mp->m_sb.sb_blocklog;
+ cur->bc_ops = &xfs_rmapbt_ops;
+ cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
+
+ cur->bc_private.a.agbp = agbp;
+ cur->bc_private.a.agno = agno;
+
+ return cur;
+}
+
+/*
+ * Calculate number of records in an rmap btree block.
+ */
+int
+xfs_rmapbt_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ int leaf)
+{
+ blocklen -= XFS_RMAP_BLOCK_LEN;
+
+ if (leaf)
+ return blocklen / sizeof(struct xfs_rmap_rec);
+ return blocklen /
+ (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
+}
+
+/* Compute the maximum height of an rmap btree. */
+void
+xfs_rmapbt_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp,
+ mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+}
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
new file mode 100644
index 000000000000..e73a55357dab
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_RMAP_BTREE_H__
+#define __XFS_RMAP_BTREE_H__
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+
+/* rmaps only exist on crc enabled filesystems */
+#define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_RMAP_REC_ADDR(block, index) \
+ ((struct xfs_rmap_rec *) \
+ ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+ (((index) - 1) * sizeof(struct xfs_rmap_rec))))
+
+#define XFS_RMAP_KEY_ADDR(block, index) \
+ ((struct xfs_rmap_key *) \
+ ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+ ((index) - 1) * 2 * sizeof(struct xfs_rmap_key)))
+
+#define XFS_RMAP_HIGH_KEY_ADDR(block, index) \
+ ((struct xfs_rmap_key *) \
+ ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+ sizeof(struct xfs_rmap_key) + \
+ ((index) - 1) * 2 * sizeof(struct xfs_rmap_key)))
+
+#define XFS_RMAP_PTR_ADDR(block, index, maxrecs) \
+ ((xfs_rmap_ptr_t *) \
+ ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+ (maxrecs) * 2 * sizeof(struct xfs_rmap_key) + \
+ ((index) - 1) * sizeof(xfs_rmap_ptr_t)))
+
+struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_buf *bp,
+ xfs_agnumber_t agno);
+int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf);
+extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
+
+#endif /* __XFS_RMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 12ca86778e02..0e3d4f5ec33c 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -24,6 +24,7 @@
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
@@ -36,6 +37,7 @@
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_log.h"
+#include "xfs_rmap_btree.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -729,6 +731,11 @@ xfs_sb_mount_common(
mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
+ mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, 1);
+ mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, 0);
+ mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2;
+ mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2;
+
mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
sbp->sb_inopblock);
@@ -738,6 +745,8 @@ xfs_sb_mount_common(
mp->m_ialloc_min_blks = sbp->sb_spino_align;
else
mp->m_ialloc_min_blks = mp->m_ialloc_blks;
+ mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
+ mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp);
}
/*
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 16002b5ec4eb..0c5b30bd884c 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -38,6 +38,7 @@ extern const struct xfs_buf_ops xfs_agi_buf_ops;
extern const struct xfs_buf_ops xfs_agf_buf_ops;
extern const struct xfs_buf_ops xfs_agfl_buf_ops;
extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
@@ -116,6 +117,7 @@ int xfs_log_calc_minimum_size(struct xfs_mount *);
#define XFS_INO_BTREE_REF 3
#define XFS_ALLOC_BTREE_REF 2
#define XFS_BMAP_BTREE_REF 2
+#define XFS_RMAP_BTREE_REF 2
#define XFS_DIR_BTREE_REF 2
#define XFS_INO_REF 2
#define XFS_ATTR_BTREE_REF 1
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 68cb1e7bf2bb..301ef2f4dbd6 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -64,6 +64,30 @@ xfs_calc_buf_res(
}
/*
+ * Per-extent log reservation for the btree changes involved in freeing or
+ * allocating an extent. In classic XFS there were two trees that will be
+ * modified (bnobt + cntbt). With rmap enabled, there are three trees
+ * (rmapbt). The number of blocks reserved is based on the formula:
+ *
+ * num trees * ((2 blocks/level * max depth) - 1)
+ *
+ * Keep in mind that max depth is calculated separately for each type of tree.
+ */
+static uint
+xfs_allocfree_log_count(
+ struct xfs_mount *mp,
+ uint num_ops)
+{
+ uint blocks;
+
+ blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1);
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1);
+
+ return blocks;
+}
+
+/*
* Logging inodes is really tricksy. They are logged in memory format,
* which means that what we write into the log doesn't directly translate into
* the amount of space they use on disk.
@@ -126,7 +150,7 @@ xfs_calc_inode_res(
*/
STATIC uint
xfs_calc_finobt_res(
- struct xfs_mount *mp,
+ struct xfs_mount *mp,
int alloc,
int modify)
{
@@ -137,7 +161,7 @@ xfs_calc_finobt_res(
res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1));
if (alloc)
- res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ res += xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
if (modify)
res += (uint)XFS_FSB_TO_B(mp, 1);
@@ -153,9 +177,9 @@ xfs_calc_finobt_res(
* item logged to try to account for the overhead of the transaction mechanism.
*
* Note: Most of the reservations underestimate the number of allocation
- * groups into which they could free extents in the xfs_bmap_finish() call.
+ * groups into which they could free extents in the xfs_defer_finish() call.
* This is because the number in the worst case is quite high and quite
- * unusual. In order to fix this we need to change xfs_bmap_finish() to free
+ * unusual. In order to fix this we need to change xfs_defer_finish() to free
* extents in only a single AG at a time. This will require changes to the
* EFI code as well, however, so that the EFI for the extents not freed is
* logged again in each transaction. See SGI PV #261917.
@@ -188,10 +212,10 @@ xfs_calc_write_reservation(
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
XFS_FSB_TO_B(mp, 1)) +
xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
XFS_FSB_TO_B(mp, 1))));
}
@@ -217,10 +241,10 @@ xfs_calc_itruncate_reservation(
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4),
XFS_FSB_TO_B(mp, 1)) +
xfs_calc_buf_res(5, 0) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
XFS_FSB_TO_B(mp, 1)) +
xfs_calc_buf_res(2 + mp->m_ialloc_blks +
mp->m_in_maxlevels, 0)));
@@ -247,7 +271,7 @@ xfs_calc_rename_reservation(
xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 3),
XFS_FSB_TO_B(mp, 1))));
}
@@ -286,7 +310,7 @@ xfs_calc_link_reservation(
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
XFS_FSB_TO_B(mp, 1))));
}
@@ -324,7 +348,7 @@ xfs_calc_remove_reservation(
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
XFS_FSB_TO_B(mp, 1))));
}
@@ -371,7 +395,7 @@ xfs_calc_create_resv_alloc(
mp->m_sb.sb_sectsize +
xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) +
xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -399,7 +423,7 @@ xfs_calc_icreate_resv_alloc(
return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
mp->m_sb.sb_sectsize +
xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
XFS_FSB_TO_B(mp, 1)) +
xfs_calc_finobt_res(mp, 0, 0);
}
@@ -483,7 +507,7 @@ xfs_calc_ifree_reservation(
xfs_calc_buf_res(1, 0) +
xfs_calc_buf_res(2 + mp->m_ialloc_blks +
mp->m_in_maxlevels, 0) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
XFS_FSB_TO_B(mp, 1)) +
xfs_calc_finobt_res(mp, 0, 1);
}
@@ -513,7 +537,7 @@ xfs_calc_growdata_reservation(
struct xfs_mount *mp)
{
return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -535,7 +559,7 @@ xfs_calc_growrtalloc_reservation(
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
XFS_FSB_TO_B(mp, 1)) +
xfs_calc_inode_res(mp, 1) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -611,7 +635,7 @@ xfs_calc_addafork_reservation(
xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -634,7 +658,7 @@ xfs_calc_attrinval_reservation(
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4),
XFS_FSB_TO_B(mp, 1))));
}
@@ -701,7 +725,7 @@ xfs_calc_attrrm_reservation(
XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
(xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
XFS_FSB_TO_B(mp, 1))));
}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 797815012c0e..0eb46ed6d404 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -68,16 +68,6 @@ struct xfs_trans_resv {
#define M_RES(mp) (&(mp)->m_resv)
/*
- * Per-extent log reservation for the allocation btree changes
- * involved in freeing or allocating an extent.
- * 2 trees * (2 blocks/level * max depth - 1) * block size
- */
-#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
- ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * (mp)->m_ag_maxlevels - 1)))
-#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
- ((nx) * (2 * (2 * (mp)->m_ag_maxlevels - 1)))
-
-/*
* Per-directory log reservation for any directory change.
* dir blocks: (1 btree block per level + data block + free block) * dblock size
* bmap btree: (levels + 2) * max depth * block size
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index b79dc66b2ecd..3d503647f26b 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -108,8 +108,8 @@ typedef enum {
} xfs_lookup_t;
typedef enum {
- XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi,
- XFS_BTNUM_FINOi, XFS_BTNUM_MAX
+ XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
+ XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_MAX
} xfs_btnum_t;
struct xfs_name {