summaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/kmem.c79
-rw-r--r--fs/xfs/kmem.h15
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h7
-rw-r--r--fs/xfs/libxfs/xfs_attr.c79
-rw-r--r--fs/xfs/libxfs/xfs_attr.h6
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c130
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c114
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h11
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c16
-rw-r--r--fs/xfs/libxfs/xfs_btree.c14
-rw-r--r--fs/xfs/libxfs/xfs_btree.h10
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c25
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h4
-rw-r--r--fs/xfs/libxfs/xfs_defer.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c14
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c681
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c8
-rw-r--r--fs/xfs/libxfs/xfs_fs.h2
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c9
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c8
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c16
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c50
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h12
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c59
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h11
-rw-r--r--fs/xfs/libxfs/xfs_shared.h6
-rw-r--r--fs/xfs/libxfs/xfs_types.h8
-rw-r--r--fs/xfs/scrub/agheader.c4
-rw-r--r--fs/xfs/scrub/attr.c6
-rw-r--r--fs/xfs/scrub/bmap.c81
-rw-r--r--fs/xfs/scrub/dabtree.c6
-rw-r--r--fs/xfs/scrub/fscounters.c2
-rw-r--r--fs/xfs/scrub/repair.c6
-rw-r--r--fs/xfs/scrub/symlink.c2
-rw-r--r--fs/xfs/xfs_acl.c14
-rw-r--r--fs/xfs/xfs_attr_inactive.c2
-rw-r--r--fs/xfs/xfs_attr_list.c2
-rw-r--r--fs/xfs/xfs_bmap_item.c8
-rw-r--r--fs/xfs/xfs_bmap_util.c22
-rw-r--r--fs/xfs/xfs_buf.c7
-rw-r--r--fs/xfs/xfs_buf.h6
-rw-r--r--fs/xfs/xfs_buf_item.c4
-rw-r--r--fs/xfs/xfs_dquot.c4
-rw-r--r--fs/xfs/xfs_dquot_item.c2
-rw-r--r--fs/xfs/xfs_error.c2
-rw-r--r--fs/xfs/xfs_extent_busy.c2
-rw-r--r--fs/xfs/xfs_extfree_item.c8
-rw-r--r--fs/xfs/xfs_file.c26
-rw-r--r--fs/xfs/xfs_fsmap.c12
-rw-r--r--fs/xfs/xfs_icache.c2
-rw-r--r--fs/xfs/xfs_icreate_item.c2
-rw-r--r--fs/xfs/xfs_inode.c85
-rw-r--r--fs/xfs/xfs_inode_item.c2
-rw-r--r--fs/xfs/xfs_ioctl.c27
-rw-r--r--fs/xfs/xfs_ioctl32.c58
-rw-r--r--fs/xfs/xfs_iomap.c6
-rw-r--r--fs/xfs/xfs_iops.c1
-rw-r--r--fs/xfs/xfs_itable.c13
-rw-r--r--fs/xfs/xfs_itable.h13
-rw-r--r--fs/xfs/xfs_iwalk.c4
-rw-r--r--fs/xfs/xfs_iwalk.h13
-rw-r--r--fs/xfs/xfs_log.c469
-rw-r--r--fs/xfs/xfs_log_cil.c10
-rw-r--r--fs/xfs/xfs_log_recover.c50
-rw-r--r--fs/xfs/xfs_mount.c4
-rw-r--r--fs/xfs/xfs_mount.h7
-rw-r--r--fs/xfs/xfs_mru_cache.c4
-rw-r--r--fs/xfs/xfs_pnfs.c2
-rw-r--r--fs/xfs/xfs_qm.c4
-rw-r--r--fs/xfs/xfs_refcount_item.c16
-rw-r--r--fs/xfs/xfs_reflink.c86
-rw-r--r--fs/xfs/xfs_rmap_item.c6
-rw-r--r--fs/xfs/xfs_rtalloc.c4
-rw-r--r--fs/xfs/xfs_super.c5
-rw-r--r--fs/xfs/xfs_trace.h34
-rw-r--r--fs/xfs/xfs_trans.c4
-rw-r--r--fs/xfs/xfs_trans_dquot.c2
-rw-r--r--fs/xfs/xfs_xattr.c2
81 files changed, 1383 insertions, 1182 deletions
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 16bb9a328678..da031b93e182 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -3,10 +3,10 @@
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
* All Rights Reserved.
*/
-#include <linux/sched/mm.h>
+#include "xfs.h"
#include <linux/backing-dev.h>
-#include "kmem.h"
#include "xfs_message.h"
+#include "xfs_trace.h"
void *
kmem_alloc(size_t size, xfs_km_flags_t flags)
@@ -15,9 +15,11 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
gfp_t lflags = kmem_flags_convert(flags);
void *ptr;
+ trace_kmem_alloc(size, flags, _RET_IP_);
+
do {
ptr = kmalloc(size, lflags);
- if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+ if (ptr || (flags & KM_MAYFAIL))
return ptr;
if (!(++retries % 100))
xfs_err(NULL,
@@ -28,28 +30,24 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
} while (1);
}
-void *
-kmem_alloc_large(size_t size, xfs_km_flags_t flags)
+
+/*
+ * __vmalloc() will allocate data pages and auxillary structures (e.g.
+ * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence
+ * we need to tell memory reclaim that we are in such a context via
+ * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here
+ * and potentially deadlocking.
+ */
+static void *
+__kmem_vmalloc(size_t size, xfs_km_flags_t flags)
{
unsigned nofs_flag = 0;
void *ptr;
- gfp_t lflags;
-
- ptr = kmem_alloc(size, flags | KM_MAYFAIL);
- if (ptr)
- return ptr;
+ gfp_t lflags = kmem_flags_convert(flags);
- /*
- * __vmalloc() will allocate data pages and auxillary structures (e.g.
- * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
- * here. Hence we need to tell memory reclaim that we are in such a
- * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering
- * the filesystem here and potentially deadlocking.
- */
if (flags & KM_NOFS)
nofs_flag = memalloc_nofs_save();
- lflags = kmem_flags_convert(flags);
ptr = __vmalloc(size, lflags, PAGE_KERNEL);
if (flags & KM_NOFS)
@@ -58,6 +56,44 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags)
return ptr;
}
+/*
+ * Same as kmem_alloc_large, except we guarantee the buffer returned is aligned
+ * to the @align_mask. We only guarantee alignment up to page size, we'll clamp
+ * alignment at page size if it is larger. vmalloc always returns a PAGE_SIZE
+ * aligned region.
+ */
+void *
+kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags)
+{
+ void *ptr;
+
+ trace_kmem_alloc_io(size, flags, _RET_IP_);
+
+ if (WARN_ON_ONCE(align_mask >= PAGE_SIZE))
+ align_mask = PAGE_SIZE - 1;
+
+ ptr = kmem_alloc(size, flags | KM_MAYFAIL);
+ if (ptr) {
+ if (!((uintptr_t)ptr & align_mask))
+ return ptr;
+ kfree(ptr);
+ }
+ return __kmem_vmalloc(size, flags);
+}
+
+void *
+kmem_alloc_large(size_t size, xfs_km_flags_t flags)
+{
+ void *ptr;
+
+ trace_kmem_alloc_large(size, flags, _RET_IP_);
+
+ ptr = kmem_alloc(size, flags | KM_MAYFAIL);
+ if (ptr)
+ return ptr;
+ return __kmem_vmalloc(size, flags);
+}
+
void *
kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
{
@@ -65,9 +101,11 @@ kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
gfp_t lflags = kmem_flags_convert(flags);
void *ptr;
+ trace_kmem_realloc(newsize, flags, _RET_IP_);
+
do {
ptr = krealloc(old, newsize, lflags);
- if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+ if (ptr || (flags & KM_MAYFAIL))
return ptr;
if (!(++retries % 100))
xfs_err(NULL,
@@ -85,9 +123,10 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
gfp_t lflags = kmem_flags_convert(flags);
void *ptr;
+ trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_);
do {
ptr = kmem_cache_alloc(zone, lflags);
- if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+ if (ptr || (flags & KM_MAYFAIL))
return ptr;
if (!(++retries % 100))
xfs_err(NULL,
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 267655acd426..8170d95cf930 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -16,8 +16,6 @@
*/
typedef unsigned __bitwise xfs_km_flags_t;
-#define KM_SLEEP ((__force xfs_km_flags_t)0x0001u)
-#define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u)
#define KM_NOFS ((__force xfs_km_flags_t)0x0004u)
#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u)
#define KM_ZERO ((__force xfs_km_flags_t)0x0010u)
@@ -32,15 +30,11 @@ kmem_flags_convert(xfs_km_flags_t flags)
{
gfp_t lflags;
- BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO));
+ BUG_ON(flags & ~(KM_NOFS|KM_MAYFAIL|KM_ZERO));
- if (flags & KM_NOSLEEP) {
- lflags = GFP_ATOMIC | __GFP_NOWARN;
- } else {
- lflags = GFP_KERNEL | __GFP_NOWARN;
- if (flags & KM_NOFS)
- lflags &= ~__GFP_FS;
- }
+ lflags = GFP_KERNEL | __GFP_NOWARN;
+ if (flags & KM_NOFS)
+ lflags &= ~__GFP_FS;
/*
* Default page/slab allocator behavior is to retry for ever
@@ -59,6 +53,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
}
extern void *kmem_alloc(size_t, xfs_km_flags_t);
+extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags);
extern void *kmem_alloc_large(size_t size, xfs_km_flags_t);
extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t);
static inline void kmem_free(const void *ptr)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 372ad55631fc..533b04aaf6f6 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2205,7 +2205,7 @@ xfs_defer_agfl_block(
ASSERT(xfs_bmap_free_item_zone != NULL);
ASSERT(oinfo != NULL);
- new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+ new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
new->xefi_blockcount = 1;
new->xefi_oinfo = *oinfo;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index d6ed5d2c07c2..58fa85cec325 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -81,10 +81,9 @@ typedef struct xfs_alloc_arg {
/*
* Defines for datatype
*/
-#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/
-#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */
-#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */
-#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */
+#define XFS_ALLOC_INITIAL_USER_DATA (1 << 0)/* special case start of file */
+#define XFS_ALLOC_USERDATA_ZERO (1 << 1)/* zero extent on allocation */
+#define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */
static inline bool
xfs_alloc_is_userdata(int datatype)
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index d48fcf11cc35..510ca6974604 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -97,7 +97,10 @@ xfs_inode_hasattr(
* Overall external interface routines.
*========================================================================*/
-/* Retrieve an extended attribute and its value. Must have ilock. */
+/*
+ * Retrieve an extended attribute and its value. Must have ilock.
+ * Returns 0 on successful retrieval, otherwise an error.
+ */
int
xfs_attr_get_ilocked(
struct xfs_inode *ip,
@@ -115,12 +118,28 @@ xfs_attr_get_ilocked(
return xfs_attr_node_get(args);
}
-/* Retrieve an extended attribute by name, and its value. */
+/*
+ * Retrieve an extended attribute by name, and its value if requested.
+ *
+ * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value,
+ * just an indication whether the attribute exists and the size of the value if
+ * it exists. The size is returned in @valuelenp,
+ *
+ * If the attribute is found, but exceeds the size limit set by the caller in
+ * @valuelenp, return -ERANGE with the size of the attribute that was found in
+ * @valuelenp.
+ *
+ * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after
+ * existence of the attribute has been determined. On success, return that
+ * buffer to the caller and leave them to free it. On failure, free any
+ * allocated buffer and ensure the buffer pointer returned to the caller is
+ * null.
+ */
int
xfs_attr_get(
struct xfs_inode *ip,
const unsigned char *name,
- unsigned char *value,
+ unsigned char **value,
int *valuelenp,
int flags)
{
@@ -128,6 +147,8 @@ xfs_attr_get(
uint lock_mode;
int error;
+ ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value);
+
XFS_STATS_INC(ip->i_mount, xs_attr_get);
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -137,17 +158,29 @@ xfs_attr_get(
if (error)
return error;
- args.value = value;
- args.valuelen = *valuelenp;
/* Entirely possible to look up a name which doesn't exist */
args.op_flags = XFS_DA_OP_OKNOENT;
+ if (flags & ATTR_ALLOC)
+ args.op_flags |= XFS_DA_OP_ALLOCVAL;
+ else
+ args.value = *value;
+ args.valuelen = *valuelenp;
lock_mode = xfs_ilock_attr_map_shared(ip);
error = xfs_attr_get_ilocked(ip, &args);
xfs_iunlock(ip, lock_mode);
-
*valuelenp = args.valuelen;
- return error == -EEXIST ? 0 : error;
+
+ /* on error, we have to clean up allocated value buffers */
+ if (error) {
+ if (flags & ATTR_ALLOC) {
+ kmem_free(args.value);
+ *value = NULL;
+ }
+ return error;
+ }
+ *value = args.value;
+ return 0;
}
/*
@@ -768,6 +801,8 @@ xfs_attr_leaf_removename(
*
* This leaf block cannot have a "remote" value, we only call this routine
* if bmap_one_block() says there is only one block (ie: no remote blks).
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
*/
STATIC int
xfs_attr_leaf_get(xfs_da_args_t *args)
@@ -789,9 +824,6 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
}
error = xfs_attr3_leaf_getvalue(bp, args);
xfs_trans_brelse(args->trans, bp);
- if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
- error = xfs_attr_rmtval_get(args);
- }
return error;
}
@@ -1268,11 +1300,13 @@ xfs_attr_refillstate(xfs_da_state_t *state)
}
/*
- * Look up a filename in a node attribute list.
+ * Retrieve the attribute data from a node attribute list.
*
* This routine gets called for any attribute fork that has more than one
* block, ie: both true Btree attr lists and for single-leaf-blocks with
* "remote" values taking up more blocks.
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
*/
STATIC int
xfs_attr_node_get(xfs_da_args_t *args)
@@ -1294,24 +1328,21 @@ xfs_attr_node_get(xfs_da_args_t *args)
error = xfs_da3_node_lookup_int(state, &retval);
if (error) {
retval = error;
- } else if (retval == -EEXIST) {
- blk = &state->path.blk[ state->path.active-1 ];
- ASSERT(blk->bp != NULL);
- ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-
- /*
- * Get the value, local or "remote"
- */
- retval = xfs_attr3_leaf_getvalue(blk->bp, args);
- if (!retval && (args->rmtblkno > 0)
- && !(args->flags & ATTR_KERNOVAL)) {
- retval = xfs_attr_rmtval_get(args);
- }
+ goto out_release;
}
+ if (retval != -EEXIST)
+ goto out_release;
+
+ /*
+ * Get the value, local or "remote"
+ */
+ blk = &state->path.blk[state->path.active - 1];
+ retval = xfs_attr3_leaf_getvalue(blk->bp, args);
/*
* If not in a transaction, we have to release all the buffers.
*/
+out_release:
for (i = 0; i < state->path.active; i++) {
xfs_trans_brelse(args->trans, state->path.blk[i].bp);
state->path.blk[i].bp = NULL;
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index ff28ebf3b635..94badfa1743e 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -37,6 +37,7 @@ struct xfs_attr_list_context;
#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */
#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */
+#define ATTR_ALLOC 0x8000 /* allocate xattr buffer on demand */
#define XFS_ATTR_FLAGS \
{ ATTR_DONTFOLLOW, "DONTFOLLOW" }, \
@@ -47,7 +48,8 @@ struct xfs_attr_list_context;
{ ATTR_REPLACE, "REPLACE" }, \
{ ATTR_KERNOTIME, "KERNOTIME" }, \
{ ATTR_KERNOVAL, "KERNOVAL" }, \
- { ATTR_INCOMPLETE, "INCOMPLETE" }
+ { ATTR_INCOMPLETE, "INCOMPLETE" }, \
+ { ATTR_ALLOC, "ALLOC" }
/*
* The maximum size (into the kernel or returned from the kernel) of an
@@ -143,7 +145,7 @@ int xfs_attr_list_int(struct xfs_attr_list_context *);
int xfs_inode_hasattr(struct xfs_inode *ip);
int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args);
int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
- unsigned char *value, int *valuelenp, int flags);
+ unsigned char **value, int *valuelenp, int flags);
int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
unsigned char *value, int valuelen, int flags);
int xfs_attr_set_args(struct xfs_da_args *args);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 70eb941d02e4..b9f019603d0b 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -393,6 +393,50 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
}
+static int
+xfs_attr_copy_value(
+ struct xfs_da_args *args,
+ unsigned char *value,
+ int valuelen)
+{
+ /*
+ * No copy if all we have to do is get the length
+ */
+ if (args->flags & ATTR_KERNOVAL) {
+ args->valuelen = valuelen;
+ return 0;
+ }
+
+ /*
+ * No copy if the length of the existing buffer is too small
+ */
+ if (args->valuelen < valuelen) {
+ args->valuelen = valuelen;
+ return -ERANGE;
+ }
+
+ if (args->op_flags & XFS_DA_OP_ALLOCVAL) {
+ args->value = kmem_alloc_large(valuelen, 0);
+ if (!args->value)
+ return -ENOMEM;
+ }
+ args->valuelen = valuelen;
+
+ /* remote block xattr requires IO for copy-in */
+ if (args->rmtblkno)
+ return xfs_attr_rmtval_get(args);
+
+ /*
+ * This is to prevent a GCC warning because the remote xattr case
+ * doesn't have a value to pass in. In that case, we never reach here,
+ * but GCC can't work that out and so throws a "passing NULL to
+ * memcpy" warning.
+ */
+ if (!value)
+ return -EINVAL;
+ memcpy(args->value, value, valuelen);
+ return 0;
+}
/*========================================================================
* External routines when attribute fork size < XFS_LITINO(mp).
@@ -720,15 +764,19 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
}
/*
- * Look up a name in a shortform attribute list structure.
+ * Retreive the attribute value and length.
+ *
+ * If ATTR_KERNOVAL is specified, only the length needs to be returned.
+ * Unlike a lookup, we only return an error if the attribute does not
+ * exist or we can't retrieve the value.
*/
-/*ARGSUSED*/
int
-xfs_attr_shortform_getvalue(xfs_da_args_t *args)
+xfs_attr_shortform_getvalue(
+ struct xfs_da_args *args)
{
- xfs_attr_shortform_t *sf;
- xfs_attr_sf_entry_t *sfe;
- int i;
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ int i;
ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
@@ -741,18 +789,8 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
continue;
if (!xfs_attr_namesp_match(args->flags, sfe->flags))
continue;
- if (args->flags & ATTR_KERNOVAL) {
- args->valuelen = sfe->valuelen;
- return -EEXIST;
- }
- if (args->valuelen < sfe->valuelen) {
- args->valuelen = sfe->valuelen;
- return -ERANGE;
- }
- args->valuelen = sfe->valuelen;
- memcpy(args->value, &sfe->nameval[args->namelen],
- args->valuelen);
- return -EEXIST;
+ return xfs_attr_copy_value(args, &sfe->nameval[args->namelen],
+ sfe->valuelen);
}
return -ENOATTR;
}
@@ -782,7 +820,7 @@ xfs_attr_shortform_to_leaf(
ifp = dp->i_afp;
sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
size = be16_to_cpu(sf->hdr.totsize);
- tmpbuffer = kmem_alloc(size, KM_SLEEP);
+ tmpbuffer = kmem_alloc(size, 0);
ASSERT(tmpbuffer != NULL);
memcpy(tmpbuffer, ifp->if_u1.if_data, size);
sf = (xfs_attr_shortform_t *)tmpbuffer;
@@ -985,7 +1023,7 @@ xfs_attr3_leaf_to_shortform(
trace_xfs_attr_leaf_to_sf(args);
- tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+ tmpbuffer = kmem_alloc(args->geo->blksize, 0);
if (!tmpbuffer)
return -ENOMEM;
@@ -1448,7 +1486,7 @@ xfs_attr3_leaf_compact(
trace_xfs_attr_leaf_compact(args);
- tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+ tmpbuffer = kmem_alloc(args->geo->blksize, 0);
memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
memset(bp->b_addr, 0, args->geo->blksize);
leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
@@ -2167,7 +2205,7 @@ xfs_attr3_leaf_unbalance(
struct xfs_attr_leafblock *tmp_leaf;
struct xfs_attr3_icleaf_hdr tmphdr;
- tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP);
+ tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0);
/*
* Copy the header into the temp leaf so that all the stuff
@@ -2350,6 +2388,10 @@ xfs_attr3_leaf_lookup_int(
/*
* Get the value associated with an attribute name from a leaf attribute
* list structure.
+ *
+ * If ATTR_KERNOVAL is specified, only the length needs to be returned.
+ * Unlike a lookup, we only return an error if the attribute does not
+ * exist or we can't retrieve the value.
*/
int
xfs_attr3_leaf_getvalue(
@@ -2361,7 +2403,6 @@ xfs_attr3_leaf_getvalue(
struct xfs_attr_leaf_entry *entry;
struct xfs_attr_leaf_name_local *name_loc;
struct xfs_attr_leaf_name_remote *name_rmt;
- int valuelen;
leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
@@ -2373,36 +2414,19 @@ xfs_attr3_leaf_getvalue(
name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
ASSERT(name_loc->namelen == args->namelen);
ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
- valuelen = be16_to_cpu(name_loc->valuelen);
- if (args->flags & ATTR_KERNOVAL) {
- args->valuelen = valuelen;
- return 0;
- }
- if (args->valuelen < valuelen) {
- args->valuelen = valuelen;
- return -ERANGE;
- }
- args->valuelen = valuelen;
- memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
- } else {
- name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
- ASSERT(name_rmt->namelen == args->namelen);
- ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
- args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
- args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
- args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
- args->rmtvaluelen);
- if (args->flags & ATTR_KERNOVAL) {
- args->valuelen = args->rmtvaluelen;
- return 0;
- }
- if (args->valuelen < args->rmtvaluelen) {
- args->valuelen = args->rmtvaluelen;
- return -ERANGE;
- }
- args->valuelen = args->rmtvaluelen;
- }
- return 0;
+ return xfs_attr_copy_value(args,
+ &name_loc->nameval[args->namelen],
+ be16_to_cpu(name_loc->valuelen));
+ }
+
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+ ASSERT(name_rmt->namelen == args->namelen);
+ ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
+ args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+ args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
+ args->rmtvaluelen);
+ return xfs_attr_copy_value(args, NULL, args->rmtvaluelen);
}
/*========================================================================
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 4eb30d357045..3e39b7d40f25 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -358,6 +358,8 @@ xfs_attr_rmtval_copyin(
/*
* Read the value associated with an attribute from the out-of-line buffer
* that we stored it in.
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
*/
int
xfs_attr_rmtval_get(
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index baf0b72c0a37..054b4ce30033 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -553,7 +553,7 @@ __xfs_bmap_add_free(
#endif
ASSERT(xfs_bmap_free_item_zone != NULL);
- new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+ new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
new->xefi_startblock = bno;
new->xefi_blockcount = (xfs_extlen_t)len;
if (oinfo)
@@ -1099,7 +1099,7 @@ xfs_bmap_add_attrfork(
if (error)
goto trans_cancel;
ASSERT(ip->i_afp == NULL);
- ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+ ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0);
ip->i_afp->if_flags = XFS_IFEXTENTS;
logflags = 0;
switch (ip->i_d.di_format) {
@@ -1985,11 +1985,8 @@ xfs_bmap_add_extent_delay_real(
}
/* add reverse mapping unless caller opted out */
- if (!(bma->flags & XFS_BMAPI_NORMAP)) {
- error = xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new);
- if (error)
- goto done;
- }
+ if (!(bma->flags & XFS_BMAPI_NORMAP))
+ xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new);
/* convert to a btree if necessary */
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
@@ -2471,9 +2468,7 @@ xfs_bmap_add_extent_unwritten_real(
}
/* update reverse mappings */
- error = xfs_rmap_convert_extent(mp, tp, ip, whichfork, new);
- if (error)
- goto done;
+ xfs_rmap_convert_extent(mp, tp, ip, whichfork, new);
/* convert to a btree if necessary */
if (xfs_bmap_needs_btree(ip, whichfork)) {
@@ -2832,11 +2827,8 @@ xfs_bmap_add_extent_hole_real(
}
/* add reverse mapping unless caller opted out */
- if (!(flags & XFS_BMAPI_NORMAP)) {
- error = xfs_rmap_map_extent(tp, ip, whichfork, new);
- if (error)
- goto done;
- }
+ if (!(flags & XFS_BMAPI_NORMAP))
+ xfs_rmap_map_extent(tp, ip, whichfork, new);
/* convert to a btree if necessary */
if (xfs_bmap_needs_btree(ip, whichfork)) {
@@ -3835,15 +3827,28 @@ xfs_bmapi_read(
XFS_STATS_INC(mp, xs_blk_mapr);
ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (!ifp) {
+ /* No CoW fork? Return a hole. */
+ if (whichfork == XFS_COW_FORK) {
+ mval->br_startoff = bno;
+ mval->br_startblock = HOLESTARTBLOCK;
+ mval->br_blockcount = len;
+ mval->br_state = XFS_EXT_NORM;
+ *nmap = 1;
+ return 0;
+ }
- /* No CoW fork? Return a hole. */
- if (whichfork == XFS_COW_FORK && !ifp) {
- mval->br_startoff = bno;
- mval->br_startblock = HOLESTARTBLOCK;
- mval->br_blockcount = len;
- mval->br_state = XFS_EXT_NORM;
- *nmap = 1;
- return 0;
+ /*
+ * A missing attr ifork implies that the inode says we're in
+ * extents or btree format but failed to pass the inode fork
+ * verifier while trying to load it. Treat that as a file
+ * corruption too.
+ */
+#ifdef DEBUG
+ xfs_alert(mp, "%s: inode %llu missing fork %d",
+ __func__, ip->i_ino, whichfork);
+#endif /* DEBUG */
+ return -EFSCORRUPTED;
}
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
@@ -4037,12 +4042,8 @@ xfs_bmapi_allocate(
*/
if (!(bma->flags & XFS_BMAPI_METADATA)) {
bma->datatype = XFS_ALLOC_NOBUSY;
- if (whichfork == XFS_DATA_FORK) {
- if (bma->offset == 0)
- bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
- else
- bma->datatype |= XFS_ALLOC_USERDATA;
- }
+ if (whichfork == XFS_DATA_FORK && bma->offset == 0)
+ bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
if (bma->flags & XFS_BMAPI_ZERO)
bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
}
@@ -4388,12 +4389,9 @@ xfs_bmapi_write(
* If this is a CoW allocation, record the data in
* the refcount btree for orphan recovery.
*/
- if (whichfork == XFS_COW_FORK) {
- error = xfs_refcount_alloc_cow_extent(tp,
- bma.blkno, bma.length);
- if (error)
- goto error0;
- }
+ if (whichfork == XFS_COW_FORK)
+ xfs_refcount_alloc_cow_extent(tp, bma.blkno,
+ bma.length);
}
/* Deal with the allocated space we found. */
@@ -4517,7 +4515,7 @@ xfs_bmapi_convert_delalloc(
if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
goto out_finish;
error = -EFSCORRUPTED;
- if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+ if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock)))
goto out_finish;
XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
@@ -4527,12 +4525,8 @@ xfs_bmapi_convert_delalloc(
*imap = bma.got;
*seq = READ_ONCE(ifp->if_seq);
- if (whichfork == XFS_COW_FORK) {
- error = xfs_refcount_alloc_cow_extent(tp, bma.blkno,
- bma.length);
- if (error)
- goto out_finish;
- }
+ if (whichfork == XFS_COW_FORK)
+ xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
whichfork);
@@ -5136,18 +5130,14 @@ xfs_bmap_del_extent_real(
}
/* remove reverse mapping */
- error = xfs_rmap_unmap_extent(tp, ip, whichfork, del);
- if (error)
- goto done;
+ xfs_rmap_unmap_extent(tp, ip, whichfork, del);
/*
* If we need to, add to list of extents to delete.
*/
if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
- error = xfs_refcount_decrease_extent(tp, del);
- if (error)
- goto done;
+ xfs_refcount_decrease_extent(tp, del);
} else {
__xfs_bmap_add_free(tp, del->br_startblock,
del->br_blockcount, NULL,
@@ -5638,12 +5628,11 @@ done:
&new);
/* update reverse mapping. rmap functions merge the rmaps for us */
- error = xfs_rmap_unmap_extent(tp, ip, whichfork, got);
- if (error)
- return error;
+ xfs_rmap_unmap_extent(tp, ip, whichfork, got);
memcpy(&new, got, sizeof(new));
new.br_startoff = left->br_startoff + left->br_blockcount;
- return xfs_rmap_map_extent(tp, ip, whichfork, &new);
+ xfs_rmap_map_extent(tp, ip, whichfork, &new);
+ return 0;
}
static int
@@ -5682,10 +5671,9 @@ xfs_bmap_shift_update_extent(
got);
/* update reverse mapping */
- error = xfs_rmap_unmap_extent(tp, ip, whichfork, &prev);
- if (error)
- return error;
- return xfs_rmap_map_extent(tp, ip, whichfork, got);
+ xfs_rmap_unmap_extent(tp, ip, whichfork, &prev);
+ xfs_rmap_map_extent(tp, ip, whichfork, got);
+ return 0;
}
int
@@ -6081,7 +6069,7 @@ __xfs_bmap_add(
bmap->br_blockcount,
bmap->br_state);
- bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS);
+ bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_NOFS);
INIT_LIST_HEAD(&bi->bi_list);
bi->bi_type = type;
bi->bi_owner = ip;
@@ -6093,29 +6081,29 @@ __xfs_bmap_add(
}
/* Map an extent into a file. */
-int
+void
xfs_bmap_map_extent(
struct xfs_trans *tp,
struct xfs_inode *ip,
struct xfs_bmbt_irec *PREV)
{
if (!xfs_bmap_is_update_needed(PREV))
- return 0;
+ return;
- return __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
+ __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
}
/* Unmap an extent out of a file. */
-int
+void
xfs_bmap_unmap_extent(
struct xfs_trans *tp,
struct xfs_inode *ip,
struct xfs_bmbt_irec *PREV)
{
if (!xfs_bmap_is_update_needed(PREV))
- return 0;
+ return;
- return __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
+ __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
}
/*
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 8f597f9abdbe..5bb446d80542 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -171,6 +171,13 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
!isnullstartblock(irec->br_startblock);
}
+/*
+ * Check the mapping for obviously garbage allocations that could trash the
+ * filesystem immediately.
+ */
+#define xfs_valid_startblock(ip, startblock) \
+ ((startblock) != 0 || XFS_IS_REALTIME_INODE(ip))
+
void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
@@ -254,9 +261,9 @@ int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip,
enum xfs_bmap_intent_type type, int whichfork,
xfs_fileoff_t startoff, xfs_fsblock_t startblock,
xfs_filblks_t *blockcount, xfs_exntst_t state);
-int xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
struct xfs_bmbt_irec *imap);
-int xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
struct xfs_bmbt_irec *imap);
static inline int xfs_bmap_fork_to_state(int whichfork)
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index fbb18ba5d905..ffe608d2a2d9 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -400,8 +400,20 @@ xfs_bmbt_diff_two_keys(
union xfs_btree_key *k1,
union xfs_btree_key *k2)
{
- return (int64_t)be64_to_cpu(k1->bmbt.br_startoff) -
- be64_to_cpu(k2->bmbt.br_startoff);
+ uint64_t a = be64_to_cpu(k1->bmbt.br_startoff);
+ uint64_t b = be64_to_cpu(k2->bmbt.br_startoff);
+
+ /*
+ * Note: This routine previously casted a and b to int64 and subtracted
+ * them to generate a result. This lead to problems if b was the
+ * "maximum" key value (all ones) being signed incorrectly, hence this
+ * somewhat less efficient version.
+ */
+ if (a > b)
+ return 1;
+ if (b > a)
+ return -1;
+ return 0;
}
static xfs_failaddr_t
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index f1048efa4268..71de937f9e64 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -4466,8 +4466,6 @@ xfs_btree_lblock_verify(
* btree block
*
* @bp: buffer containing the btree block
- * @max_recs: pointer to the m_*_mxr max records field in the xfs mount
- * @pag_max_level: pointer to the per-ag max level field
*/
xfs_failaddr_t
xfs_btree_sblock_v5hdr_verify(
@@ -4600,7 +4598,7 @@ xfs_btree_simple_query_range(
/* Callback */
error = fn(cur, recp, priv);
- if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT)
+ if (error)
break;
advloop:
@@ -4702,8 +4700,7 @@ pop_up:
*/
if (ldiff >= 0 && hdiff >= 0) {
error = fn(cur, recp, priv);
- if (error < 0 ||
- error == XFS_BTREE_QUERY_RANGE_ABORT)
+ if (error)
break;
} else if (hdiff < 0) {
/* Record is larger than high key; pop. */
@@ -4774,8 +4771,7 @@ out:
* Query a btree for all records overlapping a given interval of keys. The
* supplied function will be called with each record found; return one of the
* XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
- * code. This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a
- * negative error code.
+ * code. This function returns -ECANCELED, zero, or a negative error code.
*/
int
xfs_btree_query_range(
@@ -4891,7 +4887,7 @@ xfs_btree_has_record_helper(
union xfs_btree_rec *rec,
void *priv)
{
- return XFS_BTREE_QUERY_RANGE_ABORT;
+ return -ECANCELED;
}
/* Is there a record covering a given range of keys? */
@@ -4906,7 +4902,7 @@ xfs_btree_has_record(
error = xfs_btree_query_range(cur, low, high,
&xfs_btree_has_record_helper, NULL);
- if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+ if (error == -ECANCELED) {
*exists = true;
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index fa3cd8ab9aba..ced1e65d1483 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -464,9 +464,13 @@ xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp,
uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len);
unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len);
-/* return codes */
-#define XFS_BTREE_QUERY_RANGE_CONTINUE (XFS_ITER_CONTINUE) /* keep iterating */
-#define XFS_BTREE_QUERY_RANGE_ABORT (XFS_ITER_ABORT) /* stop iterating */
+/*
+ * Return codes for the query range iterator function are 0 to continue
+ * iterating, and non-zero to stop iterating. Any non-zero value will be
+ * passed up to the _query_range caller. The special value -ECANCELED can be
+ * used to stop iteration, because _query_range never generates that error
+ * code on its own.
+ */
typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
union xfs_btree_rec *rec, void *priv);
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index d1c77fd0815d..4fd1223c1bd5 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -487,10 +487,8 @@ xfs_da3_split(
ASSERT(state->path.active == 0);
oldblk = &state->path.blk[0];
error = xfs_da3_root_split(state, oldblk, addblk);
- if (error) {
- addblk->bp = NULL;
- return error; /* GROT: dir is inconsistent */
- }
+ if (error)
+ goto out;
/*
* Update pointers to the node which used to be block 0 and just got
@@ -505,7 +503,10 @@ xfs_da3_split(
*/
node = oldblk->bp->b_addr;
if (node->hdr.info.forw) {
- ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno);
+ if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
node = addblk->bp->b_addr;
node->hdr.info.back = cpu_to_be32(oldblk->blkno);
xfs_trans_log_buf(state->args->trans, addblk->bp,
@@ -514,15 +515,19 @@ xfs_da3_split(
}
node = oldblk->bp->b_addr;
if (node->hdr.info.back) {
- ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno);
+ if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
node = addblk->bp->b_addr;
node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
xfs_trans_log_buf(state->args->trans, addblk->bp,
XFS_DA_LOGRANGE(node, &node->hdr.info,
sizeof(node->hdr.info)));
}
+out:
addblk->bp = NULL;
- return 0;
+ return error;
}
/*
@@ -2093,7 +2098,7 @@ xfs_da_grow_inode_int(
* If we didn't get it and the block might work if fragmented,
* try without the CONTIG flag. Loop until we get it all.
*/
- mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
+ mapp = kmem_alloc(sizeof(*mapp) * count, 0);
for (b = *bno, mapi = 0; b < *bno + count; ) {
nmap = min(XFS_BMAP_MAX_NMAP, count);
c = (int)(*bno + count - b);
@@ -2475,7 +2480,7 @@ xfs_buf_map_from_irec(
if (nirecs > 1) {
map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
- KM_SLEEP | KM_NOFS);
+ KM_NOFS);
if (!map)
return -ENOMEM;
*mapp = map;
@@ -2534,7 +2539,7 @@ xfs_dabuf_map(
*/
if (nfsb != 1)
irecs = kmem_zalloc(sizeof(irec) * nfsb,
- KM_SLEEP | KM_NOFS);
+ KM_NOFS);
nirecs = nfsb;
error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 84dd865b6c3d..ae0bbd20d9ca 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -81,13 +81,15 @@ typedef struct xfs_da_args {
#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */
#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */
#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */
+#define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */
#define XFS_DA_OP_FLAGS \
{ XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
{ XFS_DA_OP_RENAME, "RENAME" }, \
{ XFS_DA_OP_ADDNAME, "ADDNAME" }, \
{ XFS_DA_OP_OKNOENT, "OKNOENT" }, \
- { XFS_DA_OP_CILOOKUP, "CILOOKUP" }
+ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
+ { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }
/*
* Storage for holding state during Btree searches and split/join ops.
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index eb2be2a6a25a..22557527cfdb 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -517,7 +517,7 @@ xfs_defer_add(
}
if (!dfp) {
dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
- KM_SLEEP | KM_NOFS);
+ KM_NOFS);
dfp->dfp_type = type;
dfp->dfp_intent = NULL;
dfp->dfp_done = NULL;
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 67840723edbb..867c5dee0751 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -110,9 +110,9 @@ xfs_da_mount(
nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
- KM_SLEEP | KM_MAYFAIL);
+ KM_MAYFAIL);
mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
- KM_SLEEP | KM_MAYFAIL);
+ KM_MAYFAIL);
if (!mp->m_dir_geo || !mp->m_attr_geo) {
kmem_free(mp->m_dir_geo);
kmem_free(mp->m_attr_geo);
@@ -217,7 +217,7 @@ xfs_dir_init(
if (error)
return error;
- args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
if (!args)
return -ENOMEM;
@@ -254,7 +254,7 @@ xfs_dir_createname(
XFS_STATS_INC(dp->i_mount, xs_dir_create);
}
- args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
if (!args)
return -ENOMEM;
@@ -353,7 +353,7 @@ xfs_dir_lookup(
* lockdep Doing this avoids having to add a bunch of lockdep class
* annotations into the reclaim path for the ilock.
*/
- args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
args->geo = dp->i_mount->m_dir_geo;
args->name = name->name;
args->namelen = name->len;
@@ -422,7 +422,7 @@ xfs_dir_removename(
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_remove);
- args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
if (!args)
return -ENOMEM;
@@ -483,7 +483,7 @@ xfs_dir_replace(
if (rval)
return rval;
- args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
if (!args)
return -ENOMEM;
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index a6fb0cc2085e..9595ced393dc 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -1092,7 +1092,7 @@ xfs_dir2_sf_to_block(
* Copy the directory into a temporary buffer.
* Then pitch the incore inode data so we can make extents.
*/
- sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
+ sfp = kmem_alloc(ifp->if_bytes, 0);
memcpy(sfp, oldsfp, ifp->if_bytes);
xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index afcc6642690a..705c4f562758 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -32,8 +32,6 @@ static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
int index, xfs_da_state_blk_t *dblk,
int *rval);
-static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
- xfs_da_state_blk_t *fblk);
/*
* Check internal consistency of a leafn block.
@@ -741,7 +739,8 @@ xfs_dir2_leafn_lookup_for_entry(
ents = dp->d_ops->leaf_ents_p(leaf);
xfs_dir3_leaf_check(dp, bp);
- ASSERT(leafhdr.count > 0);
+ if (leafhdr.count <= 0)
+ return -EFSCORRUPTED;
/*
* Look up the hash value in the leaf entries.
@@ -1610,113 +1609,152 @@ xfs_dir2_leafn_unbalance(
}
/*
- * Top-level node form directory addname routine.
+ * Add a new data block to the directory at the free space index that the caller
+ * has specified.
*/
-int /* error */
-xfs_dir2_node_addname(
- xfs_da_args_t *args) /* operation arguments */
+static int
+xfs_dir2_node_add_datablk(
+ struct xfs_da_args *args,
+ struct xfs_da_state_blk *fblk,
+ xfs_dir2_db_t *dbno,
+ struct xfs_buf **dbpp,
+ struct xfs_buf **fbpp,
+ int *findex)
{
- xfs_da_state_blk_t *blk; /* leaf block for insert */
- int error; /* error return value */
- int rval; /* sub-return value */
- xfs_da_state_t *state; /* btree cursor */
+ struct xfs_inode *dp = args->dp;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir3_icfree_hdr freehdr;
+ struct xfs_dir2_data_free *bf;
+ struct xfs_dir2_data_hdr *hdr;
+ struct xfs_dir2_free *free = NULL;
+ xfs_dir2_db_t fbno;
+ struct xfs_buf *fbp;
+ struct xfs_buf *dbp;
+ __be16 *bests = NULL;
+ int error;
- trace_xfs_dir2_node_addname(args);
+ /* Not allowed to allocate, return failure. */
+ if (args->total == 0)
+ return -ENOSPC;
+
+ /* Allocate and initialize the new data block. */
+ error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, dbno);
+ if (error)
+ return error;
+ error = xfs_dir3_data_init(args, *dbno, &dbp);
+ if (error)
+ return error;
/*
- * Allocate and initialize the state (btree cursor).
- */
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = args->dp->i_mount;
- /*
- * Look up the name. We're not supposed to find it, but
- * this gives us the insertion point.
+ * Get the freespace block corresponding to the data block
+ * that was just allocated.
*/
- error = xfs_da3_node_lookup_int(state, &rval);
+ fbno = dp->d_ops->db_to_fdb(args->geo, *dbno);
+ error = xfs_dir2_free_try_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, fbno), &fbp);
if (error)
- rval = error;
- if (rval != -ENOENT) {
- goto done;
- }
+ return error;
+
/*
- * Add the data entry to a data block.
- * Extravalid is set to a freeblock found by lookup.
+ * If there wasn't a freespace block, the read will
+ * return a NULL fbp. Allocate and initialize a new one.
*/
- rval = xfs_dir2_node_addname_int(args,
- state->extravalid ? &state->extrablk : NULL);
- if (rval) {
- goto done;
+ if (!fbp) {
+ error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fbno);
+ if (error)
+ return error;
+
+ if (dp->d_ops->db_to_fdb(args->geo, *dbno) != fbno) {
+ xfs_alert(mp,
+"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld",
+ __func__, (unsigned long long)dp->i_ino,
+ (long long)dp->d_ops->db_to_fdb(args->geo, *dbno),
+ (long long)*dbno, (long long)fbno);
+ if (fblk) {
+ xfs_alert(mp,
+ " fblk "PTR_FMT" blkno %llu index %d magic 0x%x",
+ fblk, (unsigned long long)fblk->blkno,
+ fblk->index, fblk->magic);
+ } else {
+ xfs_alert(mp, " ... fblk is NULL");
+ }
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
+
+ /* Get a buffer for the new block. */
+ error = xfs_dir3_free_get_buf(args, fbno, &fbp);
+ if (error)
+ return error;
+ free = fbp->b_addr;
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+ /* Remember the first slot as our empty slot. */
+ freehdr.firstdb = (fbno - xfs_dir2_byte_to_db(args->geo,
+ XFS_DIR2_FREE_OFFSET)) *
+ dp->d_ops->free_max_bests(args->geo);
+ } else {
+ free = fbp->b_addr;
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
}
- blk = &state->path.blk[state->path.active - 1];
- ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+
+ /* Set the freespace block index from the data block number. */
+ *findex = dp->d_ops->db_to_fdindex(args->geo, *dbno);
+
+ /* Extend the freespace table if the new data block is off the end. */
+ if (*findex >= freehdr.nvalid) {
+ ASSERT(*findex < dp->d_ops->free_max_bests(args->geo));
+ freehdr.nvalid = *findex + 1;
+ bests[*findex] = cpu_to_be16(NULLDATAOFF);
+ }
+
/*
- * Add the new leaf entry.
+ * If this entry was for an empty data block (this should always be
+ * true) then update the header.
*/
- rval = xfs_dir2_leafn_add(blk->bp, args, blk->index);
- if (rval == 0) {
- /*
- * It worked, fix the hash values up the btree.
- */
- if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
- xfs_da3_fixhashpath(state, &state->path);
- } else {
- /*
- * It didn't work, we need to split the leaf block.
- */
- if (args->total == 0) {
- ASSERT(rval == -ENOSPC);
- goto done;
- }
- /*
- * Split the leaf block and insert the new entry.
- */
- rval = xfs_da3_split(state);
+ if (bests[*findex] == cpu_to_be16(NULLDATAOFF)) {
+ freehdr.nused++;
+ dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+ xfs_dir2_free_log_header(args, fbp);
}
-done:
- xfs_da_state_free(state);
- return rval;
+
+ /* Update the freespace value for the new block in the table. */
+ hdr = dbp->b_addr;
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ bests[*findex] = bf[0].length;
+
+ *dbpp = dbp;
+ *fbpp = fbp;
+ return 0;
}
-/*
- * Add the data entry for a node-format directory name addition.
- * The leaf entry is added in xfs_dir2_leafn_add.
- * We may enter with a freespace block that the lookup found.
- */
-static int /* error */
-xfs_dir2_node_addname_int(
- xfs_da_args_t *args, /* operation arguments */
- xfs_da_state_blk_t *fblk) /* optional freespace block */
+static int
+xfs_dir2_node_find_freeblk(
+ struct xfs_da_args *args,
+ struct xfs_da_state_blk *fblk,
+ xfs_dir2_db_t *dbnop,
+ struct xfs_buf **fbpp,
+ int *findexp,
+ int length)
{
- xfs_dir2_data_hdr_t *hdr; /* data block header */
- xfs_dir2_db_t dbno; /* data block number */
- struct xfs_buf *dbp; /* data block buffer */
- xfs_dir2_data_entry_t *dep; /* data entry pointer */
- xfs_inode_t *dp; /* incore directory inode */
- xfs_dir2_data_unused_t *dup; /* data unused entry pointer */
- int error; /* error return value */
- xfs_dir2_db_t fbno; /* freespace block number */
- struct xfs_buf *fbp; /* freespace buffer */
- int findex; /* freespace entry index */
- xfs_dir2_free_t *free=NULL; /* freespace block structure */
- xfs_dir2_db_t ifbno; /* initial freespace block no */
- xfs_dir2_db_t lastfbno=0; /* highest freespace block no */
- int length; /* length of the new entry */
- int logfree; /* need to log free entry */
- xfs_mount_t *mp; /* filesystem mount point */
- int needlog; /* need to log data header */
- int needscan; /* need to rescan data frees */
- __be16 *tagp; /* data entry tag pointer */
- xfs_trans_t *tp; /* transaction pointer */
- __be16 *bests;
struct xfs_dir3_icfree_hdr freehdr;
- struct xfs_dir2_data_free *bf;
- xfs_dir2_data_aoff_t aoff;
+ struct xfs_dir2_free *free = NULL;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_buf *fbp = NULL;
+ xfs_dir2_db_t firstfbno;
+ xfs_dir2_db_t lastfbno;
+ xfs_dir2_db_t ifbno = -1;
+ xfs_dir2_db_t dbno = -1;
+ xfs_dir2_db_t fbno;
+ xfs_fileoff_t fo;
+ __be16 *bests = NULL;
+ int findex = 0;
+ int error;
- dp = args->dp;
- mp = dp->i_mount;
- tp = args->trans;
- length = dp->d_ops->data_entsize(args->namelen);
/*
* If we came in with a freespace block that means that lookup
* found an entry with our hash value. This is the freespace
@@ -1724,288 +1762,157 @@ xfs_dir2_node_addname_int(
*/
if (fblk) {
fbp = fblk->bp;
- /*
- * Remember initial freespace block number.
- */
- ifbno = fblk->blkno;
free = fbp->b_addr;
findex = fblk->index;
- bests = dp->d_ops->free_bests_p(free);
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
- /*
- * This means the free entry showed that the data block had
- * space for our entry, so we remembered it.
- * Use that data block.
- */
if (findex >= 0) {
+ /* caller already found the freespace for us. */
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
ASSERT(findex < freehdr.nvalid);
ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
ASSERT(be16_to_cpu(bests[findex]) >= length);
dbno = freehdr.firstdb + findex;
- } else {
- /*
- * The data block looked at didn't have enough room.
- * We'll start at the beginning of the freespace entries.
- */
- dbno = -1;
- findex = 0;
+ goto found_block;
}
- } else {
+
/*
- * Didn't come in with a freespace block, so no data block.
+ * The data block looked at didn't have enough room.
+ * We'll start at the beginning of the freespace entries.
*/
- ifbno = dbno = -1;
+ ifbno = fblk->blkno;
+ xfs_trans_brelse(tp, fbp);
fbp = NULL;
- findex = 0;
+ fblk->bp = NULL;
}
/*
- * If we don't have a data block yet, we're going to scan the
- * freespace blocks looking for one. Figure out what the
- * highest freespace block number is.
- */
- if (dbno == -1) {
- xfs_fileoff_t fo; /* freespace block number */
-
- if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK)))
- return error;
- lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
- fbno = ifbno;
- }
- /*
- * While we haven't identified a data block, search the freeblock
- * data for a good data block. If we find a null freeblock entry,
- * indicating a hole in the data blocks, remember that.
+ * If we don't have a data block yet, we're going to scan the freespace
+ * data for a data block with enough free space in it.
*/
- while (dbno == -1) {
- /*
- * If we don't have a freeblock in hand, get the next one.
- */
- if (fbp == NULL) {
- /*
- * Happens the first time through unless lookup gave
- * us a freespace block to start with.
- */
- if (++fbno == 0)
- fbno = xfs_dir2_byte_to_db(args->geo,
- XFS_DIR2_FREE_OFFSET);
- /*
- * If it's ifbno we already looked at it.
- */
- if (fbno == ifbno)
- fbno++;
- /*
- * If it's off the end we're done.
- */
- if (fbno >= lastfbno)
- break;
- /*
- * Read the block. There can be holes in the
- * freespace blocks, so this might not succeed.
- * This should be really rare, so there's no reason
- * to avoid it.
- */
- error = xfs_dir2_free_try_read(tp, dp,
- xfs_dir2_db_to_da(args->geo, fbno),
- &fbp);
- if (error)
- return error;
- if (!fbp)
- continue;
- free = fbp->b_addr;
- findex = 0;
- }
- /*
- * Look at the current free entry. Is it good enough?
- *
- * The bests initialisation should be where the bufer is read in
- * the above branch. But gcc is too stupid to realise that bests
- * and the freehdr are actually initialised if they are placed
- * there, so we have to do it here to avoid warnings. Blech.
- */
- bests = dp->d_ops->free_bests_p(free);
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
- if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
- be16_to_cpu(bests[findex]) >= length)
- dbno = freehdr.firstdb + findex;
- else {
- /*
- * Are we done with the freeblock?
- */
- if (++findex == freehdr.nvalid) {
- /*
- * Drop the block.
- */
- xfs_trans_brelse(tp, fbp);
- fbp = NULL;
- if (fblk && fblk->bp)
- fblk->bp = NULL;
- }
- }
- }
- /*
- * If we don't have a data block, we need to allocate one and make
- * the freespace entries refer to it.
- */
- if (unlikely(dbno == -1)) {
- /*
- * Not allowed to allocate, return failure.
- */
- if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
- return -ENOSPC;
-
- /*
- * Allocate and initialize the new data block.
- */
- if (unlikely((error = xfs_dir2_grow_inode(args,
- XFS_DIR2_DATA_SPACE,
- &dbno)) ||
- (error = xfs_dir3_data_init(args, dbno, &dbp))))
- return error;
+ error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK);
+ if (error)
+ return error;
+ lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
+ firstfbno = xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET);
- /*
- * If (somehow) we have a freespace block, get rid of it.
- */
- if (fbp)
- xfs_trans_brelse(tp, fbp);
- if (fblk && fblk->bp)
- fblk->bp = NULL;
+ for (fbno = lastfbno - 1; fbno >= firstfbno; fbno--) {
+ /* If it's ifbno we already looked at it. */
+ if (fbno == ifbno)
+ continue;
/*
- * Get the freespace block corresponding to the data block
- * that was just allocated.
+ * Read the block. There can be holes in the freespace blocks,
+ * so this might not succeed. This should be really rare, so
+ * there's no reason to avoid it.
*/
- fbno = dp->d_ops->db_to_fdb(args->geo, dbno);
error = xfs_dir2_free_try_read(tp, dp,
- xfs_dir2_db_to_da(args->geo, fbno),
- &fbp);
+ xfs_dir2_db_to_da(args->geo, fbno),
+ &fbp);
if (error)
return error;
+ if (!fbp)
+ continue;
- /*
- * If there wasn't a freespace block, the read will
- * return a NULL fbp. Allocate and initialize a new one.
- */
- if (!fbp) {
- error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
- &fbno);
- if (error)
- return error;
+ free = fbp->b_addr;
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
- if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) {
- xfs_alert(mp,
-"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d",
- __func__, (unsigned long long)dp->i_ino,
- (long long)dp->d_ops->db_to_fdb(
- args->geo, dbno),
- (long long)dbno, (long long)fbno,
- (unsigned long long)ifbno, lastfbno);
- if (fblk) {
- xfs_alert(mp,
- " fblk "PTR_FMT" blkno %llu index %d magic 0x%x",
- fblk,
- (unsigned long long)fblk->blkno,
- fblk->index,
- fblk->magic);
- } else {
- xfs_alert(mp, " ... fblk is NULL");
- }
- XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
- XFS_ERRLEVEL_LOW, mp);
- return -EFSCORRUPTED;
+ /* Scan the free entry array for a large enough free space. */
+ for (findex = freehdr.nvalid - 1; findex >= 0; findex--) {
+ if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
+ be16_to_cpu(bests[findex]) >= length) {
+ dbno = freehdr.firstdb + findex;
+ goto found_block;
}
-
- /*
- * Get a buffer for the new block.
- */
- error = xfs_dir3_free_get_buf(args, fbno, &fbp);
- if (error)
- return error;
- free = fbp->b_addr;
- bests = dp->d_ops->free_bests_p(free);
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
- /*
- * Remember the first slot as our empty slot.
- */
- freehdr.firstdb =
- (fbno - xfs_dir2_byte_to_db(args->geo,
- XFS_DIR2_FREE_OFFSET)) *
- dp->d_ops->free_max_bests(args->geo);
- } else {
- free = fbp->b_addr;
- bests = dp->d_ops->free_bests_p(free);
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
}
- /*
- * Set the freespace block index from the data block number.
- */
- findex = dp->d_ops->db_to_fdindex(args->geo, dbno);
- /*
- * If it's after the end of the current entries in the
- * freespace block, extend that table.
- */
- if (findex >= freehdr.nvalid) {
- ASSERT(findex < dp->d_ops->free_max_bests(args->geo));
- freehdr.nvalid = findex + 1;
- /*
- * Tag new entry so nused will go up.
- */
- bests[findex] = cpu_to_be16(NULLDATAOFF);
- }
- /*
- * If this entry was for an empty data block
- * (this should always be true) then update the header.
- */
- if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
- freehdr.nused++;
- dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
- xfs_dir2_free_log_header(args, fbp);
- }
- /*
- * Update the real value in the table.
- * We haven't allocated the data entry yet so this will
- * change again.
- */
- hdr = dbp->b_addr;
- bf = dp->d_ops->data_bestfree_p(hdr);
- bests[findex] = bf[0].length;
- logfree = 1;
+ /* Didn't find free space, go on to next free block */
+ xfs_trans_brelse(tp, fbp);
}
+
+found_block:
+ *dbnop = dbno;
+ *fbpp = fbp;
+ *findexp = findex;
+ return 0;
+}
+
+
+/*
+ * Add the data entry for a node-format directory name addition.
+ * The leaf entry is added in xfs_dir2_leafn_add.
+ * We may enter with a freespace block that the lookup found.
+ */
+static int
+xfs_dir2_node_addname_int(
+ struct xfs_da_args *args, /* operation arguments */
+ struct xfs_da_state_blk *fblk) /* optional freespace block */
+{
+ struct xfs_dir2_data_unused *dup; /* data unused entry pointer */
+ struct xfs_dir2_data_entry *dep; /* data entry pointer */
+ struct xfs_dir2_data_hdr *hdr; /* data block header */
+ struct xfs_dir2_data_free *bf;
+ struct xfs_dir2_free *free = NULL; /* freespace block structure */
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *dbp; /* data block buffer */
+ struct xfs_buf *fbp; /* freespace buffer */
+ xfs_dir2_data_aoff_t aoff;
+ xfs_dir2_db_t dbno; /* data block number */
+ int error; /* error return value */
+ int findex; /* freespace entry index */
+ int length; /* length of the new entry */
+ int logfree = 0; /* need to log free entry */
+ int needlog = 0; /* need to log data header */
+ int needscan = 0; /* need to rescan data frees */
+ __be16 *tagp; /* data entry tag pointer */
+ __be16 *bests;
+
+ length = dp->d_ops->data_entsize(args->namelen);
+ error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &findex,
+ length);
+ if (error)
+ return error;
+
/*
- * We had a data block so we don't have to make a new one.
+ * Now we know if we must allocate blocks, so if we are checking whether
+ * we can insert without allocation then we can return now.
*/
- else {
- /*
- * If just checking, we succeeded.
- */
- if (args->op_flags & XFS_DA_OP_JUSTCHECK)
- return 0;
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+ if (dbno == -1)
+ return -ENOSPC;
+ return 0;
+ }
- /*
- * Read the data block in.
- */
+ /*
+ * If we don't have a data block, we need to allocate one and make
+ * the freespace entries refer to it.
+ */
+ if (dbno == -1) {
+ /* we're going to have to log the free block index later */
+ logfree = 1;
+ error = xfs_dir2_node_add_datablk(args, fblk, &dbno, &dbp, &fbp,
+ &findex);
+ } else {
+ /* Read the data block in. */
error = xfs_dir3_data_read(tp, dp,
xfs_dir2_db_to_da(args->geo, dbno),
-1, &dbp);
- if (error)
- return error;
- hdr = dbp->b_addr;
- bf = dp->d_ops->data_bestfree_p(hdr);
- logfree = 0;
}
+ if (error)
+ return error;
+
+ /* setup for data block up now */
+ hdr = dbp->b_addr;
+ bf = dp->d_ops->data_bestfree_p(hdr);
ASSERT(be16_to_cpu(bf[0].length) >= length);
- /*
- * Point to the existing unused space.
- */
+
+ /* Point to the existing unused space. */
dup = (xfs_dir2_data_unused_t *)
((char *)hdr + be16_to_cpu(bf[0].offset));
- needscan = needlog = 0;
- /*
- * Mark the first part of the unused space, inuse for us.
- */
+
+ /* Mark the first part of the unused space, inuse for us. */
aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length,
&needlog, &needscan);
@@ -2013,9 +1920,8 @@ xfs_dir2_node_addname_int(
xfs_trans_brelse(tp, dbp);
return error;
}
- /*
- * Fill in the new entry and log it.
- */
+
+ /* Fill in the new entry and log it. */
dep = (xfs_dir2_data_entry_t *)dup;
dep->inumber = cpu_to_be64(args->inumber);
dep->namelen = args->namelen;
@@ -2024,38 +1930,101 @@ xfs_dir2_node_addname_int(
tagp = dp->d_ops->data_entry_tag_p(dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
xfs_dir2_data_log_entry(args, dbp, dep);
- /*
- * Rescan the block for bestfree if needed.
- */
+
+ /* Rescan the freespace and log the data block if needed. */
if (needscan)
xfs_dir2_data_freescan(dp, hdr, &needlog);
- /*
- * Log the data block header if needed.
- */
if (needlog)
xfs_dir2_data_log_header(args, dbp);
- /*
- * If the freespace entry is now wrong, update it.
- */
- bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */
- if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) {
+
+ /* If the freespace block entry is now wrong, update it. */
+ free = fbp->b_addr;
+ bests = dp->d_ops->free_bests_p(free);
+ if (bests[findex] != bf[0].length) {
bests[findex] = bf[0].length;
logfree = 1;
}
- /*
- * Log the freespace entry if needed.
- */
+
+ /* Log the freespace entry if needed. */
if (logfree)
xfs_dir2_free_log_bests(args, fbp, findex, findex);
- /*
- * Return the data block and offset in args, then drop the data block.
- */
+
+ /* Return the data block and offset in args. */
args->blkno = (xfs_dablk_t)dbno;
args->index = be16_to_cpu(*tagp);
return 0;
}
/*
+ * Top-level node form directory addname routine.
+ */
+int /* error */
+xfs_dir2_node_addname(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ xfs_da_state_blk_t *blk; /* leaf block for insert */
+ int error; /* error return value */
+ int rval; /* sub-return value */
+ xfs_da_state_t *state; /* btree cursor */
+
+ trace_xfs_dir2_node_addname(args);
+
+ /*
+ * Allocate and initialize the state (btree cursor).
+ */
+ state = xfs_da_state_alloc();
+ state->args = args;
+ state->mp = args->dp->i_mount;
+ /*
+ * Look up the name. We're not supposed to find it, but
+ * this gives us the insertion point.
+ */
+ error = xfs_da3_node_lookup_int(state, &rval);
+ if (error)
+ rval = error;
+ if (rval != -ENOENT) {
+ goto done;
+ }
+ /*
+ * Add the data entry to a data block.
+ * Extravalid is set to a freeblock found by lookup.
+ */
+ rval = xfs_dir2_node_addname_int(args,
+ state->extravalid ? &state->extrablk : NULL);
+ if (rval) {
+ goto done;
+ }
+ blk = &state->path.blk[state->path.active - 1];
+ ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+ /*
+ * Add the new leaf entry.
+ */
+ rval = xfs_dir2_leafn_add(blk->bp, args, blk->index);
+ if (rval == 0) {
+ /*
+ * It worked, fix the hash values up the btree.
+ */
+ if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
+ xfs_da3_fixhashpath(state, &state->path);
+ } else {
+ /*
+ * It didn't work, we need to split the leaf block.
+ */
+ if (args->total == 0) {
+ ASSERT(rval == -ENOSPC);
+ goto done;
+ }
+ /*
+ * Split the leaf block and insert the new entry.
+ */
+ rval = xfs_da3_split(state);
+ }
+done:
+ xfs_da_state_free(state);
+ return rval;
+}
+
+/*
* Lookup an entry in a node-format directory.
* All the real work happens in xfs_da3_node_lookup_int.
* The only real output is the inode number of the entry.
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 033589257f54..85f14fc2a8da 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -164,7 +164,7 @@ xfs_dir2_block_to_sf(
* can free the block and copy the formatted data into the inode literal
* area.
*/
- dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
+ dst = kmem_alloc(mp->m_sb.sb_inodesize, 0);
hdr = bp->b_addr;
/*
@@ -436,7 +436,7 @@ xfs_dir2_sf_addname_hard(
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
old_isize = (int)dp->i_d.di_size;
- buf = kmem_alloc(old_isize, KM_SLEEP);
+ buf = kmem_alloc(old_isize, 0);
oldsfp = (xfs_dir2_sf_hdr_t *)buf;
memcpy(oldsfp, sfp, old_isize);
/*
@@ -1096,7 +1096,7 @@ xfs_dir2_sf_toino4(
* Don't want xfs_idata_realloc copying the data here.
*/
oldsize = dp->i_df.if_bytes;
- buf = kmem_alloc(oldsize, KM_SLEEP);
+ buf = kmem_alloc(oldsize, 0);
oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
ASSERT(oldsfp->i8count == 1);
memcpy(buf, oldsfp, oldsize);
@@ -1169,7 +1169,7 @@ xfs_dir2_sf_toino8(
* Don't want xfs_idata_realloc copying the data here.
*/
oldsize = dp->i_df.if_bytes;
- buf = kmem_alloc(oldsize, KM_SLEEP);
+ buf = kmem_alloc(oldsize, 0);
oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
ASSERT(oldsfp->i8count == 0);
memcpy(buf, oldsfp, oldsize);
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 52d03a3a02a4..39dd2b908106 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -287,7 +287,7 @@ struct xfs_ag_geometry {
uint32_t ag_ifree; /* o: inodes free */
uint32_t ag_sick; /* o: sick things in ag */
uint32_t ag_checked; /* o: checked metadata in ag */
- uint32_t ag_reserved32; /* o: zero */
+ uint32_t ag_flags; /* i/o: flags for this ag */
uint64_t ag_reserved[12];/* o: zero */
};
#define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 04377ab75863..588d44613094 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2787,8 +2787,13 @@ xfs_ialloc_setup_geometry(
igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
inodes);
- /* Set the maximum inode count for this filesystem. */
- if (sbp->sb_imax_pct) {
+ /*
+ * Set the maximum inode count for this filesystem, being careful not
+ * to use obviously garbage sb_inopblog/sb_inopblock values. Regular
+ * users should never get here due to failing sb verification, but
+ * certain users (xfs_db) need to be usable even with corrupt metadata.
+ */
+ if (sbp->sb_imax_pct && igeo->ialloc_blks) {
/*
* Make sure the maximum inode count is a multiple
* of the units we allocate inodes in.
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 27aa3f2bc4bc..7bc87408f1a0 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -616,7 +616,7 @@ xfs_iext_realloc_root(
* sequence counter is seen before the modifications to the extent tree itself
* take effect.
*/
-static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state)
+static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp)
{
WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
}
@@ -633,7 +633,7 @@ xfs_iext_insert(
struct xfs_iext_leaf *new = NULL;
int nr_entries, i;
- xfs_iext_inc_seq(ifp, state);
+ xfs_iext_inc_seq(ifp);
if (ifp->if_height == 0)
xfs_iext_alloc_root(ifp, cur);
@@ -875,7 +875,7 @@ xfs_iext_remove(
ASSERT(ifp->if_u1.if_root != NULL);
ASSERT(xfs_iext_valid(ifp, cur));
- xfs_iext_inc_seq(ifp, state);
+ xfs_iext_inc_seq(ifp);
nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1;
for (i = cur->pos; i < nr_entries; i++)
@@ -983,7 +983,7 @@ xfs_iext_update_extent(
{
struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
- xfs_iext_inc_seq(ifp, state);
+ xfs_iext_inc_seq(ifp);
if (cur->pos == 0) {
struct xfs_bmbt_irec old;
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index bf3e04018246..c643beeb5a24 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -94,7 +94,7 @@ xfs_iformat_fork(
return 0;
ASSERT(ip->i_afp == NULL);
- ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
+ ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS);
switch (dip->di_aformat) {
case XFS_DINODE_FMT_LOCAL:
@@ -147,7 +147,7 @@ xfs_init_local_fork(
if (size) {
real_size = roundup(mem_size, 4);
- ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
+ ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS);
memcpy(ifp->if_u1.if_data, data, size);
if (zero_terminate)
ifp->if_u1.if_data[size] = '\0';
@@ -302,7 +302,7 @@ xfs_iformat_btree(
}
ifp->if_broot_bytes = size;
- ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
+ ifp->if_broot = kmem_alloc(size, KM_NOFS);
ASSERT(ifp->if_broot != NULL);
/*
* Copy and convert from the on-disk structure
@@ -367,7 +367,7 @@ xfs_iroot_realloc(
*/
if (ifp->if_broot_bytes == 0) {
new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
- ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+ ifp->if_broot = kmem_alloc(new_size, KM_NOFS);
ifp->if_broot_bytes = (int)new_size;
return;
}
@@ -382,7 +382,7 @@ xfs_iroot_realloc(
new_max = cur_max + rec_diff;
new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
- KM_SLEEP | KM_NOFS);
+ KM_NOFS);
op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
ifp->if_broot_bytes);
np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -408,7 +408,7 @@ xfs_iroot_realloc(
else
new_size = 0;
if (new_size > 0) {
- new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+ new_broot = kmem_alloc(new_size, KM_NOFS);
/*
* First copy over the btree block header.
*/
@@ -492,7 +492,7 @@ xfs_idata_realloc(
* We enforce that here.
*/
ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data,
- roundup(new_size, 4), KM_SLEEP | KM_NOFS);
+ roundup(new_size, 4), KM_NOFS);
ifp->if_bytes = new_size;
}
@@ -683,7 +683,7 @@ xfs_ifork_init_cow(
return;
ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone,
- KM_SLEEP | KM_NOFS);
+ KM_NOFS);
ip->i_cowfp->if_flags = XFS_IFEXTENTS;
ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
ip->i_cnextents = 0;
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 51bb9bdb0e84..9a7fadb1361c 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1174,7 +1174,7 @@ out_cur:
/*
* Record a refcount intent for later processing.
*/
-static int
+static void
__xfs_refcount_add(
struct xfs_trans *tp,
enum xfs_refcount_intent_type type,
@@ -1189,44 +1189,43 @@ __xfs_refcount_add(
blockcount);
ri = kmem_alloc(sizeof(struct xfs_refcount_intent),
- KM_SLEEP | KM_NOFS);
+ KM_NOFS);
INIT_LIST_HEAD(&ri->ri_list);
ri->ri_type = type;
ri->ri_startblock = startblock;
ri->ri_blockcount = blockcount;
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list);
- return 0;
}
/*
* Increase the reference count of the blocks backing a file's extent.
*/
-int
+void
xfs_refcount_increase_extent(
struct xfs_trans *tp,
struct xfs_bmbt_irec *PREV)
{
if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
- return 0;
+ return;
- return __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE,
- PREV->br_startblock, PREV->br_blockcount);
+ __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock,
+ PREV->br_blockcount);
}
/*
* Decrease the reference count of the blocks backing a file's extent.
*/
-int
+void
xfs_refcount_decrease_extent(
struct xfs_trans *tp,
struct xfs_bmbt_irec *PREV)
{
if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
- return 0;
+ return;
- return __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE,
- PREV->br_startblock, PREV->br_blockcount);
+ __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock,
+ PREV->br_blockcount);
}
/*
@@ -1541,47 +1540,40 @@ __xfs_refcount_cow_free(
}
/* Record a CoW staging extent in the refcount btree. */
-int
+void
xfs_refcount_alloc_cow_extent(
struct xfs_trans *tp,
xfs_fsblock_t fsb,
xfs_extlen_t len)
{
struct xfs_mount *mp = tp->t_mountp;
- int error;
if (!xfs_sb_version_hasreflink(&mp->m_sb))
- return 0;
+ return;
- error = __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
- if (error)
- return error;
+ __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
/* Add rmap entry */
- return xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
+ xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
}
/* Forget a CoW staging event in the refcount btree. */
-int
+void
xfs_refcount_free_cow_extent(
struct xfs_trans *tp,
xfs_fsblock_t fsb,
xfs_extlen_t len)
{
struct xfs_mount *mp = tp->t_mountp;
- int error;
if (!xfs_sb_version_hasreflink(&mp->m_sb))
- return 0;
+ return;
/* Remove rmap entry */
- error = xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
+ xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
- if (error)
- return error;
-
- return __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
+ __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
}
struct xfs_refcount_recovery {
@@ -1602,7 +1594,7 @@ xfs_refcount_recover_extent(
if (be32_to_cpu(rec->refc.rc_refcount) != 1)
return -EFSCORRUPTED;
- rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP);
+ rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0);
xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
list_add_tail(&rr->rr_list, debris);
@@ -1679,10 +1671,8 @@ xfs_refcount_recover_cow_leftovers(
/* Free the orphan record */
agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START;
fsb = XFS_AGB_TO_FSB(mp, agno, agbno);
- error = xfs_refcount_free_cow_extent(tp, fsb,
+ xfs_refcount_free_cow_extent(tp, fsb,
rr->rr_rrec.rc_blockcount);
- if (error)
- goto out_trans;
/* Free the block. */
xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 1d9c518575e7..209795539c8d 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -29,9 +29,9 @@ struct xfs_refcount_intent {
xfs_extlen_t ri_blockcount;
};
-extern int xfs_refcount_increase_extent(struct xfs_trans *tp,
+void xfs_refcount_increase_extent(struct xfs_trans *tp,
struct xfs_bmbt_irec *irec);
-extern int xfs_refcount_decrease_extent(struct xfs_trans *tp,
+void xfs_refcount_decrease_extent(struct xfs_trans *tp,
struct xfs_bmbt_irec *irec);
extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp,
@@ -45,10 +45,10 @@ extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur,
xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
xfs_extlen_t *flen, bool find_end_of_shared);
-extern int xfs_refcount_alloc_cow_extent(struct xfs_trans *tp,
- xfs_fsblock_t fsb, xfs_extlen_t len);
-extern int xfs_refcount_free_cow_extent(struct xfs_trans *tp,
- xfs_fsblock_t fsb, xfs_extlen_t len);
+void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
+ xfs_extlen_t len);
+void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
+ xfs_extlen_t len);
extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
xfs_agnumber_t agno);
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index e6aeb390b2fb..38e9414878b3 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -168,7 +168,6 @@ xfs_rmap_btrec_to_irec(
union xfs_btree_rec *rec,
struct xfs_rmap_irec *irec)
{
- irec->rm_flags = 0;
irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock);
irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount);
irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner);
@@ -254,15 +253,15 @@ xfs_rmap_find_left_neighbor_helper(
rec->rm_flags);
if (rec->rm_owner != info->high.rm_owner)
- return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ return 0;
if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset)
- return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ return 0;
*info->irec = *rec;
*info->stat = 1;
- return XFS_BTREE_QUERY_RANGE_ABORT;
+ return -ECANCELED;
}
/*
@@ -305,7 +304,7 @@ xfs_rmap_find_left_neighbor(
error = xfs_rmap_query_range(cur, &info.high, &info.high,
xfs_rmap_find_left_neighbor_helper, &info);
- if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ if (error == -ECANCELED)
error = 0;
if (*stat)
trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
@@ -330,16 +329,16 @@ xfs_rmap_lookup_le_range_helper(
rec->rm_flags);
if (rec->rm_owner != info->high.rm_owner)
- return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ return 0;
if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
(rec->rm_offset > info->high.rm_offset ||
rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset))
- return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ return 0;
*info->irec = *rec;
*info->stat = 1;
- return XFS_BTREE_QUERY_RANGE_ABORT;
+ return -ECANCELED;
}
/*
@@ -377,7 +376,7 @@ xfs_rmap_lookup_le_range(
cur->bc_private.a.agno, bno, 0, owner, offset, flags);
error = xfs_rmap_query_range(cur, &info.high, &info.high,
xfs_rmap_lookup_le_range_helper, &info);
- if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ if (error == -ECANCELED)
error = 0;
if (*stat)
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
@@ -2268,7 +2267,7 @@ xfs_rmap_update_is_needed(
* Record a rmap intent; the list is kept sorted first by AG and then by
* increasing age.
*/
-static int
+static void
__xfs_rmap_add(
struct xfs_trans *tp,
enum xfs_rmap_intent_type type,
@@ -2287,7 +2286,7 @@ __xfs_rmap_add(
bmap->br_blockcount,
bmap->br_state);
- ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_SLEEP | KM_NOFS);
+ ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_NOFS);
INIT_LIST_HEAD(&ri->ri_list);
ri->ri_type = type;
ri->ri_owner = owner;
@@ -2295,11 +2294,10 @@ __xfs_rmap_add(
ri->ri_bmap = *bmap;
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list);
- return 0;
}
/* Map an extent into a file. */
-int
+void
xfs_rmap_map_extent(
struct xfs_trans *tp,
struct xfs_inode *ip,
@@ -2307,15 +2305,15 @@ xfs_rmap_map_extent(
struct xfs_bmbt_irec *PREV)
{
if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
- return 0;
+ return;
- return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
+ __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino,
whichfork, PREV);
}
/* Unmap an extent out of a file. */
-int
+void
xfs_rmap_unmap_extent(
struct xfs_trans *tp,
struct xfs_inode *ip,
@@ -2323,9 +2321,9 @@ xfs_rmap_unmap_extent(
struct xfs_bmbt_irec *PREV)
{
if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
- return 0;
+ return;
- return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
+ __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino,
whichfork, PREV);
}
@@ -2336,7 +2334,7 @@ xfs_rmap_unmap_extent(
* Note that tp can be NULL here as no transaction is used for COW fork
* unwritten conversion.
*/
-int
+void
xfs_rmap_convert_extent(
struct xfs_mount *mp,
struct xfs_trans *tp,
@@ -2345,15 +2343,15 @@ xfs_rmap_convert_extent(
struct xfs_bmbt_irec *PREV)
{
if (!xfs_rmap_update_is_needed(mp, whichfork))
- return 0;
+ return;
- return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
+ __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino,
whichfork, PREV);
}
/* Schedule the creation of an rmap for non-file data. */
-int
+void
xfs_rmap_alloc_extent(
struct xfs_trans *tp,
xfs_agnumber_t agno,
@@ -2364,18 +2362,18 @@ xfs_rmap_alloc_extent(
struct xfs_bmbt_irec bmap;
if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
- return 0;
+ return;
bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
bmap.br_blockcount = len;
bmap.br_startoff = 0;
bmap.br_state = XFS_EXT_NORM;
- return __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap);
+ __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap);
}
/* Schedule the deletion of an rmap for non-file data. */
-int
+void
xfs_rmap_free_extent(
struct xfs_trans *tp,
xfs_agnumber_t agno,
@@ -2386,14 +2384,14 @@ xfs_rmap_free_extent(
struct xfs_bmbt_irec bmap;
if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
- return 0;
+ return;
bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
bmap.br_blockcount = len;
bmap.br_startoff = 0;
bmap.br_state = XFS_EXT_NORM;
- return __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap);
+ __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap);
}
/* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */
@@ -2511,7 +2509,7 @@ xfs_rmap_has_other_keys_helper(
((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags)
return 0;
rks->has_rmap = true;
- return XFS_BTREE_QUERY_RANGE_ABORT;
+ return -ECANCELED;
}
/*
@@ -2540,8 +2538,11 @@ xfs_rmap_has_other_keys(
error = xfs_rmap_query_range(cur, &low, &high,
xfs_rmap_has_other_keys_helper, &rks);
+ if (error < 0)
+ return error;
+
*has_rmap = rks.has_rmap;
- return error;
+ return 0;
}
const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = {
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index e21ed0294e5c..abe633403fd1 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -68,6 +68,7 @@ xfs_rmap_irec_offset_unpack(
if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS))
return -EFSCORRUPTED;
irec->rm_offset = XFS_RMAP_OFF(offset);
+ irec->rm_flags = 0;
if (offset & XFS_RMAP_OFF_ATTR_FORK)
irec->rm_flags |= XFS_RMAP_ATTR_FORK;
if (offset & XFS_RMAP_OFF_BMBT_BLOCK)
@@ -161,16 +162,16 @@ struct xfs_rmap_intent {
};
/* functions for updating the rmapbt based on bmbt map/unmap operations */
-int xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
int whichfork, struct xfs_bmbt_irec *imap);
-int xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
int whichfork, struct xfs_bmbt_irec *imap);
-int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
+void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
struct xfs_inode *ip, int whichfork,
struct xfs_bmbt_irec *imap);
-int xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
+void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
-int xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
+void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index e0641b7337b3..c45acbd3add9 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -177,10 +177,4 @@ struct xfs_ino_geometry {
unsigned int agino_log; /* #bits for agino in inum */
};
-/* Keep iterating the data structure. */
-#define XFS_ITER_CONTINUE (0)
-
-/* Stop iterating the data structure. */
-#define XFS_ITER_ABORT (1)
-
#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 802b34cd10fe..300b3e91ca3a 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -169,6 +169,14 @@ typedef struct xfs_bmbt_irec
xfs_exntst_t br_state; /* extent state */
} xfs_bmbt_irec_t;
+/* per-AG block reservation types */
+enum xfs_ag_resv_type {
+ XFS_AG_RESV_NONE = 0,
+ XFS_AG_RESV_AGFL,
+ XFS_AG_RESV_METADATA,
+ XFS_AG_RESV_RMAPBT,
+};
+
/*
* Type verifier functions
*/
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 16b09b941441..ba0f747c82e8 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -639,7 +639,7 @@ xchk_agfl_block(
xchk_agfl_block_xref(sc, agbno);
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return XFS_ITER_ABORT;
+ return -ECANCELED;
return 0;
}
@@ -730,7 +730,7 @@ xchk_agfl(
/* Check the blocks in the AGFL. */
error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
sc->sa.agfl_bp, xchk_agfl_block, &sai);
- if (error == XFS_ITER_ABORT) {
+ if (error == -ECANCELED) {
error = 0;
goto out_free;
}
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 1afc58bf71dd..0edc7f8eb96e 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -80,7 +80,7 @@ xchk_setup_xattr(
* without the inode lock held, which means we can sleep.
*/
if (sc->flags & XCHK_TRY_HARDER) {
- error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, KM_SLEEP);
+ error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, 0);
if (error)
return error;
}
@@ -163,8 +163,6 @@ xchk_xattr_listent(
args.valuelen = valuelen;
error = xfs_attr_get_ilocked(context->dp, &args);
- if (error == -EEXIST)
- error = 0;
if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno,
&error))
goto fail_xref;
@@ -173,7 +171,7 @@ xchk_xattr_listent(
args.blkno);
fail_xref:
if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- context->seen_enough = XFS_ITER_ABORT;
+ context->seen_enough = 1;
return;
}
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 1bd29fdc2ab5..fa6ea6407992 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -75,6 +75,7 @@ struct xchk_bmap_info {
xfs_fileoff_t lastoff;
bool is_rt;
bool is_shared;
+ bool was_loaded;
int whichfork;
};
@@ -213,25 +214,20 @@ xchk_bmap_xref_rmap(
/* Cross-reference a single rtdev extent record. */
STATIC void
-xchk_bmap_rt_extent_xref(
- struct xchk_bmap_info *info,
+xchk_bmap_rt_iextent_xref(
struct xfs_inode *ip,
- struct xfs_btree_cur *cur,
+ struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec)
{
- if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return;
-
xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
irec->br_blockcount);
}
/* Cross-reference a single datadev extent record. */
STATIC void
-xchk_bmap_extent_xref(
- struct xchk_bmap_info *info,
+xchk_bmap_iextent_xref(
struct xfs_inode *ip,
- struct xfs_btree_cur *cur,
+ struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec)
{
struct xfs_mount *mp = info->sc->mp;
@@ -240,9 +236,6 @@ xchk_bmap_extent_xref(
xfs_extlen_t len;
int error;
- if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return;
-
agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock);
agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
len = irec->br_blockcount;
@@ -300,20 +293,15 @@ xchk_bmap_dirattr_extent(
/* Scrub a single extent record. */
STATIC int
-xchk_bmap_extent(
+xchk_bmap_iextent(
struct xfs_inode *ip,
- struct xfs_btree_cur *cur,
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec)
{
struct xfs_mount *mp = info->sc->mp;
- struct xfs_buf *bp = NULL;
xfs_filblks_t end;
int error = 0;
- if (cur)
- xfs_btree_get_block(cur, 0, &bp);
-
/*
* Check for out-of-order extents. This record could have come
* from the incore list, for which there is no ordering check.
@@ -364,10 +352,13 @@ xchk_bmap_extent(
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
+ if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
if (info->is_rt)
- xchk_bmap_rt_extent_xref(info, ip, cur, irec);
+ xchk_bmap_rt_iextent_xref(ip, info, irec);
else
- xchk_bmap_extent_xref(info, ip, cur, irec);
+ xchk_bmap_iextent_xref(ip, info, irec);
info->lastoff = irec->br_startoff + irec->br_blockcount;
return error;
@@ -380,10 +371,13 @@ xchk_bmapbt_rec(
union xfs_btree_rec *rec)
{
struct xfs_bmbt_irec irec;
+ struct xfs_bmbt_irec iext_irec;
+ struct xfs_iext_cursor icur;
struct xchk_bmap_info *info = bs->private;
struct xfs_inode *ip = bs->cur->bc_private.b.ip;
struct xfs_buf *bp = NULL;
struct xfs_btree_block *block;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork);
uint64_t owner;
int i;
@@ -402,9 +396,26 @@ xchk_bmapbt_rec(
}
}
- /* Set up the in-core record and scrub it. */
+ /*
+ * Check that the incore extent tree contains an extent that matches
+ * this one exactly. We validate those cached bmaps later, so we don't
+ * need to check them here. If the incore extent tree was just loaded
+ * from disk by the scrubber, we assume that its contents match what's
+ * on disk (we still hold the ILOCK) and skip the equivalence check.
+ */
+ if (!info->was_loaded)
+ return 0;
+
xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
- return xchk_bmap_extent(ip, bs->cur, info, &irec);
+ if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur,
+ &iext_irec) ||
+ irec.br_startoff != iext_irec.br_startoff ||
+ irec.br_startblock != iext_irec.br_startblock ||
+ irec.br_blockcount != iext_irec.br_blockcount ||
+ irec.br_state != iext_irec.br_state)
+ xchk_fblock_set_corrupt(bs->sc, info->whichfork,
+ irec.br_startoff);
+ return 0;
}
/* Scan the btree records. */
@@ -415,15 +426,26 @@ xchk_bmap_btree(
struct xchk_bmap_info *info)
{
struct xfs_owner_info oinfo;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork);
struct xfs_mount *mp = sc->mp;
struct xfs_inode *ip = sc->ip;
struct xfs_btree_cur *cur;
int error;
+ /* Load the incore bmap cache if it's not loaded. */
+ info->was_loaded = ifp->if_flags & XFS_IFEXTENTS;
+ if (!info->was_loaded) {
+ error = xfs_iread_extents(sc->tp, ip, whichfork);
+ if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
+ goto out;
+ }
+
+ /* Check the btree structure. */
cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork);
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
error = xchk_btree(sc, cur, xchk_bmapbt_rec, &oinfo, info);
xfs_btree_del_cursor(cur, error);
+out:
return error;
}
@@ -500,7 +522,7 @@ xchk_bmap_check_rmap(
out:
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return XFS_BTREE_QUERY_RANGE_ABORT;
+ return -ECANCELED;
return 0;
}
@@ -529,7 +551,7 @@ xchk_bmap_check_ag_rmaps(
sbcri.sc = sc;
sbcri.whichfork = whichfork;
error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri);
- if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ if (error == -ECANCELED)
error = 0;
xfs_btree_del_cursor(cur, error);
@@ -671,13 +693,6 @@ xchk_bmap(
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
goto out;
- /* Now try to scrub the in-memory extent list. */
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(sc->tp, ip, whichfork);
- if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
- goto out;
- }
-
/* Find the offset of the last extent in the mapping. */
error = xfs_bmap_last_offset(ip, &endoff, whichfork);
if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
@@ -689,7 +704,7 @@ xchk_bmap(
for_each_xfs_iext(ifp, &icur, &irec) {
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
- break;
+ goto out;
if (isnullstartblock(irec.br_startblock))
continue;
if (irec.br_startoff >= endoff) {
@@ -697,7 +712,7 @@ xchk_bmap(
irec.br_startoff);
goto out;
}
- error = xchk_bmap_extent(ip, NULL, &info, &irec);
+ error = xchk_bmap_iextent(ip, &info, &irec);
if (error)
goto out;
}
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index 94c4f1de1922..77ff9f97bcda 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -278,7 +278,11 @@ xchk_da_btree_block_check_sibling(
/* Compare upper level pointer to sibling pointer. */
if (ds->state->altpath.blk[level].blkno != sibling)
xchk_da_set_corrupt(ds, level);
- xfs_trans_brelse(ds->dargs.trans, ds->state->altpath.blk[level].bp);
+ if (ds->state->altpath.blk[level].bp) {
+ xfs_trans_brelse(ds->dargs.trans,
+ ds->state->altpath.blk[level].bp);
+ ds->state->altpath.blk[level].bp = NULL;
+ }
out:
return error;
}
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index fc3f510c9034..98f82d7c8b40 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -125,7 +125,7 @@ xchk_setup_fscounters(
struct xchk_fscounters *fsc;
int error;
- sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), KM_SLEEP);
+ sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0);
if (!sc->buf)
return -ENOMEM;
fsc = sc->buf;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 4cfeec57fb05..b70a88bc975e 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -351,7 +351,7 @@ xrep_init_btblock(
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
- xfs_trans_log_buf(tp, bp, 0, bp->b_length);
+ xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
bp->b_ops = ops;
*bpp = bp;
@@ -664,7 +664,7 @@ xrep_findroot_agfl_walk(
{
xfs_agblock_t *agbno = priv;
- return (*agbno == bno) ? XFS_ITER_ABORT : 0;
+ return (*agbno == bno) ? -ECANCELED : 0;
}
/* Does this block match the btree information passed in? */
@@ -694,7 +694,7 @@ xrep_findroot_block(
if (owner == XFS_RMAP_OWN_AG) {
error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
xrep_findroot_agfl_walk, &agbno);
- if (error == XFS_ITER_ABORT)
+ if (error == -ECANCELED)
return 0;
if (error)
return error;
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 99c0b1234c3c..5641ae512c9e 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -22,7 +22,7 @@ xchk_setup_symlink(
struct xfs_inode *ip)
{
/* Allocate the buffer without the inode lock held. */
- sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, KM_SLEEP);
+ sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0);
if (!sc->buf)
return -ENOMEM;
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index cbda40d40326..96d7071cfa46 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -112,7 +112,7 @@ xfs_get_acl(struct inode *inode, int type)
{
struct xfs_inode *ip = XFS_I(inode);
struct posix_acl *acl = NULL;
- struct xfs_acl *xfs_acl;
+ struct xfs_acl *xfs_acl = NULL;
unsigned char *ea_name;
int error;
int len;
@@ -135,12 +135,8 @@ xfs_get_acl(struct inode *inode, int type)
* go out to the disk.
*/
len = XFS_ACL_MAX_SIZE(ip->i_mount);
- xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
- if (!xfs_acl)
- return ERR_PTR(-ENOMEM);
-
- error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
- &len, ATTR_ROOT);
+ error = xfs_attr_get(ip, ea_name, (unsigned char **)&xfs_acl, &len,
+ ATTR_ALLOC | ATTR_ROOT);
if (error) {
/*
* If the attribute doesn't exist make sure we have a negative
@@ -151,8 +147,8 @@ xfs_get_acl(struct inode *inode, int type)
} else {
acl = xfs_acl_from_disk(xfs_acl, len,
XFS_ACL_MAX_ENTRIES(ip->i_mount));
+ kmem_free(xfs_acl);
}
- kmem_free(xfs_acl);
return acl;
}
@@ -180,7 +176,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
struct xfs_acl *xfs_acl;
int len = XFS_ACL_MAX_SIZE(ip->i_mount);
- xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
+ xfs_acl = kmem_zalloc_large(len, 0);
if (!xfs_acl)
return -ENOMEM;
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index dc93c51c17de..a640a285cc52 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -147,7 +147,7 @@ xfs_attr3_leaf_inactive(
* Allocate storage for a list of all the "remote" value extents.
*/
size = count * sizeof(xfs_attr_inactive_list_t);
- list = kmem_alloc(size, KM_SLEEP);
+ list = kmem_alloc(size, 0);
/*
* Identify each of the "remote" value extents.
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 58fc820a70c6..00758fdc2fec 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -109,7 +109,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
* It didn't all fit, so we have to sort everything on hashval.
*/
sbsize = sf->hdr.count * sizeof(*sbuf);
- sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
+ sbp = sbuf = kmem_alloc(sbsize, KM_NOFS);
/*
* Scan the attribute list for the rest of the entries, storing
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 9fa4a7ee8cfc..83d24e983d4c 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -141,7 +141,7 @@ xfs_bui_init(
{
struct xfs_bui_log_item *buip;
- buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP);
+ buip = kmem_zone_zalloc(xfs_bui_zone, 0);
xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
@@ -218,7 +218,7 @@ xfs_trans_get_bud(
{
struct xfs_bud_log_item *budp;
- budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP);
+ budp = kmem_zone_zalloc(xfs_bud_zone, 0);
xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
&xfs_bud_item_ops);
budp->bud_buip = buip;
@@ -542,9 +542,7 @@ xfs_bui_recover(
irec.br_blockcount = count;
irec.br_startoff = bmap->me_startoff;
irec.br_state = state;
- error = xfs_bmap_unmap_extent(tp, ip, &irec);
- if (error)
- goto err_inode;
+ xfs_bmap_unmap_extent(tp, ip, &irec);
}
set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 98c6a7a71427..0910cb75b65d 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -39,9 +39,9 @@
xfs_daddr_t
xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
{
- return (XFS_IS_REALTIME_INODE(ip) ? \
- (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
- XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
+ if (XFS_IS_REALTIME_INODE(ip))
+ return XFS_FSB_TO_BB(ip->i_mount, fsb);
+ return XFS_FSB_TO_DADDR(ip->i_mount, fsb);
}
/*
@@ -1532,24 +1532,16 @@ xfs_swap_extent_rmap(
trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
/* Remove the mapping from the donor file. */
- error = xfs_bmap_unmap_extent(tp, tip, &uirec);
- if (error)
- goto out;
+ xfs_bmap_unmap_extent(tp, tip, &uirec);
/* Remove the mapping from the source file. */
- error = xfs_bmap_unmap_extent(tp, ip, &irec);
- if (error)
- goto out;
+ xfs_bmap_unmap_extent(tp, ip, &irec);
/* Map the donor file's blocks into the source file. */
- error = xfs_bmap_map_extent(tp, ip, &uirec);
- if (error)
- goto out;
+ xfs_bmap_map_extent(tp, ip, &uirec);
/* Map the source file's blocks into the donor file. */
- error = xfs_bmap_map_extent(tp, tip, &irec);
- if (error)
- goto out;
+ xfs_bmap_map_extent(tp, tip, &irec);
error = xfs_defer_finish(tpp);
tp = *tpp;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index ca0849043f54..120ef99d09e8 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -353,7 +353,8 @@ xfs_buf_allocate_memory(
*/
size = BBTOB(bp->b_length);
if (size < PAGE_SIZE) {
- bp->b_addr = kmem_alloc(size, KM_NOFS);
+ int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
+ bp->b_addr = kmem_alloc_io(size, align_mask, KM_NOFS);
if (!bp->b_addr) {
/* low memory - use alloc_page loop instead */
goto use_alloc_page;
@@ -368,7 +369,7 @@ xfs_buf_allocate_memory(
}
bp->b_offset = offset_in_page(bp->b_addr);
bp->b_pages = bp->b_page_array;
- bp->b_pages[0] = virt_to_page(bp->b_addr);
+ bp->b_pages[0] = kmem_to_page(bp->b_addr);
bp->b_page_count = 1;
bp->b_flags |= _XBF_KMEM;
return 0;
@@ -1741,7 +1742,7 @@ xfs_alloc_buftarg(
{
xfs_buftarg_t *btp;
- btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS);
+ btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c6e57a3f409e..f6ce17d8d848 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -350,6 +350,12 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
+static inline int
+xfs_buftarg_dma_alignment(struct xfs_buftarg *bt)
+{
+ return queue_dma_alignment(bt->bt_bdev->bd_disk->queue);
+}
+
int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 7dcaec54a20b..d74fbd1e9d3e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -702,7 +702,7 @@ xfs_buf_item_get_format(
}
bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
- KM_SLEEP);
+ 0);
if (!bip->bli_formats)
return -ENOMEM;
return 0;
@@ -747,7 +747,7 @@ xfs_buf_item_init(
return 0;
}
- bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP);
+ bip = kmem_zone_zalloc(xfs_buf_item_zone, 0);
xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
bip->bli_buf = bp;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index fb1ad4483081..aeb95e7391c1 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -440,7 +440,7 @@ xfs_dquot_alloc(
{
struct xfs_dquot *dqp;
- dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
+ dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0);
dqp->dq_flags = type;
dqp->q_core.d_id = cpu_to_be32(id);
@@ -1239,7 +1239,7 @@ xfs_qm_exit(void)
/*
* Iterate every dquot of a particular type. The caller must ensure that the
* particular quota type is active. iter_fn can return negative error codes,
- * or XFS_ITER_ABORT to indicate that it wants to stop iterating.
+ * or -ECANCELED to indicate that it wants to stop iterating.
*/
int
xfs_qm_dqiterate(
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 282ec5af293e..d60647d7197b 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -347,7 +347,7 @@ xfs_qm_qoff_logitem_init(
{
struct xfs_qoff_logitem *qf;
- qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP);
+ qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), 0);
xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
&xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 544c9482a0ef..849fd4476950 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -213,7 +213,7 @@ xfs_errortag_init(
struct xfs_mount *mp)
{
mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
- KM_SLEEP | KM_MAYFAIL);
+ KM_MAYFAIL);
if (!mp->m_errortag)
return -ENOMEM;
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 0ed68379e551..2183d87be4cf 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -33,7 +33,7 @@ xfs_extent_busy_insert(
struct rb_node **rbp;
struct rb_node *parent = NULL;
- new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP);
+ new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
new->agno = agno;
new->bno = bno;
new->length = len;
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 86f6512d6864..e44efc41a041 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -163,9 +163,9 @@ xfs_efi_init(
if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
size = (uint)(sizeof(xfs_efi_log_item_t) +
((nextents - 1) * sizeof(xfs_extent_t)));
- efip = kmem_zalloc(size, KM_SLEEP);
+ efip = kmem_zalloc(size, 0);
} else {
- efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP);
+ efip = kmem_zone_zalloc(xfs_efi_zone, 0);
}
xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
@@ -333,9 +333,9 @@ xfs_trans_get_efd(
if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
(nextents - 1) * sizeof(struct xfs_extent),
- KM_SLEEP);
+ 0);
} else {
- efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP);
+ efdp = kmem_zone_zalloc(xfs_efd_zone, 0);
}
xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 21bd3d575aaa..1ffb179f35d2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -28,6 +28,7 @@
#include <linux/falloc.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
+#include <linux/fadvise.h>
static const struct vm_operations_struct xfs_file_vm_ops;
@@ -939,6 +940,30 @@ out_unlock:
return error;
}
+STATIC int
+xfs_file_fadvise(
+ struct file *file,
+ loff_t start,
+ loff_t end,
+ int advice)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(file));
+ int ret;
+ int lockflags = 0;
+
+ /*
+ * Operations creating pages in page cache need protection from hole
+ * punching and similar ops
+ */
+ if (advice == POSIX_FADV_WILLNEED) {
+ lockflags = XFS_IOLOCK_SHARED;
+ xfs_ilock(ip, lockflags);
+ }
+ ret = generic_fadvise(file, start, end, advice);
+ if (lockflags)
+ xfs_iunlock(ip, lockflags);
+ return ret;
+}
STATIC loff_t
xfs_file_remap_range(
@@ -1238,6 +1263,7 @@ const struct file_operations xfs_file_operations = {
.fsync = xfs_file_fsync,
.get_unmapped_area = thp_get_unmapped_area,
.fallocate = xfs_file_fallocate,
+ .fadvise = xfs_file_fadvise,
.remap_file_range = xfs_file_remap_range,
};
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 5a8f9641562a..d082143feb5a 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -250,7 +250,7 @@ xfs_getfsmap_helper(
rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
if (info->next_daddr < rec_daddr)
info->next_daddr = rec_daddr;
- return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ return 0;
}
/* Are we just counting mappings? */
@@ -259,14 +259,14 @@ xfs_getfsmap_helper(
info->head->fmh_entries++;
if (info->last)
- return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ return 0;
info->head->fmh_entries++;
rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
if (info->next_daddr < rec_daddr)
info->next_daddr = rec_daddr;
- return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ return 0;
}
/*
@@ -276,7 +276,7 @@ xfs_getfsmap_helper(
*/
if (rec_daddr > info->next_daddr) {
if (info->head->fmh_entries >= info->head->fmh_count)
- return XFS_BTREE_QUERY_RANGE_ABORT;
+ return -ECANCELED;
fmr.fmr_device = info->dev;
fmr.fmr_physical = info->next_daddr;
@@ -295,7 +295,7 @@ xfs_getfsmap_helper(
/* Fill out the extent we found */
if (info->head->fmh_entries >= info->head->fmh_count)
- return XFS_BTREE_QUERY_RANGE_ABORT;
+ return -ECANCELED;
trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec);
@@ -328,7 +328,7 @@ out:
rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
if (info->next_daddr < rec_daddr)
info->next_daddr = rec_daddr;
- return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ return 0;
}
/* Transform a rmapbt irec into a fsmap */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0b0fd10a36d4..944add5ff8e0 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -40,7 +40,7 @@ xfs_inode_alloc(
* KM_MAYFAIL and return NULL here on ENOMEM. Set the
* code up to do this anyway.
*/
- ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
+ ip = kmem_zone_alloc(xfs_inode_zone, 0);
if (!ip)
return NULL;
if (inode_init_always(mp->m_super, VFS_I(ip))) {
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index d99a0a3e5f40..3ebd1b7f49d8 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -89,7 +89,7 @@ xfs_icreate_log(
{
struct xfs_icreate_item *icp;
- icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP);
+ icp = kmem_zone_zalloc(xfs_icreate_zone, 0);
xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
&xfs_icreate_item_ops);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6467d5e1df2d..18f4b262e61c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2018,7 +2018,7 @@ xfs_iunlink_add_backref(
if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
return 0;
- iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
+ iu = kmem_zalloc(sizeof(*iu), KM_NOFS);
iu->iu_agino = prev_agino;
iu->iu_next_unlinked = this_agino;
@@ -3282,7 +3282,8 @@ xfs_rename(
spaceres);
/*
- * Set up the target.
+ * Check for expected errors before we dirty the transaction
+ * so we can return an error without a transaction abort.
*/
if (target_ip == NULL) {
/*
@@ -3294,6 +3295,46 @@ xfs_rename(
if (error)
goto out_trans_cancel;
}
+ } else {
+ /*
+ * If target exists and it's a directory, check that whether
+ * it can be destroyed.
+ */
+ if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
+ (!xfs_dir_isempty(target_ip) ||
+ (VFS_I(target_ip)->i_nlink > 2))) {
+ error = -EEXIST;
+ goto out_trans_cancel;
+ }
+ }
+
+ /*
+ * Directory entry creation below may acquire the AGF. Remove
+ * the whiteout from the unlinked list first to preserve correct
+ * AGI/AGF locking order. This dirties the transaction so failures
+ * after this point will abort and log recovery will clean up the
+ * mess.
+ *
+ * For whiteouts, we need to bump the link count on the whiteout
+ * inode. After this point, we have a real link, clear the tmpfile
+ * state flag from the inode so it doesn't accidentally get misused
+ * in future.
+ */
+ if (wip) {
+ ASSERT(VFS_I(wip)->i_nlink == 0);
+ error = xfs_iunlink_remove(tp, wip);
+ if (error)
+ goto out_trans_cancel;
+
+ xfs_bumplink(tp, wip);
+ xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
+ VFS_I(wip)->i_state &= ~I_LINKABLE;
+ }
+
+ /*
+ * Set up the target.
+ */
+ if (target_ip == NULL) {
/*
* If target does not exist and the rename crosses
* directories, adjust the target directory link count
@@ -3312,22 +3353,6 @@ xfs_rename(
}
} else { /* target_ip != NULL */
/*
- * If target exists and it's a directory, check that both
- * target and source are directories and that target can be
- * destroyed, or that neither is a directory.
- */
- if (S_ISDIR(VFS_I(target_ip)->i_mode)) {
- /*
- * Make sure target dir is empty.
- */
- if (!(xfs_dir_isempty(target_ip)) ||
- (VFS_I(target_ip)->i_nlink > 2)) {
- error = -EEXIST;
- goto out_trans_cancel;
- }
- }
-
- /*
* Link the source inode under the target name.
* If the source inode is a directory and we are moving
* it across directories, its ".." entry will be
@@ -3417,30 +3442,6 @@ xfs_rename(
if (error)
goto out_trans_cancel;
- /*
- * For whiteouts, we need to bump the link count on the whiteout inode.
- * This means that failures all the way up to this point leave the inode
- * on the unlinked list and so cleanup is a simple matter of dropping
- * the remaining reference to it. If we fail here after bumping the link
- * count, we're shutting down the filesystem so we'll never see the
- * intermediate state on disk.
- */
- if (wip) {
- ASSERT(VFS_I(wip)->i_nlink == 0);
- xfs_bumplink(tp, wip);
- error = xfs_iunlink_remove(tp, wip);
- if (error)
- goto out_trans_cancel;
- xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
-
- /*
- * Now we have a real link, clear the "I'm a tmpfile" state
- * flag from the inode so it doesn't accidentally get misused in
- * future.
- */
- VFS_I(wip)->i_state &= ~I_LINKABLE;
- }
-
xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
if (new_parent)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c9a502eed204..bb8f076805b9 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -651,7 +651,7 @@ xfs_inode_item_init(
struct xfs_inode_log_item *iip;
ASSERT(ip->i_itemp == NULL);
- iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
+ iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0);
iip->ili_inode = ip;
xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 6f7848cd5527..d58f0d6a699e 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -67,7 +67,7 @@ xfs_find_handle(
return -EBADF;
inode = file_inode(f.file);
} else {
- error = user_lpath((const char __user *)hreq->path, &path);
+ error = user_path_at(AT_FDCWD, hreq->path, 0, &path);
if (error)
return error;
inode = d_inode(path.dentry);
@@ -396,7 +396,7 @@ xfs_attrlist_by_handle(
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP);
+ kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
if (!kbuf)
goto out_dput;
@@ -434,11 +434,11 @@ xfs_attrmulti_attr_get(
if (*len > XFS_XATTR_SIZE_MAX)
return -EINVAL;
- kbuf = kmem_zalloc_large(*len, KM_SLEEP);
+ kbuf = kmem_zalloc_large(*len, 0);
if (!kbuf)
return -ENOMEM;
- error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
+ error = xfs_attr_get(XFS_I(inode), name, &kbuf, (int *)len, flags);
if (error)
goto out_kfree;
@@ -831,7 +831,7 @@ xfs_bulkstat_fmt(
/*
* Check the incoming bulk request @hdr from userspace and initialize the
* internal @breq bulk request appropriately. Returns 0 if the bulk request
- * should proceed; XFS_ITER_ABORT if there's nothing to do; or the usual
+ * should proceed; -ECANCELED if there's nothing to do; or the usual
* negative error code.
*/
static int
@@ -889,13 +889,13 @@ xfs_bulk_ireq_setup(
/* Asking for an inode past the end of the AG? We're done! */
if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno)
- return XFS_ITER_ABORT;
+ return -ECANCELED;
} else if (hdr->agno)
return -EINVAL;
/* Asking for an inode past the end of the FS? We're done! */
if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount)
- return XFS_ITER_ABORT;
+ return -ECANCELED;
return 0;
}
@@ -936,7 +936,7 @@ xfs_ioc_bulkstat(
return -EFAULT;
error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat);
- if (error == XFS_ITER_ABORT)
+ if (error == -ECANCELED)
goto out_teardown;
if (error < 0)
return error;
@@ -986,7 +986,7 @@ xfs_ioc_inumbers(
return -EFAULT;
error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers);
- if (error == XFS_ITER_ABORT)
+ if (error == -ECANCELED)
goto out_teardown;
if (error < 0)
return error;
@@ -1038,6 +1038,10 @@ xfs_ioc_ag_geometry(
if (copy_from_user(&ageo, arg, sizeof(ageo)))
return -EFAULT;
+ if (ageo.ag_flags)
+ return -EINVAL;
+ if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved)))
+ return -EINVAL;
error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo);
if (error)
@@ -1309,8 +1313,7 @@ xfs_ioctl_setattr_dax_invalidate(
if (fa->fsx_xflags & FS_XFLAG_DAX) {
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
return -EINVAL;
- if (S_ISREG(inode->i_mode) &&
- !bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
+ if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
sb->s_blocksize))
return -EINVAL;
}
@@ -1881,7 +1884,7 @@ xfs_ioc_getfsmap(
info.mp = ip->i_mount;
info.data = arg;
error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info);
- if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+ if (error == -ECANCELED) {
error = 0;
aborted = true;
} else if (error)
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 7fcf7569743f..1e08bf79b478 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -381,7 +381,7 @@ xfs_compat_attrlist_by_handle(
return PTR_ERR(dentry);
error = -ENOMEM;
- kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP);
+ kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
if (!kbuf)
goto out_dput;
@@ -547,63 +547,12 @@ xfs_file_compat_ioctl(
struct inode *inode = file_inode(filp);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- void __user *arg = (void __user *)p;
+ void __user *arg = compat_ptr(p);
int error;
trace_xfs_file_compat_ioctl(ip);
switch (cmd) {
- /* No size or alignment issues on any arch */
- case XFS_IOC_DIOINFO:
- case XFS_IOC_FSGEOMETRY_V4:
- case XFS_IOC_FSGEOMETRY:
- case XFS_IOC_AG_GEOMETRY:
- case XFS_IOC_FSGETXATTR:
- case XFS_IOC_FSSETXATTR:
- case XFS_IOC_FSGETXATTRA:
- case XFS_IOC_FSSETDM:
- case XFS_IOC_GETBMAP:
- case XFS_IOC_GETBMAPA:
- case XFS_IOC_GETBMAPX:
- case XFS_IOC_FSCOUNTS:
- case XFS_IOC_SET_RESBLKS:
- case XFS_IOC_GET_RESBLKS:
- case XFS_IOC_FSGROWFSLOG:
- case XFS_IOC_GOINGDOWN:
- case XFS_IOC_ERROR_INJECTION:
- case XFS_IOC_ERROR_CLEARALL:
- case FS_IOC_GETFSMAP:
- case XFS_IOC_SCRUB_METADATA:
- case XFS_IOC_BULKSTAT:
- case XFS_IOC_INUMBERS:
- return xfs_file_ioctl(filp, cmd, p);
-#if !defined(BROKEN_X86_ALIGNMENT) || defined(CONFIG_X86_X32)
- /*
- * These are handled fine if no alignment issues. To support x32
- * which uses native 64-bit alignment we must emit these cases in
- * addition to the ia-32 compat set below.
- */
- case XFS_IOC_ALLOCSP:
- case XFS_IOC_FREESP:
- case XFS_IOC_RESVSP:
- case XFS_IOC_UNRESVSP:
- case XFS_IOC_ALLOCSP64:
- case XFS_IOC_FREESP64:
- case XFS_IOC_RESVSP64:
- case XFS_IOC_UNRESVSP64:
- case XFS_IOC_FSGEOMETRY_V1:
- case XFS_IOC_FSGROWFSDATA:
- case XFS_IOC_FSGROWFSRT:
- case XFS_IOC_ZERO_RANGE:
-#ifdef CONFIG_X86_X32
- /*
- * x32 special: this gets a different cmd number from the ia-32 compat
- * case below; the associated data will match native 64-bit alignment.
- */
- case XFS_IOC_SWAPEXT:
-#endif
- return xfs_file_ioctl(filp, cmd, p);
-#endif
#if defined(BROKEN_X86_ALIGNMENT)
case XFS_IOC_ALLOCSP_32:
case XFS_IOC_FREESP_32:
@@ -705,6 +654,7 @@ xfs_file_compat_ioctl(
case XFS_IOC_FSSETDM_BY_HANDLE_32:
return xfs_compat_fssetdm_by_handle(filp, arg);
default:
- return -ENOIOCTLCMD;
+ /* try the native version */
+ return xfs_file_ioctl(filp, cmd, (unsigned long)arg);
}
}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 3a4310d7cb59..f780e223b118 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -58,7 +58,7 @@ xfs_bmbt_to_iomap(
{
struct xfs_mount *mp = ip->i_mount;
- if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+ if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
return xfs_alert_fsblock_zero(ip, imap);
if (imap->br_startblock == HOLESTARTBLOCK) {
@@ -297,7 +297,7 @@ xfs_iomap_write_direct(
goto out_unlock;
}
- if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
+ if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
error = xfs_alert_fsblock_zero(ip, imap);
out_unlock:
@@ -814,7 +814,7 @@ xfs_iomap_write_unwritten(
if (error)
return error;
- if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
+ if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock)))
return xfs_alert_fsblock_zero(ip, &imap);
if ((numblks_fsb = imap.br_blockcount) == 0) {
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ff3c1fae5357..fe285d123d69 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -793,6 +793,7 @@ xfs_setattr_nonsize(
out_cancel:
xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_dqrele:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index a8a06bb78ea8..884950adbd16 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -137,7 +137,7 @@ xfs_bulkstat_one_int(
xfs_irele(ip);
error = bc->formatter(bc->breq, buf);
- if (error == XFS_IBULK_ABORT)
+ if (error == -ECANCELED)
goto out_advance;
if (error)
goto out;
@@ -169,7 +169,7 @@ xfs_bulkstat_one(
ASSERT(breq->icount == 1);
bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
- KM_SLEEP | KM_MAYFAIL);
+ KM_MAYFAIL);
if (!bc.buf)
return -ENOMEM;
@@ -181,7 +181,7 @@ xfs_bulkstat_one(
* If we reported one inode to userspace then we abort because we hit
* the end of the buffer. Don't leak that back to userspace.
*/
- if (error == XFS_IWALK_ABORT)
+ if (error == -ECANCELED)
error = 0;
return error;
@@ -243,7 +243,7 @@ xfs_bulkstat(
return 0;
bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
- KM_SLEEP | KM_MAYFAIL);
+ KM_MAYFAIL);
if (!bc.buf)
return -ENOMEM;
@@ -272,6 +272,7 @@ xfs_bulkstat_to_bstat(
struct xfs_bstat *bs1,
const struct xfs_bulkstat *bstat)
{
+ /* memset is needed here because of padding holes in the structure. */
memset(bs1, 0, sizeof(struct xfs_bstat));
bs1->bs_ino = bstat->bs_ino;
bs1->bs_mode = bstat->bs_mode;
@@ -341,7 +342,7 @@ xfs_inumbers_walk(
int error;
error = ic->formatter(ic->breq, &inogrp);
- if (error && error != XFS_IBULK_ABORT)
+ if (error && error != -ECANCELED)
return error;
ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) +
@@ -388,6 +389,8 @@ xfs_inumbers_to_inogrp(
struct xfs_inogrp *ig1,
const struct xfs_inumbers *ig)
{
+ /* memset is needed here because of padding holes in the structure. */
+ memset(ig1, 0, sizeof(struct xfs_inogrp));
ig1->xi_startino = ig->xi_startino;
ig1->xi_alloccount = ig->xi_alloccount;
ig1->xi_allocmask = ig->xi_allocmask;
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index e90c1fc5b981..96a1e2a9be3f 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -18,9 +18,6 @@ struct xfs_ibulk {
/* Only iterate within the same AG as startino */
#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG)
-/* Return value that means we want to abort the walk. */
-#define XFS_IBULK_ABORT (XFS_IWALK_ABORT)
-
/*
* Advance the user buffer pointer by one record of the given size. If the
* buffer is now full, return the appropriate error code.
@@ -34,13 +31,21 @@ xfs_ibulk_advance(
breq->ubuffer = b + bytes;
breq->ocount++;
- return breq->ocount == breq->icount ? XFS_IBULK_ABORT : 0;
+ return breq->ocount == breq->icount ? -ECANCELED : 0;
}
/*
* Return stat information in bulk (by-inode) for the filesystem.
*/
+/*
+ * Return codes for the formatter function are 0 to continue iterating, and
+ * non-zero to stop iterating. Any non-zero value will be passed up to the
+ * bulkstat/inumbers caller. The special value -ECANCELED can be used to stop
+ * iteration, as neither bulkstat nor inumbers will ever generate that error
+ * code on their own.
+ */
+
typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq,
const struct xfs_bulkstat *bstat);
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 8c7d727149ea..aa375cf53021 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -31,7 +31,7 @@
* inode it finds, it calls a walk function with the relevant inode number and
* a pointer to caller-provided data. The walk function can return the usual
* negative error code to stop the iteration; 0 to continue the iteration; or
- * XFS_IWALK_ABORT to stop the iteration. This return value is returned to the
+ * -ECANCELED to stop the iteration. This return value is returned to the
* caller.
*
* Internally, we allow the walk function to do anything, which means that we
@@ -616,7 +616,7 @@ xfs_iwalk_threaded(
if (xfs_pwork_ctl_want_abort(&pctl))
break;
- iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP);
+ iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
iwag->mp = mp;
iwag->iwalk_fn = iwalk_fn;
iwag->data = data;
diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
index 6c960e10ed4d..37a795f03267 100644
--- a/fs/xfs/xfs_iwalk.h
+++ b/fs/xfs/xfs_iwalk.h
@@ -6,12 +6,17 @@
#ifndef __XFS_IWALK_H__
#define __XFS_IWALK_H__
+/*
+ * Return codes for the inode/inobt walk function are 0 to continue iterating,
+ * and non-zero to stop iterating. Any non-zero value will be passed up to the
+ * iwalk or inobt_walk caller. The special value -ECANCELED can be used to
+ * stop iteration, as neither iwalk nor inobt_walk will ever generate that
+ * error code on their own.
+ */
+
/* Walk all inodes in the filesystem starting from @startino. */
typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_ino_t ino, void *data);
-/* Return values for xfs_iwalk_fn. */
-#define XFS_IWALK_CONTINUE (XFS_ITER_CONTINUE)
-#define XFS_IWALK_ABORT (XFS_ITER_ABORT)
int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino,
unsigned int flags, xfs_iwalk_fn iwalk_fn,
@@ -30,8 +35,6 @@ typedef int (*xfs_inobt_walk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno,
const struct xfs_inobt_rec_incore *irec,
void *data);
-/* Return value (for xfs_inobt_walk_fn) that aborts the walk immediately. */
-#define XFS_INOBT_WALK_ABORT (XFS_IWALK_ABORT)
int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_ino_t startino, unsigned int flags,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 00e9f5c388d3..a2beee9f74da 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -214,15 +214,42 @@ xlog_grant_head_wake(
{
struct xlog_ticket *tic;
int need_bytes;
+ bool woken_task = false;
list_for_each_entry(tic, &head->waiters, t_queue) {
+
+ /*
+ * There is a chance that the size of the CIL checkpoints in
+ * progress at the last AIL push target calculation resulted in
+ * limiting the target to the log head (l_last_sync_lsn) at the
+ * time. This may not reflect where the log head is now as the
+ * CIL checkpoints may have completed.
+ *
+ * Hence when we are woken here, it may be that the head of the
+ * log that has moved rather than the tail. As the tail didn't
+ * move, there still won't be space available for the
+ * reservation we require. However, if the AIL has already
+ * pushed to the target defined by the old log head location, we
+ * will hang here waiting for something else to update the AIL
+ * push target.
+ *
+ * Therefore, if there isn't space to wake the first waiter on
+ * the grant head, we need to push the AIL again to ensure the
+ * target reflects both the current log tail and log head
+ * position before we wait for the tail to move again.
+ */
+
need_bytes = xlog_ticket_reservation(log, head, tic);
- if (*free_bytes < need_bytes)
+ if (*free_bytes < need_bytes) {
+ if (!woken_task)
+ xlog_grant_push_ail(log, need_bytes);
return false;
+ }
*free_bytes -= need_bytes;
trace_xfs_log_grant_wake_up(log, tic);
wake_up_process(tic->t_task);
+ woken_task = true;
}
return true;
@@ -428,11 +455,7 @@ xfs_log_reserve(
XFS_STATS_INC(mp, xs_try_logspace);
ASSERT(*ticp == NULL);
- tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
- KM_SLEEP | KM_MAYFAIL);
- if (!tic)
- return -ENOMEM;
-
+ tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0);
*ticp = tic;
xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -1407,6 +1430,7 @@ xlog_alloc_log(
*/
ASSERT(log->l_iclog_size >= 4096);
for (i = 0; i < log->l_iclog_bufs; i++) {
+ int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp);
size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
sizeof(struct bio_vec);
@@ -1418,8 +1442,8 @@ xlog_alloc_log(
iclog->ic_prev = prev_iclog;
prev_iclog = iclog;
- iclog->ic_data = kmem_alloc_large(log->l_iclog_size,
- KM_MAYFAIL);
+ iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask,
+ KM_MAYFAIL);
if (!iclog->ic_data)
goto out_free_iclog;
#ifdef DEBUG
@@ -2499,21 +2523,35 @@ next_lv:
*****************************************************************************
*/
-/* Clean iclogs starting from the head. This ordering must be
- * maintained, so an iclog doesn't become ACTIVE beyond one that
- * is SYNCING. This is also required to maintain the notion that we use
- * a ordered wait queue to hold off would be writers to the log when every
- * iclog is trying to sync to disk.
+/*
+ * An iclog has just finished IO completion processing, so we need to update
+ * the iclog state and propagate that up into the overall log state. Hence we
+ * prepare the iclog for cleaning, and then clean all the pending dirty iclogs
+ * starting from the head, and then wake up any threads that are waiting for the
+ * iclog to be marked clean.
*
- * State Change: DIRTY -> ACTIVE
+ * The ordering of marking iclogs ACTIVE must be maintained, so an iclog
+ * doesn't become ACTIVE beyond one that is SYNCING. This is also required to
+ * maintain the notion that we use a ordered wait queue to hold off would be
+ * writers to the log when every iclog is trying to sync to disk.
+ *
+ * Caller must hold the icloglock before calling us.
+ *
+ * State Change: !IOERROR -> DIRTY -> ACTIVE
*/
STATIC void
-xlog_state_clean_log(
- struct xlog *log)
+xlog_state_clean_iclog(
+ struct xlog *log,
+ struct xlog_in_core *dirty_iclog)
{
- xlog_in_core_t *iclog;
- int changed = 0;
+ struct xlog_in_core *iclog;
+ int changed = 0;
+ /* Prepare the completed iclog. */
+ if (!(dirty_iclog->ic_state & XLOG_STATE_IOERROR))
+ dirty_iclog->ic_state = XLOG_STATE_DIRTY;
+
+ /* Walk all the iclogs to update the ordered active state. */
iclog = log->l_iclog;
do {
if (iclog->ic_state == XLOG_STATE_DIRTY) {
@@ -2551,7 +2589,13 @@ xlog_state_clean_log(
iclog = iclog->ic_next;
} while (iclog != log->l_iclog);
- /* log is locked when we are called */
+
+ /*
+ * Wake up threads waiting in xfs_log_force() for the dirty iclog
+ * to be cleaned.
+ */
+ wake_up_all(&dirty_iclog->ic_force_wait);
+
/*
* Change state for the dummy log recording.
* We usually go to NEED. But we go to NEED2 if the changed indicates
@@ -2585,7 +2629,7 @@ xlog_state_clean_log(
ASSERT(0);
}
}
-} /* xlog_state_clean_log */
+}
STATIC xfs_lsn_t
xlog_get_lowest_lsn(
@@ -2606,30 +2650,205 @@ xlog_get_lowest_lsn(
return lowest_lsn;
}
+/*
+ * Completion of a iclog IO does not imply that a transaction has completed, as
+ * transactions can be large enough to span many iclogs. We cannot change the
+ * tail of the log half way through a transaction as this may be the only
+ * transaction in the log and moving the tail to point to the middle of it
+ * will prevent recovery from finding the start of the transaction. Hence we
+ * should only update the last_sync_lsn if this iclog contains transaction
+ * completion callbacks on it.
+ *
+ * We have to do this before we drop the icloglock to ensure we are the only one
+ * that can update it.
+ *
+ * If we are moving the last_sync_lsn forwards, we also need to ensure we kick
+ * the reservation grant head pushing. This is due to the fact that the push
+ * target is bound by the current last_sync_lsn value. Hence if we have a large
+ * amount of log space bound up in this committing transaction then the
+ * last_sync_lsn value may be the limiting factor preventing tail pushing from
+ * freeing space in the log. Hence once we've updated the last_sync_lsn we
+ * should push the AIL to ensure the push target (and hence the grant head) is
+ * no longer bound by the old log head location and can move forwards and make
+ * progress again.
+ */
+static void
+xlog_state_set_callback(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ xfs_lsn_t header_lsn)
+{
+ iclog->ic_state = XLOG_STATE_CALLBACK;
+
+ ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
+ header_lsn) <= 0);
+
+ if (list_empty_careful(&iclog->ic_callbacks))
+ return;
+
+ atomic64_set(&log->l_last_sync_lsn, header_lsn);
+ xlog_grant_push_ail(log, 0);
+}
+
+/*
+ * Return true if we need to stop processing, false to continue to the next
+ * iclog. The caller will need to run callbacks if the iclog is returned in the
+ * XLOG_STATE_CALLBACK state.
+ */
+static bool
+xlog_state_iodone_process_iclog(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ struct xlog_in_core *completed_iclog,
+ bool *ioerror)
+{
+ xfs_lsn_t lowest_lsn;
+ xfs_lsn_t header_lsn;
+
+ /* Skip all iclogs in the ACTIVE & DIRTY states */
+ if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))
+ return false;
+
+ /*
+ * Between marking a filesystem SHUTDOWN and stopping the log, we do
+ * flush all iclogs to disk (if there wasn't a log I/O error). So, we do
+ * want things to go smoothly in case of just a SHUTDOWN w/o a
+ * LOG_IO_ERROR.
+ */
+ if (iclog->ic_state & XLOG_STATE_IOERROR) {
+ *ioerror = true;
+ return false;
+ }
+
+ /*
+ * Can only perform callbacks in order. Since this iclog is not in the
+ * DONE_SYNC/ DO_CALLBACK state, we skip the rest and just try to clean
+ * up. If we set our iclog to DO_CALLBACK, we will not process it when
+ * we retry since a previous iclog is in the CALLBACK and the state
+ * cannot change since we are holding the l_icloglock.
+ */
+ if (!(iclog->ic_state &
+ (XLOG_STATE_DONE_SYNC | XLOG_STATE_DO_CALLBACK))) {
+ if (completed_iclog &&
+ (completed_iclog->ic_state == XLOG_STATE_DONE_SYNC)) {
+ completed_iclog->ic_state = XLOG_STATE_DO_CALLBACK;
+ }
+ return true;
+ }
+
+ /*
+ * We now have an iclog that is in either the DO_CALLBACK or DONE_SYNC
+ * states. The other states (WANT_SYNC, SYNCING, or CALLBACK were caught
+ * by the above if and are going to clean (i.e. we aren't doing their
+ * callbacks) see the above if.
+ *
+ * We will do one more check here to see if we have chased our tail
+ * around. If this is not the lowest lsn iclog, then we will leave it
+ * for another completion to process.
+ */
+ header_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ lowest_lsn = xlog_get_lowest_lsn(log);
+ if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0)
+ return false;
+
+ xlog_state_set_callback(log, iclog, header_lsn);
+ return false;
+
+}
+
+/*
+ * Keep processing entries in the iclog callback list until we come around and
+ * it is empty. We need to atomically see that the list is empty and change the
+ * state to DIRTY so that we don't miss any more callbacks being added.
+ *
+ * This function is called with the icloglock held and returns with it held. We
+ * drop it while running callbacks, however, as holding it over thousands of
+ * callbacks is unnecessary and causes excessive contention if we do.
+ */
+static void
+xlog_state_do_iclog_callbacks(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ bool aborted)
+{
+ spin_unlock(&log->l_icloglock);
+ spin_lock(&iclog->ic_callback_lock);
+ while (!list_empty(&iclog->ic_callbacks)) {
+ LIST_HEAD(tmp);
+
+ list_splice_init(&iclog->ic_callbacks, &tmp);
+
+ spin_unlock(&iclog->ic_callback_lock);
+ xlog_cil_process_committed(&tmp, aborted);
+ spin_lock(&iclog->ic_callback_lock);
+ }
+
+ /*
+ * Pick up the icloglock while still holding the callback lock so we
+ * serialise against anyone trying to add more callbacks to this iclog
+ * now we've finished processing.
+ */
+ spin_lock(&log->l_icloglock);
+ spin_unlock(&iclog->ic_callback_lock);
+}
+
+#ifdef DEBUG
+/*
+ * Make one last gasp attempt to see if iclogs are being left in limbo. If the
+ * above loop finds an iclog earlier than the current iclog and in one of the
+ * syncing states, the current iclog is put into DO_CALLBACK and the callbacks
+ * are deferred to the completion of the earlier iclog. Walk the iclogs in order
+ * and make sure that no iclog is in DO_CALLBACK unless an earlier iclog is in
+ * one of the syncing states.
+ *
+ * Note that SYNCING|IOERROR is a valid state so we cannot just check for
+ * ic_state == SYNCING.
+ */
+static void
+xlog_state_callback_check_state(
+ struct xlog *log)
+{
+ struct xlog_in_core *first_iclog = log->l_iclog;
+ struct xlog_in_core *iclog = first_iclog;
+
+ do {
+ ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
+ /*
+ * Terminate the loop if iclogs are found in states
+ * which will cause other threads to clean up iclogs.
+ *
+ * SYNCING - i/o completion will go through logs
+ * DONE_SYNC - interrupt thread should be waiting for
+ * l_icloglock
+ * IOERROR - give up hope all ye who enter here
+ */
+ if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+ iclog->ic_state & XLOG_STATE_SYNCING ||
+ iclog->ic_state == XLOG_STATE_DONE_SYNC ||
+ iclog->ic_state == XLOG_STATE_IOERROR )
+ break;
+ iclog = iclog->ic_next;
+ } while (first_iclog != iclog);
+}
+#else
+#define xlog_state_callback_check_state(l) ((void)0)
+#endif
+
STATIC void
xlog_state_do_callback(
struct xlog *log,
bool aborted,
struct xlog_in_core *ciclog)
{
- xlog_in_core_t *iclog;
- xlog_in_core_t *first_iclog; /* used to know when we've
- * processed all iclogs once */
- int flushcnt = 0;
- xfs_lsn_t lowest_lsn;
- int ioerrors; /* counter: iclogs with errors */
- int loopdidcallbacks; /* flag: inner loop did callbacks*/
- int funcdidcallbacks; /* flag: function did callbacks */
- int repeats; /* for issuing console warnings if
- * looping too many times */
- int wake = 0;
+ struct xlog_in_core *iclog;
+ struct xlog_in_core *first_iclog;
+ bool did_callbacks = false;
+ bool cycled_icloglock;
+ bool ioerror;
+ int flushcnt = 0;
+ int repeats = 0;
spin_lock(&log->l_icloglock);
- first_iclog = iclog = log->l_iclog;
- ioerrors = 0;
- funcdidcallbacks = 0;
- repeats = 0;
-
do {
/*
* Scan all iclogs starting with the one pointed to by the
@@ -2641,137 +2860,34 @@ xlog_state_do_callback(
*/
first_iclog = log->l_iclog;
iclog = log->l_iclog;
- loopdidcallbacks = 0;
+ cycled_icloglock = false;
+ ioerror = false;
repeats++;
do {
+ if (xlog_state_iodone_process_iclog(log, iclog,
+ ciclog, &ioerror))
+ break;
- /* skip all iclogs in the ACTIVE & DIRTY states */
- if (iclog->ic_state &
- (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) {
+ if (!(iclog->ic_state &
+ (XLOG_STATE_CALLBACK | XLOG_STATE_IOERROR))) {
iclog = iclog->ic_next;
continue;
}
/*
- * Between marking a filesystem SHUTDOWN and stopping
- * the log, we do flush all iclogs to disk (if there
- * wasn't a log I/O error). So, we do want things to
- * go smoothly in case of just a SHUTDOWN w/o a
- * LOG_IO_ERROR.
+ * Running callbacks will drop the icloglock which means
+ * we'll have to run at least one more complete loop.
*/
- if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
- /*
- * Can only perform callbacks in order. Since
- * this iclog is not in the DONE_SYNC/
- * DO_CALLBACK state, we skip the rest and
- * just try to clean up. If we set our iclog
- * to DO_CALLBACK, we will not process it when
- * we retry since a previous iclog is in the
- * CALLBACK and the state cannot change since
- * we are holding the l_icloglock.
- */
- if (!(iclog->ic_state &
- (XLOG_STATE_DONE_SYNC |
- XLOG_STATE_DO_CALLBACK))) {
- if (ciclog && (ciclog->ic_state ==
- XLOG_STATE_DONE_SYNC)) {
- ciclog->ic_state = XLOG_STATE_DO_CALLBACK;
- }
- break;
- }
- /*
- * We now have an iclog that is in either the
- * DO_CALLBACK or DONE_SYNC states. The other
- * states (WANT_SYNC, SYNCING, or CALLBACK were
- * caught by the above if and are going to
- * clean (i.e. we aren't doing their callbacks)
- * see the above if.
- */
-
- /*
- * We will do one more check here to see if we
- * have chased our tail around.
- */
-
- lowest_lsn = xlog_get_lowest_lsn(log);
- if (lowest_lsn &&
- XFS_LSN_CMP(lowest_lsn,
- be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
- iclog = iclog->ic_next;
- continue; /* Leave this iclog for
- * another thread */
- }
-
- iclog->ic_state = XLOG_STATE_CALLBACK;
-
-
- /*
- * Completion of a iclog IO does not imply that
- * a transaction has completed, as transactions
- * can be large enough to span many iclogs. We
- * cannot change the tail of the log half way
- * through a transaction as this may be the only
- * transaction in the log and moving th etail to
- * point to the middle of it will prevent
- * recovery from finding the start of the
- * transaction. Hence we should only update the
- * last_sync_lsn if this iclog contains
- * transaction completion callbacks on it.
- *
- * We have to do this before we drop the
- * icloglock to ensure we are the only one that
- * can update it.
- */
- ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
- be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
- if (!list_empty_careful(&iclog->ic_callbacks))
- atomic64_set(&log->l_last_sync_lsn,
- be64_to_cpu(iclog->ic_header.h_lsn));
-
- } else
- ioerrors++;
-
- spin_unlock(&log->l_icloglock);
-
- /*
- * Keep processing entries in the callback list until
- * we come around and it is empty. We need to
- * atomically see that the list is empty and change the
- * state to DIRTY so that we don't miss any more
- * callbacks being added.
- */
- spin_lock(&iclog->ic_callback_lock);
- while (!list_empty(&iclog->ic_callbacks)) {
- LIST_HEAD(tmp);
-
- list_splice_init(&iclog->ic_callbacks, &tmp);
-
- spin_unlock(&iclog->ic_callback_lock);
- xlog_cil_process_committed(&tmp, aborted);
- spin_lock(&iclog->ic_callback_lock);
- }
-
- loopdidcallbacks++;
- funcdidcallbacks++;
-
- spin_lock(&log->l_icloglock);
- spin_unlock(&iclog->ic_callback_lock);
- if (!(iclog->ic_state & XLOG_STATE_IOERROR))
- iclog->ic_state = XLOG_STATE_DIRTY;
-
- /*
- * Transition from DIRTY to ACTIVE if applicable.
- * NOP if STATE_IOERROR.
- */
- xlog_state_clean_log(log);
-
- /* wake up threads waiting in xfs_log_force() */
- wake_up_all(&iclog->ic_force_wait);
+ cycled_icloglock = true;
+ xlog_state_do_iclog_callbacks(log, iclog, aborted);
+ xlog_state_clean_iclog(log, iclog);
iclog = iclog->ic_next;
} while (first_iclog != iclog);
+ did_callbacks |= cycled_icloglock;
+
if (repeats > 5000) {
flushcnt += repeats;
repeats = 0;
@@ -2779,50 +2895,15 @@ xlog_state_do_callback(
"%s: possible infinite loop (%d iterations)",
__func__, flushcnt);
}
- } while (!ioerrors && loopdidcallbacks);
+ } while (!ioerror && cycled_icloglock);
-#ifdef DEBUG
- /*
- * Make one last gasp attempt to see if iclogs are being left in limbo.
- * If the above loop finds an iclog earlier than the current iclog and
- * in one of the syncing states, the current iclog is put into
- * DO_CALLBACK and the callbacks are deferred to the completion of the
- * earlier iclog. Walk the iclogs in order and make sure that no iclog
- * is in DO_CALLBACK unless an earlier iclog is in one of the syncing
- * states.
- *
- * Note that SYNCING|IOABORT is a valid state so we cannot just check
- * for ic_state == SYNCING.
- */
- if (funcdidcallbacks) {
- first_iclog = iclog = log->l_iclog;
- do {
- ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
- /*
- * Terminate the loop if iclogs are found in states
- * which will cause other threads to clean up iclogs.
- *
- * SYNCING - i/o completion will go through logs
- * DONE_SYNC - interrupt thread should be waiting for
- * l_icloglock
- * IOERROR - give up hope all ye who enter here
- */
- if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
- iclog->ic_state & XLOG_STATE_SYNCING ||
- iclog->ic_state == XLOG_STATE_DONE_SYNC ||
- iclog->ic_state == XLOG_STATE_IOERROR )
- break;
- iclog = iclog->ic_next;
- } while (first_iclog != iclog);
- }
-#endif
+ if (did_callbacks)
+ xlog_state_callback_check_state(log);
if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR))
- wake = 1;
- spin_unlock(&log->l_icloglock);
-
- if (wake)
wake_up_all(&log->l_flush_wait);
+
+ spin_unlock(&log->l_icloglock);
}
@@ -3922,7 +4003,9 @@ xfs_log_force_umount(
* item committed callback functions will do this again under lock to
* avoid races.
*/
+ spin_lock(&log->l_cilp->xc_push_lock);
wake_up_all(&log->l_cilp->xc_commit_wait);
+ spin_unlock(&log->l_cilp->xc_push_lock);
xlog_state_do_callback(log, true, NULL);
#ifdef XFSERRORDEBUG
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index fa5602d0fd7f..ef652abd112c 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -38,7 +38,7 @@ xlog_cil_ticket_alloc(
struct xlog_ticket *tic;
tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
- KM_SLEEP|KM_NOFS);
+ KM_NOFS);
/*
* set the current reservation to zero so we know to steal the basic
@@ -186,7 +186,7 @@ xlog_cil_alloc_shadow_bufs(
*/
kmem_free(lip->li_lv_shadow);
- lv = kmem_alloc_large(buf_size, KM_SLEEP | KM_NOFS);
+ lv = kmem_alloc_large(buf_size, KM_NOFS);
memset(lv, 0, xlog_cil_iovec_space(niovecs));
lv->lv_item = lip;
@@ -660,7 +660,7 @@ xlog_cil_push(
if (!cil)
return 0;
- new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
+ new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS);
new_ctx->ticket = xlog_cil_ticket_alloc(log);
down_write(&cil->xc_ctx_lock);
@@ -1179,11 +1179,11 @@ xlog_cil_init(
struct xfs_cil *cil;
struct xfs_cil_ctx *ctx;
- cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
+ cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
if (!cil)
return -ENOMEM;
- ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
+ ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL);
if (!ctx) {
kmem_free(cil);
return -ENOMEM;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 13d1d3e95b88..508319039dce 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -97,6 +97,8 @@ xlog_alloc_buffer(
struct xlog *log,
int nbblks)
{
+ int align_mask = xfs_buftarg_dma_alignment(log->l_targ);
+
/*
* Pass log block 0 since we don't have an addr yet, buffer will be
* verified on read.
@@ -125,7 +127,7 @@ xlog_alloc_buffer(
if (nbblks > 1 && log->l_sectBBsize > 1)
nbblks += log->l_sectBBsize;
nbblks = round_up(nbblks, log->l_sectBBsize);
- return kmem_alloc_large(BBTOB(nbblks), KM_MAYFAIL);
+ return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL);
}
/*
@@ -1960,7 +1962,7 @@ xlog_recover_buffer_pass1(
}
}
- bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
+ bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
bcp->bc_blkno = buf_f->blf_blkno;
bcp->bc_len = buf_f->blf_len;
bcp->bc_refcount = 1;
@@ -2930,7 +2932,7 @@ xlog_recover_inode_pass2(
if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
in_f = item->ri_buf[0].i_addr;
} else {
- in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), KM_SLEEP);
+ in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
need_free = 1;
error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
if (error)
@@ -4161,7 +4163,7 @@ xlog_recover_add_item(
{
xlog_recover_item_t *item;
- item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
+ item = kmem_zalloc(sizeof(xlog_recover_item_t), 0);
INIT_LIST_HEAD(&item->ri_list);
list_add_tail(&item->ri_list, head);
}
@@ -4201,7 +4203,7 @@ xlog_recover_add_to_cont_trans(
old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
old_len = item->ri_buf[item->ri_cnt-1].i_len;
- ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP);
+ ptr = kmem_realloc(old_ptr, len + old_len, 0);
memcpy(&ptr[old_len], dp, len);
item->ri_buf[item->ri_cnt-1].i_len += len;
item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -4261,7 +4263,7 @@ xlog_recover_add_to_trans(
return 0;
}
- ptr = kmem_alloc(len, KM_SLEEP);
+ ptr = kmem_alloc(len, 0);
memcpy(ptr, dp, len);
in_f = (struct xfs_inode_log_format *)ptr;
@@ -4289,7 +4291,7 @@ xlog_recover_add_to_trans(
item->ri_total = in_f->ilf_size;
item->ri_buf =
kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
- KM_SLEEP);
+ 0);
}
ASSERT(item->ri_total > item->ri_cnt);
/* Description region is ri_buf[0] */
@@ -4423,7 +4425,7 @@ xlog_recover_ophdr_to_trans(
* This is a new transaction so allocate a new recovery container to
* hold the recovery ops that will follow.
*/
- trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP);
+ trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
trans->r_log_tid = tid;
trans->r_lsn = be64_to_cpu(rhead->h_lsn);
INIT_LIST_HEAD(&trans->r_itemq);
@@ -5022,16 +5024,27 @@ xlog_recover_process_one_iunlink(
}
/*
- * xlog_iunlink_recover
+ * Recover AGI unlinked lists
+ *
+ * This is called during recovery to process any inodes which we unlinked but
+ * not freed when the system crashed. These inodes will be on the lists in the
+ * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
+ * any inodes found on the lists. Each inode is removed from the lists when it
+ * has been fully truncated and is freed. The freeing of the inode and its
+ * removal from the list must be atomic.
+ *
+ * If everything we touch in the agi processing loop is already in memory, this
+ * loop can hold the cpu for a long time. It runs without lock contention,
+ * memory allocation contention, the need wait for IO, etc, and so will run
+ * until we either run out of inodes to process, run low on memory or we run out
+ * of log space.
*
- * This is called during recovery to process any inodes which
- * we unlinked but not freed when the system crashed. These
- * inodes will be on the lists in the AGI blocks. What we do
- * here is scan all the AGIs and fully truncate and free any
- * inodes found on the lists. Each inode is removed from the
- * lists when it has been fully truncated and is freed. The
- * freeing of the inode and its removal from the list must be
- * atomic.
+ * This behaviour is bad for latency on single CPU and non-preemptible kernels,
+ * and can prevent other filesytem work (such as CIL pushes) from running. This
+ * can lead to deadlocks if the recovery process runs out of log reservation
+ * space. Hence we need to yield the CPU when there is other kernel work
+ * scheduled on this CPU to ensure other scheduled work can run without undue
+ * latency.
*/
STATIC void
xlog_recover_process_iunlinks(
@@ -5078,6 +5091,7 @@ xlog_recover_process_iunlinks(
while (agino != NULLAGINO) {
agino = xlog_recover_process_one_iunlink(mp,
agno, agino, bucket);
+ cond_resched();
}
}
xfs_buf_rele(agibp);
@@ -5527,7 +5541,7 @@ xlog_do_log_recovery(
*/
log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
sizeof(struct list_head),
- KM_SLEEP);
+ 0);
for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 322da6909290..ba5b6f3b2b88 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -82,7 +82,7 @@ xfs_uuid_mount(
if (hole < 0) {
xfs_uuid_table = kmem_realloc(xfs_uuid_table,
(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
- KM_SLEEP);
+ 0);
hole = xfs_uuid_table_size++;
}
xfs_uuid_table[hole] = *uuid;
@@ -214,7 +214,7 @@ xfs_initialize_perag(
spin_lock(&mp->m_perag_lock);
if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
- BUG();
+ WARN_ON_ONCE(1);
spin_unlock(&mp->m_perag_lock);
radix_tree_preload_end();
error = -EEXIST;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 4adb6837439a..fdb60e09a9c5 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -327,13 +327,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
}
/* per-AG block reservation data structures*/
-enum xfs_ag_resv_type {
- XFS_AG_RESV_NONE = 0,
- XFS_AG_RESV_AGFL,
- XFS_AG_RESV_METADATA,
- XFS_AG_RESV_RMAPBT,
-};
-
struct xfs_ag_resv {
/* number of blocks originally reserved here */
xfs_extlen_t ar_orig_reserved;
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 74738813f60d..a06661dac5be 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -333,12 +333,12 @@ xfs_mru_cache_create(
if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
return -EINVAL;
- if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP)))
+ if (!(mru = kmem_zalloc(sizeof(*mru), 0)))
return -ENOMEM;
/* An extra list is needed to avoid reaping up to a grp_time early. */
mru->grp_count = grp_count + 1;
- mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP);
+ mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0);
if (!mru->lists) {
err = -ENOMEM;
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 0c954cad7449..a339bd5fa260 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -32,7 +32,7 @@ xfs_break_leased_layouts(
struct xfs_inode *ip = XFS_I(inode);
int error;
- while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
+ while ((error = break_layout(inode, false)) == -EWOULDBLOCK) {
xfs_iunlock(ip, *iolock);
*did_unlock = true;
error = break_layout(inode, true);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5e7a37f0cf84..ecd8ce152ab1 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -642,7 +642,7 @@ xfs_qm_init_quotainfo(
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
- qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
+ qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), 0);
error = list_lru_init(&qinf->qi_lru);
if (error)
@@ -978,7 +978,7 @@ xfs_qm_reset_dqcounts_buf(
if (qip->i_d.di_nblocks == 0)
return 0;
- map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP);
+ map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0);
lblkno = 0;
maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index d8288aa0670a..2328268e6245 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -144,9 +144,9 @@ xfs_cui_init(
ASSERT(nextents > 0);
if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
- KM_SLEEP);
+ 0);
else
- cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP);
+ cuip = kmem_zone_zalloc(xfs_cui_zone, 0);
xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
cuip->cui_format.cui_nextents = nextents;
@@ -223,7 +223,7 @@ xfs_trans_get_cud(
{
struct xfs_cud_log_item *cudp;
- cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP);
+ cudp = kmem_zone_zalloc(xfs_cud_zone, 0);
xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
&xfs_cud_item_ops);
cudp->cud_cuip = cuip;
@@ -555,26 +555,24 @@ xfs_cui_recover(
irec.br_blockcount = new_len;
switch (type) {
case XFS_REFCOUNT_INCREASE:
- error = xfs_refcount_increase_extent(tp, &irec);
+ xfs_refcount_increase_extent(tp, &irec);
break;
case XFS_REFCOUNT_DECREASE:
- error = xfs_refcount_decrease_extent(tp, &irec);
+ xfs_refcount_decrease_extent(tp, &irec);
break;
case XFS_REFCOUNT_ALLOC_COW:
- error = xfs_refcount_alloc_cow_extent(tp,
+ xfs_refcount_alloc_cow_extent(tp,
irec.br_startblock,
irec.br_blockcount);
break;
case XFS_REFCOUNT_FREE_COW:
- error = xfs_refcount_free_cow_extent(tp,
+ xfs_refcount_free_cow_extent(tp,
irec.br_startblock,
irec.br_blockcount);
break;
default:
ASSERT(0);
}
- if (error)
- goto abort_error;
requeue_only = true;
}
}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index c4ec7afd1170..0f08153b4994 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -495,10 +495,8 @@ xfs_reflink_cancel_cow_blocks(
ASSERT((*tpp)->t_firstblock == NULLFSBLOCK);
/* Free the CoW orphan record. */
- error = xfs_refcount_free_cow_extent(*tpp,
- del.br_startblock, del.br_blockcount);
- if (error)
- break;
+ xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
+ del.br_blockcount);
xfs_bmap_add_free(*tpp, del.br_startblock,
del.br_blockcount, NULL);
@@ -675,15 +673,10 @@ xfs_reflink_end_cow_extent(
trace_xfs_reflink_cow_remap(ip, &del);
/* Free the CoW orphan record. */
- error = xfs_refcount_free_cow_extent(tp, del.br_startblock,
- del.br_blockcount);
- if (error)
- goto out_cancel;
+ xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
/* Map the new blocks into the data fork. */
- error = xfs_bmap_map_extent(tp, ip, &del);
- if (error)
- goto out_cancel;
+ xfs_bmap_map_extent(tp, ip, &del);
/* Charge this new data fork mapping to the on-disk quota. */
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
@@ -1070,14 +1063,10 @@ xfs_reflink_remap_extent(
uirec.br_blockcount, uirec.br_startblock);
/* Update the refcount tree */
- error = xfs_refcount_increase_extent(tp, &uirec);
- if (error)
- goto out_cancel;
+ xfs_refcount_increase_extent(tp, &uirec);
/* Map the new blocks into the data fork. */
- error = xfs_bmap_map_extent(tp, ip, &uirec);
- if (error)
- goto out_cancel;
+ xfs_bmap_map_extent(tp, ip, &uirec);
/* Update quota accounting. */
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
@@ -1190,11 +1179,11 @@ xfs_reflink_remap_blocks(
}
/*
- * Grab the exclusive iolock for a data copy from src to dest, making
- * sure to abide vfs locking order (lowest pointer value goes first) and
- * breaking the pnfs layout leases on dest before proceeding. The loop
- * is needed because we cannot call the blocking break_layout() with the
- * src iolock held, and therefore have to back out both locks.
+ * Grab the exclusive iolock for a data copy from src to dest, making sure to
+ * abide vfs locking order (lowest pointer value goes first) and breaking the
+ * layout leases before proceeding. The loop is needed because we cannot call
+ * the blocking break_layout() with the iolocks held, and therefore have to
+ * back out both locks.
*/
static int
xfs_iolock_two_inodes_and_break_layout(
@@ -1203,33 +1192,44 @@ xfs_iolock_two_inodes_and_break_layout(
{
int error;
-retry:
- if (src < dest) {
- inode_lock_shared(src);
- inode_lock_nested(dest, I_MUTEX_NONDIR2);
- } else {
- /* src >= dest */
- inode_lock(dest);
- }
+ if (src > dest)
+ swap(src, dest);
- error = break_layout(dest, false);
- if (error == -EWOULDBLOCK) {
- inode_unlock(dest);
- if (src < dest)
- inode_unlock_shared(src);
+retry:
+ /* Wait to break both inodes' layouts before we start locking. */
+ error = break_layout(src, true);
+ if (error)
+ return error;
+ if (src != dest) {
error = break_layout(dest, true);
if (error)
return error;
- goto retry;
}
+
+ /* Lock one inode and make sure nobody got in and leased it. */
+ inode_lock(src);
+ error = break_layout(src, false);
+ if (error) {
+ inode_unlock(src);
+ if (error == -EWOULDBLOCK)
+ goto retry;
+ return error;
+ }
+
+ if (src == dest)
+ return 0;
+
+ /* Lock the other inode and make sure nobody got in and leased it. */
+ inode_lock_nested(dest, I_MUTEX_NONDIR2);
+ error = break_layout(dest, false);
if (error) {
+ inode_unlock(src);
inode_unlock(dest);
- if (src < dest)
- inode_unlock_shared(src);
+ if (error == -EWOULDBLOCK)
+ goto retry;
return error;
}
- if (src > dest)
- inode_lock_shared_nested(src, I_MUTEX_NONDIR2);
+
return 0;
}
@@ -1247,10 +1247,10 @@ xfs_reflink_remap_unlock(
xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
if (!same_inode)
- xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
+ xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
inode_unlock(inode_out);
if (!same_inode)
- inode_unlock_shared(inode_in);
+ inode_unlock(inode_in);
}
/*
@@ -1325,7 +1325,7 @@ xfs_reflink_remap_prep(
if (same_inode)
xfs_ilock(src, XFS_MMAPLOCK_EXCL);
else
- xfs_lock_two_inodes(src, XFS_MMAPLOCK_SHARED, dest,
+ xfs_lock_two_inodes(src, XFS_MMAPLOCK_EXCL, dest,
XFS_MMAPLOCK_EXCL);
/* Check file eligibility and prepare for block sharing. */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 77ed557b6127..8939e0ea09cd 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -142,9 +142,9 @@ xfs_rui_init(
ASSERT(nextents > 0);
if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
- ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP);
+ ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
else
- ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP);
+ ruip = kmem_zone_zalloc(xfs_rui_zone, 0);
xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
ruip->rui_format.rui_nextents = nextents;
@@ -244,7 +244,7 @@ xfs_trans_get_rud(
{
struct xfs_rud_log_item *rudp;
- rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP);
+ rudp = kmem_zone_zalloc(xfs_rud_zone, 0);
xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
&xfs_rud_item_ops);
rudp->rud_ruip = ruip;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 5fa4db3c3e32..4a48a8c75b4f 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -865,7 +865,7 @@ xfs_alloc_rsum_cache(
* lower bound on the minimum level with any free extents. We can
* continue without the cache if it couldn't be allocated.
*/
- mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, KM_SLEEP);
+ mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0);
if (!mp->m_rsum_cache)
xfs_warn(mp, "could not allocate realtime summary cache");
}
@@ -963,7 +963,7 @@ xfs_growfs_rt(
/*
* Allocate a new (fake) mount/sb.
*/
- nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP);
+ nmp = kmem_alloc(sizeof(*nmp), 0);
/*
* Loop over the bitmap blocks.
* We will do everything one bitmap block at a time.
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f9450235533c..8d1df9f8be07 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -818,7 +818,8 @@ xfs_init_mount_workqueues(
goto out_destroy_buf;
mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND,
+ 0, mp->m_fsname);
if (!mp->m_cil_workqueue)
goto out_destroy_unwritten;
@@ -1663,6 +1664,8 @@ xfs_fs_fill_super(
sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
sb->s_max_links = XFS_MAXLINK;
sb->s_time_gran = 1;
+ sb->s_time_min = S32_MIN;
+ sb->s_time_max = S32_MAX;
sb->s_iflags |= SB_I_CGROUPWB;
set_posix_acl_flag(sb);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 8094b1920eef..eaae275ed430 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -23,6 +23,7 @@ struct xlog;
struct xlog_ticket;
struct xlog_recover;
struct xlog_recover_item;
+struct xlog_rec_header;
struct xfs_buf_log_format;
struct xfs_inode_log_format;
struct xfs_bmbt_irec;
@@ -30,6 +31,10 @@ struct xfs_btree_cur;
struct xfs_refcount_irec;
struct xfs_fsmap;
struct xfs_rmap_irec;
+struct xfs_icreate_log;
+struct xfs_owner_info;
+struct xfs_trans_res;
+struct xfs_inobt_rec_incore;
DECLARE_EVENT_CLASS(xfs_attr_list_class,
TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -3575,6 +3580,35 @@ TRACE_EVENT(xfs_pwork_init,
__entry->nr_threads, __entry->pid)
)
+DECLARE_EVENT_CLASS(xfs_kmem_class,
+ TP_PROTO(ssize_t size, int flags, unsigned long caller_ip),
+ TP_ARGS(size, flags, caller_ip),
+ TP_STRUCT__entry(
+ __field(ssize_t, size)
+ __field(int, flags)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->size = size;
+ __entry->flags = flags;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("size %zd flags 0x%x caller %pS",
+ __entry->size,
+ __entry->flags,
+ (char *)__entry->caller_ip)
+)
+
+#define DEFINE_KMEM_EVENT(name) \
+DEFINE_EVENT(xfs_kmem_class, name, \
+ TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \
+ TP_ARGS(size, flags, caller_ip))
+DEFINE_KMEM_EVENT(kmem_alloc);
+DEFINE_KMEM_EVENT(kmem_alloc_io);
+DEFINE_KMEM_EVENT(kmem_alloc_large);
+DEFINE_KMEM_EVENT(kmem_realloc);
+DEFINE_KMEM_EVENT(kmem_zone_alloc);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index d42a68d8313b..f4795fdb7389 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -90,7 +90,7 @@ xfs_trans_dup(
trace_xfs_trans_dup(tp, _RET_IP_);
- ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+ ntp = kmem_zone_zalloc(xfs_trans_zone, 0);
/*
* Initialize the new transaction structure.
@@ -263,7 +263,7 @@ xfs_trans_alloc(
* GFP_NOFS allocation context so that we avoid lockdep false positives
* by doing GFP_KERNEL allocations inside sb_start_intwrite().
*/
- tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+ tp = kmem_zone_zalloc(xfs_trans_zone, 0);
if (!(flags & XFS_TRANS_NO_WRITECOUNT))
sb_start_intwrite(mp->m_super);
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 1027c9ca6eb8..16457465833b 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -863,7 +863,7 @@ STATIC void
xfs_trans_alloc_dqinfo(
xfs_trans_t *tp)
{
- tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP);
+ tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0);
}
void
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 3123b5aaad2a..cb895b1df5e4 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -30,7 +30,7 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
value = NULL;
}
- error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
+ error = xfs_attr_get(ip, name, (unsigned char **)&value, &asize, xflags);
if (error)
return error;
return asize;