diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-03 04:21:40 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-03 04:21:40 +0200 |
commit | 16d91548d1057691979de4686693f0ff92f46000 (patch) | |
tree | 0ab23001a138badebf7702416753c44e277e0ff7 /fs | |
parent | Merge branch 'next-general' of git://git.kernel.org/pub/scm/linux/kernel/git/... (diff) | |
parent | xfs: more lockdep whackamole with kmem_alloc* (diff) | |
download | linux-16d91548d1057691979de4686693f0ff92f46000.tar.xz linux-16d91548d1057691979de4686693f0ff92f46000.zip |
Merge tag 'xfs-5.8-merge-8' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
Pull xfs updates from Darrick Wong:
"Most of the changes this cycle are refactoring of existing code in
preparation for things landing in the future.
We also fixed various problems and deficiencies in the quota
implementation, and (I hope) the last of the stale read vectors by
forcing write allocations to go through the unwritten state until the
write completes.
Summary:
- Various cleanups to remove dead code, unnecessary conditionals,
asserts, etc.
- Fix a linker warning caused by xfs stuffing '-g' into CFLAGS
redundantly.
- Tighten up our dmesg logging to ensure that everything is prefixed
with 'XFS' for easier grepping.
- Kill a bunch of typedefs.
- Refactor the deferred ops code to reduce indirect function calls.
- Increase type-safety with the deferred ops code.
- Make the DAX mount options a tri-state.
- Fix some error handling problems in the inode flush code and clean
up other inode flush warts.
- Refactor log recovery so that each log item recovery functions now
live with the other log item processing code.
- Fix some SPDX forms.
- Fix quota counter corruption if the fs crashes after running
quotacheck but before any dquots get logged.
- Don't fail metadata verification on zero-entry attr leaf blocks,
since they're just part of the disk format now due to a historic
lack of log atomicity.
- Don't allow SWAPEXT between files with different [ugp]id when
quotas are enabled.
- Refactor inode fork reading and verification to run directly from
the inode-from-disk function. This means that we now actually
guarantee that _iget'ted inodes are totally verified and ready to
go.
- Move the incore inode fork format and extent counts to the ifork
structure.
- Scalability improvements by reducing cacheline pingponging in
struct xfs_mount.
- More scalability improvements by removing m_active_trans from the
hot path.
- Fix inode counter update sanity checking to run /only/ on debug
kernels.
- Fix longstanding inconsistency in what error code we return when a
program hits project quota limits (ENOSPC).
- Fix group quota returning the wrong error code when a program hits
group quota limits.
- Fix per-type quota limits and grace periods for group and project
quotas so that they actually work.
- Allow extension of individual grace periods.
- Refactor the non-reclaim inode radix tree walking code to remove a
bunch of stupid little functions and straighten out the
inconsistent naming schemes.
- Fix a bug in speculative preallocation where we measured a new
allocation based on the last extent mapping in the file instead of
looking farther for the last contiguous space allocation.
- Force delalloc writes to unwritten extents. This closes a stale
disk contents exposure vector if the system goes down before the
write completes.
- More lockdep whackamole"
* tag 'xfs-5.8-merge-8' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (129 commits)
xfs: more lockdep whackamole with kmem_alloc*
xfs: force writes to delalloc regions to unwritten
xfs: refactor xfs_iomap_prealloc_size
xfs: measure all contiguous previous extents for prealloc size
xfs: don't fail unwritten extent conversion on writeback due to edquot
xfs: rearrange xfs_inode_walk_ag parameters
xfs: straighten out all the naming around incore inode tree walks
xfs: move xfs_inode_ag_iterator to be closer to the perag walking code
xfs: use bool for done in xfs_inode_ag_walk
xfs: fix inode ag walk predicate function return values
xfs: refactor eofb matching into a single helper
xfs: remove __xfs_icache_free_eofblocks
xfs: remove flags argument from xfs_inode_ag_walk
xfs: remove xfs_inode_ag_iterator_flags
xfs: remove unused xfs_inode_ag_iterator function
xfs: replace open-coded XFS_ICI_NO_TAG
xfs: move eofblocks conversion function to xfs_ioctl.c
xfs: allow individual quota grace period extension
xfs: per-type quota timers and warn limits
xfs: switch xfs_get_defquota to take explicit type
...
Diffstat (limited to 'fs')
101 files changed, 4241 insertions, 4814 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 4f95df476181..04611a1068b4 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -7,8 +7,6 @@ ccflags-y += -I $(srctree)/$(src) # needed for trace events ccflags-y += -I $(srctree)/$(src)/libxfs -ccflags-$(CONFIG_XFS_DEBUG) += -g - obj-$(CONFIG_XFS_FS) += xfs.o # this one should be compiled first, as the tracing macros can easily blow up @@ -101,9 +99,12 @@ xfs-y += xfs_log.o \ xfs_log_cil.o \ xfs_bmap_item.o \ xfs_buf_item.o \ + xfs_buf_item_recover.o \ + xfs_dquot_item_recover.o \ xfs_extfree_item.o \ xfs_icreate_item.o \ xfs_inode_item.o \ + xfs_inode_item_recover.o \ xfs_refcount_item.o \ xfs_rmap_item.o \ xfs_log_recover.o \ diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 6143117770e9..34cbcfde9228 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. @@ -19,6 +19,7 @@ typedef unsigned __bitwise xfs_km_flags_t; #define KM_NOFS ((__force xfs_km_flags_t)0x0004u) #define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u) #define KM_ZERO ((__force xfs_km_flags_t)0x0010u) +#define KM_NOLOCKDEP ((__force xfs_km_flags_t)0x0020u) /* * We use a special process flag to avoid recursive callbacks into @@ -30,7 +31,7 @@ kmem_flags_convert(xfs_km_flags_t flags) { gfp_t lflags; - BUG_ON(flags & ~(KM_NOFS|KM_MAYFAIL|KM_ZERO)); + BUG_ON(flags & ~(KM_NOFS | KM_MAYFAIL | KM_ZERO | KM_NOLOCKDEP)); lflags = GFP_KERNEL | __GFP_NOWARN; if (flags & KM_NOFS) @@ -49,6 +50,9 @@ kmem_flags_convert(xfs_km_flags_t flags) if (flags & KM_ZERO) lflags |= __GFP_ZERO; + if (flags & KM_NOLOCKDEP) + lflags |= __GFP_NOLOCKDEP; + return lflags; } diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h index c0352edc8e41..f3fd0ee9a7f7 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.h +++ b/fs/xfs/libxfs/xfs_ag_resv.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Copyright (C) 2016 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index a851bf77f17b..6c22b12176b8 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index 047f09f0be3c..a5b998e950fe 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index e4fe3dca9883..3b1bd6e112f8 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -61,8 +61,8 @@ xfs_inode_hasattr( struct xfs_inode *ip) { if (!XFS_IFORK_Q(ip) || - (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_anextents == 0)) + (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_afp->if_nextents == 0)) return 0; return 1; } @@ -84,7 +84,7 @@ xfs_attr_get_ilocked( if (!xfs_inode_hasattr(args->dp)) return -ENOATTR; - if (args->dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) + if (args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_getvalue(args); if (xfs_bmap_one_block(args->dp, XFS_ATTR_FORK)) return xfs_attr_leaf_get(args); @@ -212,14 +212,14 @@ xfs_attr_set_args( * If the attribute list is non-existent or a shortform list, * upgrade it to a single-leaf-block attribute list. */ - if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { + if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL || + (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && + dp->i_afp->if_nextents == 0)) { /* * Build initial attribute list (if required). */ - if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) + if (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS) xfs_attr_shortform_create(args); /* @@ -272,7 +272,7 @@ xfs_attr_remove_args( if (!xfs_inode_hasattr(dp)) { error = -ENOATTR; - } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + } else if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) { ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); error = xfs_attr_shortform_remove(args); } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 0d2d05908537..db4717657ca1 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 863444e2dda7..2f7e89e4be3e 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -309,14 +309,6 @@ xfs_attr3_leaf_verify( return fa; /* - * In recovery there is a transient state where count == 0 is valid - * because we may have transitioned an empty shortform attr to a leaf - * if the attr didn't fit in shortform. - */ - if (!xfs_log_in_recovery(mp) && ichdr.count == 0) - return __this_address; - - /* * firstused is the block offset of the first name info structure. * Make sure it doesn't go off the block or crash into the header. */ @@ -331,6 +323,13 @@ xfs_attr3_leaf_verify( (char *)bp->b_addr + ichdr.firstused) return __this_address; + /* + * NOTE: This verifier historically failed empty leaf buffers because + * we expect the fork to be in another format. Empty attr fork format + * conversions are possible during xattr set, however, and format + * conversion is not atomic with the xattr set that triggers it. We + * cannot assume leaf blocks are non-empty until that is addressed. + */ buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize; for (i = 0, ent = entries; i < ichdr.count; ent++, i++) { fa = xfs_attr3_leaf_verify_entry(mp, buf_end, leaf, &ichdr, @@ -489,7 +488,7 @@ xfs_attr_copy_value( } if (!args->value) { - args->value = kmem_alloc_large(valuelen, 0); + args->value = kmem_alloc_large(valuelen, KM_NOLOCKDEP); if (!args->value) return -ENOMEM; } @@ -539,7 +538,7 @@ xfs_attr_shortform_bytesfit( /* rounded down */ offset = (XFS_LITINO(mp) - bytes) >> 3; - if (dp->i_d.di_format == XFS_DINODE_FMT_DEV) { + if (dp->i_df.if_format == XFS_DINODE_FMT_DEV) { minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; return (offset >= minforkoff) ? minforkoff : 0; } @@ -567,7 +566,7 @@ xfs_attr_shortform_bytesfit( dsize = dp->i_df.if_bytes; - switch (dp->i_d.di_format) { + switch (dp->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: /* * If there is no attr fork and the data fork is extents, @@ -636,22 +635,19 @@ xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp) * Create the initial contents of a shortform attribute list. */ void -xfs_attr_shortform_create(xfs_da_args_t *args) +xfs_attr_shortform_create( + struct xfs_da_args *args) { - xfs_attr_sf_hdr_t *hdr; - xfs_inode_t *dp; - struct xfs_ifork *ifp; + struct xfs_inode *dp = args->dp; + struct xfs_ifork *ifp = dp->i_afp; + struct xfs_attr_sf_hdr *hdr; trace_xfs_attr_sf_create(args); - dp = args->dp; - ASSERT(dp != NULL); - ifp = dp->i_afp; - ASSERT(ifp != NULL); ASSERT(ifp->if_bytes == 0); - if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) { + if (ifp->if_format == XFS_DINODE_FMT_EXTENTS) { ifp->if_flags &= ~XFS_IFEXTENTS; /* just in case */ - dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL; + ifp->if_format = XFS_DINODE_FMT_LOCAL; ifp->if_flags |= XFS_IFINLINE; } else { ASSERT(ifp->if_flags & XFS_IFINLINE); @@ -719,13 +715,12 @@ xfs_attr_fork_remove( struct xfs_inode *ip, struct xfs_trans *tp) { - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - ip->i_d.di_forkoff = 0; - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - - ASSERT(ip->i_d.di_anextents == 0); - ASSERT(ip->i_afp == NULL); + ASSERT(ip->i_afp->if_nextents == 0); + xfs_idestroy_fork(ip->i_afp); + kmem_cache_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + ip->i_d.di_forkoff = 0; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -775,7 +770,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) totsize -= size; if (totsize == sizeof(xfs_attr_sf_hdr_t) && (mp->m_flags & XFS_MOUNT_ATTR2) && - (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && + (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & XFS_DA_OP_ADDNAME)) { xfs_attr_fork_remove(dp, args->trans); } else { @@ -785,7 +780,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || (args->op_flags & XFS_DA_OP_ADDNAME) || !(mp->m_flags & XFS_MOUNT_ATTR2) || - dp->i_d.di_format == XFS_DINODE_FMT_BTREE); + dp->i_df.if_format == XFS_DINODE_FMT_BTREE); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } @@ -962,7 +957,7 @@ xfs_attr_shortform_allfit( + be16_to_cpu(name_loc->valuelen); } if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && - (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && + (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) return -1; return xfs_attr_shortform_bytesfit(dp, bytes); @@ -981,7 +976,7 @@ xfs_attr_shortform_verify( int i; int64_t size; - ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL); + ASSERT(ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL); ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK); sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data; size = ifp->if_bytes; @@ -1085,7 +1080,7 @@ xfs_attr3_leaf_to_shortform( if (forkoff == -1) { ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); - ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); + ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); xfs_attr_fork_remove(dp, args->trans); goto out; } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 6dd2d937a42a..5be6be309302 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 6fb4572845ce..e1144f22b005 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index aafa4fe70624..bb004fb7944a 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h index 99017b8df292..a04f266ae644 100644 --- a/fs/xfs/libxfs/xfs_bit.h +++ b/fs/xfs/libxfs/xfs_bit.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index fda13cd7add0..667cdd0dfdf4 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -61,10 +61,10 @@ xfs_bmap_compute_maxlevels( int sz; /* root block size */ /* - * The maximum number of extents in a file, hence the maximum - * number of leaf entries, is controlled by the type of di_nextents - * (a signed 32-bit number, xfs_extnum_t), or by di_anextents - * (a signed 16-bit number, xfs_aextnum_t). + * The maximum number of extents in a file, hence the maximum number of + * leaf entries, is controlled by the size of the on-disk extent count, + * either a signed 32-bit number for the data fork, or a signed 16-bit + * number for the attr fork. * * Note that we can no longer assume that if we are in ATTR1 that * the fork offset of all the inodes will be @@ -120,10 +120,11 @@ xfs_bmbt_lookup_first( */ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) { + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + return whichfork != XFS_COW_FORK && - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) > - XFS_IFORK_MAXEXT(ip, whichfork); + ifp->if_format == XFS_DINODE_FMT_EXTENTS && + ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork); } /* @@ -131,10 +132,11 @@ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) */ static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) { + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + return whichfork != XFS_COW_FORK && - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && - XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork); + ifp->if_format == XFS_DINODE_FMT_BTREE && + ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork); } /* @@ -213,8 +215,8 @@ xfs_bmap_forkoff_reset( int whichfork) { if (whichfork == XFS_ATTR_FORK && - ip->i_d.di_format != XFS_DINODE_FMT_DEV && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { + ip->i_df.if_format != XFS_DINODE_FMT_DEV && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE) { uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; if (dfl_forkoff > ip->i_d.di_forkoff) @@ -315,31 +317,28 @@ xfs_bmap_check_leaf_extents( xfs_inode_t *ip, /* incore inode pointer */ int whichfork) /* data or attr fork */ { + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_btree_block *block; /* current btree block */ xfs_fsblock_t bno; /* block # of "block" */ xfs_buf_t *bp; /* buffer for "block" */ int error; /* error return value */ xfs_extnum_t i=0, j; /* index into the extents list */ - struct xfs_ifork *ifp; /* fork structure */ int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ __be64 *pp; /* pointer to block address */ xfs_bmbt_rec_t *ep; /* pointer to current extent */ xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ xfs_bmbt_rec_t *nextp; /* pointer to next extent */ int bp_release = 0; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { + if (ifp->if_format != XFS_DINODE_FMT_BTREE) return; - } /* skip large extent count inodes */ - if (ip->i_d.di_nextents > 10000) + if (ip->i_df.if_nextents > 10000) return; bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); block = ifp->if_broot; /* * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. @@ -604,7 +603,7 @@ xfs_bmap_btree_to_extents( ASSERT(cur); ASSERT(whichfork != XFS_COW_FORK); ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); ASSERT(be16_to_cpu(rblock->bb_level) == 1); ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); @@ -632,7 +631,7 @@ xfs_bmap_btree_to_extents( xfs_iroot_realloc(ip, -1, whichfork); ASSERT(ifp->if_broot == NULL); ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork); return 0; } @@ -668,7 +667,7 @@ xfs_bmap_extents_to_btree( mp = ip->i_mount; ASSERT(whichfork != XFS_COW_FORK); ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); + ASSERT(ifp->if_format == XFS_DINODE_FMT_EXTENTS); /* * Make space in the inode incore. This needs to be undone if we fail @@ -692,7 +691,7 @@ xfs_bmap_extents_to_btree( /* * Convert to a btree with two levels, one record in root. */ - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); + ifp->if_format = XFS_DINODE_FMT_BTREE; memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = mp; @@ -750,7 +749,7 @@ xfs_bmap_extents_to_btree( xfs_bmbt_disk_set_all(arp, &rec); cnt++; } - ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); + ASSERT(cnt == ifp->if_nextents); xfs_btree_set_numrecs(ablock, cnt); /* @@ -778,7 +777,7 @@ out_unreserve_dquot: xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); out_root_realloc: xfs_iroot_realloc(ip, -1, whichfork); - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; ASSERT(ifp->if_broot == NULL); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); @@ -800,16 +799,16 @@ xfs_bmap_local_to_extents_empty( struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(whichfork != XFS_COW_FORK); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); ASSERT(ifp->if_bytes == 0); - ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); + ASSERT(ifp->if_nextents == 0); xfs_bmap_forkoff_reset(ip, whichfork); ifp->if_flags &= ~XFS_IFINLINE; ifp->if_flags |= XFS_IFEXTENTS; ifp->if_u1.if_root = NULL; ifp->if_height = 0; - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -840,7 +839,7 @@ xfs_bmap_local_to_extents( */ ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK)); ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); if (!ifp->if_bytes) { xfs_bmap_local_to_extents_empty(tp, ip, whichfork); @@ -907,7 +906,7 @@ xfs_bmap_local_to_extents( xfs_iext_first(ifp, &icur); xfs_iext_insert(ip, &icur, &rec, 0); - XFS_IFORK_NEXT_SET(ip, whichfork, 1); + ifp->if_nextents = 1; ip->i_d.di_nblocks = 1; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); @@ -972,7 +971,8 @@ xfs_bmap_add_attrfork_extents( xfs_btree_cur_t *cur; /* bmap btree cursor */ int error; /* error return value */ - if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip)) + if (ip->i_df.if_nextents * sizeof(struct xfs_bmbt_rec) <= + XFS_IFORK_DSIZE(ip)) return 0; cur = NULL; error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags, @@ -1033,7 +1033,7 @@ xfs_bmap_set_attrforkoff( int size, int *version) { - switch (ip->i_d.di_format) { + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_DEV: ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; break; @@ -1091,17 +1091,6 @@ xfs_bmap_add_attrfork( goto trans_cancel; if (XFS_IFORK_Q(ip)) goto trans_cancel; - if (XFS_IS_CORRUPT(mp, ip->i_d.di_anextents != 0)) { - error = -EFSCORRUPTED; - goto trans_cancel; - } - if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { - /* - * For inodes coming from pre-6.2 filesystems. - */ - ASSERT(ip->i_d.di_aformat == 0); - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - } xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -1110,9 +1099,10 @@ xfs_bmap_add_attrfork( goto trans_cancel; ASSERT(ip->i_afp == NULL); ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0); + ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS; ip->i_afp->if_flags = XFS_IFEXTENTS; logflags = 0; - switch (ip->i_d.di_format) { + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_LOCAL: error = xfs_bmap_add_attrfork_local(tp, ip, &logflags); break; @@ -1183,13 +1173,13 @@ xfs_iread_bmbt_block( xfs_extnum_t num_recs; xfs_extnum_t j; int whichfork = cur->bc_ino.whichfork; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); block = xfs_btree_get_block(cur, level, &bp); /* Abort if we find more records than nextents. */ num_recs = xfs_btree_get_numrecs(block); - if (unlikely(ir->loaded + num_recs > - XFS_IFORK_NEXTENTS(ip, whichfork))) { + if (unlikely(ir->loaded + num_recs > ifp->if_nextents)) { xfs_warn(ip->i_mount, "corrupt dinode %llu, (btree extents).", (unsigned long long)ip->i_ino); xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, block, @@ -1215,7 +1205,7 @@ xfs_iread_bmbt_block( xfs_bmap_fork_to_state(whichfork)); trace_xfs_read_extent(ip, &ir->icur, xfs_bmap_fork_to_state(whichfork), _THIS_IP_); - xfs_iext_next(XFS_IFORK_PTR(ip, whichfork), &ir->icur); + xfs_iext_next(ifp, &ir->icur); } return 0; @@ -1238,9 +1228,7 @@ xfs_iread_extents( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (XFS_IS_CORRUPT(mp, - XFS_IFORK_FORMAT(ip, whichfork) != - XFS_DINODE_FMT_BTREE)) { + if (XFS_IS_CORRUPT(mp, ifp->if_format != XFS_DINODE_FMT_BTREE)) { error = -EFSCORRUPTED; goto out; } @@ -1254,8 +1242,7 @@ xfs_iread_extents( if (error) goto out; - if (XFS_IS_CORRUPT(mp, - ir.loaded != XFS_IFORK_NEXTENTS(ip, whichfork))) { + if (XFS_IS_CORRUPT(mp, ir.loaded != ifp->if_nextents)) { error = -EFSCORRUPTED; goto out; } @@ -1289,14 +1276,13 @@ xfs_bmap_first_unused( xfs_fileoff_t lowest, max; int error; - ASSERT(xfs_ifork_has_extents(ip, whichfork) || - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); - - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { *first_unused = 0; return 0; } + ASSERT(xfs_ifork_has_extents(ifp)); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(tp, ip, whichfork); if (error) @@ -1337,7 +1323,7 @@ xfs_bmap_last_before( struct xfs_iext_cursor icur; int error; - switch (XFS_IFORK_FORMAT(ip, whichfork)) { + switch (ifp->if_format) { case XFS_DINODE_FMT_LOCAL: *last_block = 0; return 0; @@ -1436,16 +1422,17 @@ xfs_bmap_last_offset( xfs_fileoff_t *last_block, int whichfork) { + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_bmbt_irec rec; int is_empty; int error; *last_block = 0; - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) + if (ifp->if_format == XFS_DINODE_FMT_LOCAL) return 0; - if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ip, whichfork))) + if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp))) return -EFSCORRUPTED; error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); @@ -1463,23 +1450,22 @@ xfs_bmap_last_offset( */ int /* 1=>1 block, 0=>otherwise */ xfs_bmap_one_block( - xfs_inode_t *ip, /* incore inode */ - int whichfork) /* data or attr fork */ + struct xfs_inode *ip, /* incore inode */ + int whichfork) /* data or attr fork */ { - struct xfs_ifork *ifp; /* inode fork pointer */ - int rval; /* return value */ - xfs_bmbt_irec_t s; /* internal version of extent */ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int rval; /* return value */ + struct xfs_bmbt_irec s; /* internal version of extent */ struct xfs_iext_cursor icur; #ifndef DEBUG if (whichfork == XFS_DATA_FORK) return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; #endif /* !DEBUG */ - if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) + if (ifp->if_nextents != 1) return 0; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) return 0; - ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(ifp->if_flags & XFS_IFEXTENTS); xfs_iext_first(ifp, &icur); xfs_iext_get_extent(ifp, &icur, &s); @@ -1501,10 +1487,11 @@ xfs_bmap_add_extent_delay_real( struct xfs_bmalloca *bma, int whichfork) { + struct xfs_mount *mp = bma->ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); struct xfs_bmbt_irec *new = &bma->got; int error; /* error return value */ int i; /* temp state */ - struct xfs_ifork *ifp; /* inode fork pointer */ xfs_fileoff_t new_endoff; /* end offset of new entry */ xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ @@ -1514,16 +1501,9 @@ xfs_bmap_add_extent_delay_real( xfs_filblks_t da_old; /* old count del alloc blocks used */ xfs_filblks_t temp=0; /* value for da_new calculations */ int tmp_rval; /* partial logging flags */ - struct xfs_mount *mp; - xfs_extnum_t *nextents; struct xfs_bmbt_irec old; - mp = bma->ip->i_mount; - ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(whichfork != XFS_ATTR_FORK); - nextents = (whichfork == XFS_COW_FORK ? &bma->ip->i_cnextents : - &bma->ip->i_d.di_nextents); - ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || (bma->cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL)); @@ -1614,7 +1594,7 @@ xfs_bmap_add_extent_delay_real( xfs_iext_remove(bma->ip, &bma->icur, state); xfs_iext_prev(ifp, &bma->icur); xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT); - (*nextents)--; + ifp->if_nextents--; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1718,8 +1698,8 @@ xfs_bmap_add_extent_delay_real( PREV.br_startblock = new->br_startblock; PREV.br_state = new->br_state; xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); + ifp->if_nextents++; - (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1784,7 +1764,8 @@ xfs_bmap_add_extent_delay_real( * The left neighbor is not contiguous. */ xfs_iext_update_extent(bma->ip, state, &bma->icur, new); - (*nextents)++; + ifp->if_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1870,7 +1851,8 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is not contiguous. */ xfs_iext_update_extent(bma->ip, state, &bma->icur, new); - (*nextents)++; + ifp->if_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1955,7 +1937,7 @@ xfs_bmap_add_extent_delay_real( xfs_iext_next(ifp, &bma->icur); xfs_iext_insert(bma->ip, &bma->icur, &RIGHT, state); xfs_iext_insert(bma->ip, &bma->icur, &LEFT, state); - (*nextents)++; + ifp->if_nextents++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -2159,8 +2141,7 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &LEFT); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 2); + ifp->if_nextents -= 2; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2212,8 +2193,7 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &LEFT); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + ifp->if_nextents--; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2255,9 +2235,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &PREV); + ifp->if_nextents--; - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2364,8 +2343,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_update_extent(ip, state, icur, &PREV); xfs_iext_insert(ip, icur, new, state); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + ifp->if_nextents++; + if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2440,9 +2419,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_update_extent(ip, state, icur, &PREV); xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, new, state); + ifp->if_nextents++; - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2493,9 +2471,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, &r[1], state); xfs_iext_insert(ip, icur, &r[0], state); + ifp->if_nextents += 2; - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 2); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2810,9 +2787,8 @@ xfs_bmap_add_extent_hole_real( xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &left); + ifp->if_nextents--; - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { @@ -2910,8 +2886,8 @@ xfs_bmap_add_extent_hole_real( * Insert a new entry. */ xfs_iext_insert(ip, icur, new, state); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + ifp->if_nextents++; + if (cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { @@ -3891,7 +3867,8 @@ xfs_bmapi_read( int flags) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp; + int whichfork = xfs_bmapi_whichfork(flags); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_bmbt_irec got; xfs_fileoff_t obno; xfs_fileoff_t end; @@ -3899,48 +3876,23 @@ xfs_bmapi_read( int error; bool eof = false; int n = 0; - int whichfork = xfs_bmapi_whichfork(flags); ASSERT(*nmap >= 1); - ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| - XFS_BMAPI_COWFORK))); + ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_ENTIRE))); ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + if (WARN_ON_ONCE(!ifp)) + return -EFSCORRUPTED; + + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) return -EFSCORRUPTED; - } if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; XFS_STATS_INC(mp, xs_blk_mapr); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!ifp) { - /* No CoW fork? Return a hole. */ - if (whichfork == XFS_COW_FORK) { - mval->br_startoff = bno; - mval->br_startblock = HOLESTARTBLOCK; - mval->br_blockcount = len; - mval->br_state = XFS_EXT_NORM; - *nmap = 1; - return 0; - } - - /* - * A missing attr ifork implies that the inode says we're in - * extents or btree format but failed to pass the inode fork - * verifier while trying to load it. Treat that as a file - * corruption too. - */ -#ifdef DEBUG - xfs_alert(mp, "%s: inode %llu missing fork %d", - __func__, ip->i_ino, whichfork); -#endif /* DEBUG */ - return -EFSCORRUPTED; - } - if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, whichfork); if (error) @@ -4193,17 +4145,7 @@ xfs_bmapi_allocate( bma->got.br_blockcount = bma->length; bma->got.br_state = XFS_EXT_NORM; - /* - * In the data fork, a wasdelay extent has been initialized, so - * shouldn't be flagged as unwritten. - * - * For the cow fork, however, we convert delalloc reservations - * (extents allocated for speculative preallocation) to - * allocated unwritten extents, and only convert the unwritten - * extents to real extents when we're about to write the data. - */ - if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) && - (bma->flags & XFS_BMAPI_PREALLOC)) + if (bma->flags & XFS_BMAPI_PREALLOC) bma->got.br_state = XFS_EXT_UNWRITTEN; if (bma->wasdel) @@ -4317,11 +4259,13 @@ xfs_bmapi_minleft( struct xfs_inode *ip, int fork) { + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, fork); + if (tp && tp->t_firstblock != NULLFSBLOCK) return 0; - if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE) + if (ifp->if_format != XFS_DINODE_FMT_BTREE) return 1; - return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1; + return be16_to_cpu(ifp->if_broot->bb_level) + 1; } /* @@ -4336,11 +4280,13 @@ xfs_bmapi_finish( int whichfork, int error) { + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + if ((bma->logflags & xfs_ilog_fext(whichfork)) && - XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + ifp->if_format != XFS_DINODE_FMT_EXTENTS) bma->logflags &= ~xfs_ilog_fext(whichfork); else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) && - XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE) + ifp->if_format != XFS_DINODE_FMT_BTREE) bma->logflags &= ~xfs_ilog_fbroot(whichfork); if (bma->logflags) @@ -4372,13 +4318,13 @@ xfs_bmapi_write( .total = total, }; struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp; + int whichfork = xfs_bmapi_whichfork(flags); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_fileoff_t end; /* end of mapped file region */ bool eof = false; /* after the end of extents */ int error; /* error return */ int n; /* current extent index */ xfs_fileoff_t obno; /* old block number (offset) */ - int whichfork; /* data or attr fork */ #ifdef DEBUG xfs_fileoff_t orig_bno; /* original block number value */ @@ -4393,13 +4339,12 @@ xfs_bmapi_write( orig_mval = mval; orig_nmap = *nmap; #endif - whichfork = xfs_bmapi_whichfork(flags); ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); ASSERT(tp != NULL); ASSERT(len > 0); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_format != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!(flags & XFS_BMAPI_REMAP)); @@ -4415,7 +4360,7 @@ xfs_bmapi_write( ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) != (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -4423,8 +4368,6 @@ xfs_bmapi_write( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - ifp = XFS_IFORK_PTR(ip, whichfork); - XFS_STATS_INC(mp, xs_blk_mapw); if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -4534,9 +4477,8 @@ xfs_bmapi_write( if (error) goto error0; - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || - XFS_IFORK_NEXTENTS(ip, whichfork) > - XFS_IFORK_MAXEXT(ip, whichfork)); + ASSERT(ifp->if_format != XFS_DINODE_FMT_BTREE || + ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork)); xfs_bmapi_finish(&bma, whichfork, 0); xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, orig_nmap, *nmap); @@ -4611,8 +4553,23 @@ xfs_bmapi_convert_delalloc( bma.offset = bma.got.br_startoff; bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN); bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); + + /* + * When we're converting the delalloc reservations backing dirty pages + * in the page cache, we must be careful about how we create the new + * extents: + * + * New CoW fork extents are created unwritten, turned into real extents + * when we're about to write the data to disk, and mapped into the data + * fork after the write finishes. End of story. + * + * New data fork extents must be mapped in as unwritten and converted + * to real extents after the write succeeds to avoid exposing stale + * disk contents if we crash. + */ + bma.flags = XFS_BMAPI_PREALLOC; if (whichfork == XFS_COW_FORK) - bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; + bma.flags |= XFS_BMAPI_COWFORK; if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) bma.prev.br_startoff = NULLFILEOFF; @@ -4682,7 +4639,7 @@ xfs_bmapi_remap( ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) != (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -4726,9 +4683,9 @@ xfs_bmapi_remap( error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork); error0: - if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) + if (ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS) logflags &= ~XFS_ILOG_DEXT; - else if (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) + else if (ip->i_df.if_format != XFS_DINODE_FMT_BTREE) logflags &= ~XFS_ILOG_DBROOT; if (logflags) @@ -5078,9 +5035,8 @@ xfs_bmap_del_extent_real( * conversion to btree format, since the transaction will be dirty then. */ if (tp->t_blk_res == 0 && - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) >= - XFS_IFORK_MAXEXT(ip, whichfork) && + ifp->if_format == XFS_DINODE_FMT_EXTENTS && + ifp->if_nextents >= XFS_IFORK_MAXEXT(ip, whichfork) && del->br_startoff > got.br_startoff && del_endoff < got_endoff) return -ENOSPC; @@ -5132,8 +5088,8 @@ xfs_bmap_del_extent_real( */ xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + ifp->if_nextents--; + flags |= XFS_ILOG_CORE; if (!cur) { flags |= xfs_ilog_fext(whichfork); @@ -5241,8 +5197,8 @@ xfs_bmap_del_extent_real( } } else flags |= xfs_ilog_fext(whichfork); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + + ifp->if_nextents++; xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, &new, state); break; @@ -5322,7 +5278,7 @@ __xfs_bunmapi( whichfork = xfs_bmapi_whichfork(flags); ASSERT(whichfork != XFS_COW_FORK); ifp = XFS_IFORK_PTR(ip, whichfork); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork))) + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) return -EFSCORRUPTED; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -5360,7 +5316,7 @@ __xfs_bunmapi( logflags = 0; if (ifp->if_flags & XFS_IFBROOT) { - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_ino.flags = 0; } else @@ -5605,10 +5561,10 @@ error0: * logging the extent records if we've converted to btree format. */ if ((logflags & xfs_ilog_fext(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + ifp->if_format != XFS_DINODE_FMT_EXTENTS) logflags &= ~xfs_ilog_fext(whichfork); else if ((logflags & xfs_ilog_fbroot(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) + ifp->if_format != XFS_DINODE_FMT_BTREE) logflags &= ~xfs_ilog_fbroot(whichfork); /* * Log inode even in the error case, if the transaction @@ -5690,6 +5646,7 @@ xfs_bmse_merge( struct xfs_btree_cur *cur, int *logflags) /* output */ { + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_bmbt_irec new; xfs_filblks_t blockcount; int error, i; @@ -5708,8 +5665,7 @@ xfs_bmse_merge( * Update the on-disk extent count, the btree if necessary and log the * inode. */ - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + ifp->if_nextents--; *logflags |= XFS_ILOG_CORE; if (!cur) { *logflags |= XFS_ILOG_DEXT; @@ -5747,7 +5703,7 @@ xfs_bmse_merge( done: xfs_iext_remove(ip, icur, 0); - xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur); + xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur, &new); @@ -5819,7 +5775,7 @@ xfs_bmap_collapse_extents( int error = 0; int logflags = 0; - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -5936,7 +5892,7 @@ xfs_bmap_insert_extents( int error = 0; int logflags = 0; - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -6030,18 +5986,18 @@ xfs_bmap_split_extent( xfs_fileoff_t split_fsb) { int whichfork = XFS_DATA_FORK; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_irec got; struct xfs_bmbt_irec new; /* split extent */ struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp; xfs_fsblock_t gotblkcnt; /* new block count for got */ struct xfs_iext_cursor icur; int error = 0; int logflags = 0; int i = 0; - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -6049,7 +6005,6 @@ xfs_bmap_split_extent( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - ifp = XFS_IFORK_PTR(ip, whichfork); if (!(ifp->if_flags & XFS_IFEXTENTS)) { /* Read in all the extents */ error = xfs_iread_extents(tp, ip, whichfork); @@ -6097,8 +6052,7 @@ xfs_bmap_split_extent( /* Add new extent */ xfs_iext_next(ifp, &icur); xfs_iext_insert(ip, &icur, &new, 0); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + ifp->if_nextents++; if (cur) { error = xfs_bmbt_lookup_eq(cur, &new, &i); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index f3259ad5c22c..6028a3c825ba 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 295a59cf8840..d9c63f17d2de 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -636,10 +636,7 @@ xfs_bmbt_change_owner( ASSERT(tp || buffer_list); ASSERT(!(tp && buffer_list)); - if (whichfork == XFS_DATA_FORK) - ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE); - else - ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE); + ASSERT(XFS_IFORK_PTR(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); if (!cur) diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 29b407d053b4..72bf74c79fb9 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002-2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 8626c5a81aad..10e50cbacacf 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 53e503b6f186..6e25de6621e4 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 08c0a4d98b89..059ac108b1b3 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 22557527cfdb..d8f586256add 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -178,6 +178,18 @@ static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, }; +static void +xfs_defer_create_intent( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp, + bool sort) +{ + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + + dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, + dfp->dfp_count, sort); +} + /* * For each pending item in the intake list, log its intent item and the * associated extents, then add the entire intake list to the end of @@ -187,17 +199,11 @@ STATIC void xfs_defer_create_intents( struct xfs_trans *tp) { - struct list_head *li; struct xfs_defer_pending *dfp; - const struct xfs_defer_op_type *ops; list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { - ops = defer_op_types[dfp->dfp_type]; - dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count); trace_xfs_defer_create_intent(tp->t_mountp, dfp); - list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items); - list_for_each(li, &dfp->dfp_work) - ops->log_item(tp, dfp->dfp_intent, li); + xfs_defer_create_intent(tp, dfp, true); } } @@ -234,10 +240,13 @@ xfs_defer_trans_roll( struct xfs_log_item *lip; struct xfs_buf *bplist[XFS_DEFER_OPS_NR_BUFS]; struct xfs_inode *iplist[XFS_DEFER_OPS_NR_INODES]; + unsigned int ordered = 0; /* bitmap */ int bpcount = 0, ipcount = 0; int i; int error; + BUILD_BUG_ON(NBBY * sizeof(ordered) < XFS_DEFER_OPS_NR_BUFS); + list_for_each_entry(lip, &tp->t_items, li_trans) { switch (lip->li_type) { case XFS_LI_BUF: @@ -248,7 +257,10 @@ xfs_defer_trans_roll( ASSERT(0); return -EFSCORRUPTED; } - xfs_trans_dirty_buf(tp, bli->bli_buf); + if (bli->bli_flags & XFS_BLI_ORDERED) + ordered |= (1U << bpcount); + else + xfs_trans_dirty_buf(tp, bli->bli_buf); bplist[bpcount++] = bli->bli_buf; } break; @@ -289,6 +301,8 @@ xfs_defer_trans_roll( /* Rejoin the buffers and dirty them so the log moves forward. */ for (i = 0; i < bpcount; i++) { xfs_trans_bjoin(tp, bplist[i]); + if (ordered & (1U << i)) + xfs_trans_ordered_buf(tp, bplist[i]); xfs_trans_bhold(tp, bplist[i]); } @@ -346,6 +360,53 @@ xfs_defer_cancel_list( } /* + * Log an intent-done item for the first pending intent, and finish the work + * items. + */ +static int +xfs_defer_finish_one( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + struct xfs_btree_cur *state = NULL; + struct list_head *li, *n; + int error; + + trace_xfs_defer_pending_finish(tp->t_mountp, dfp); + + dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); + list_for_each_safe(li, n, &dfp->dfp_work) { + list_del(li); + dfp->dfp_count--; + error = ops->finish_item(tp, dfp->dfp_done, li, &state); + if (error == -EAGAIN) { + /* + * Caller wants a fresh transaction; put the work item + * back on the list and log a new log intent item to + * replace the old one. See "Requesting a Fresh + * Transaction while Finishing Deferred Work" above. + */ + list_add(li, &dfp->dfp_work); + dfp->dfp_count++; + dfp->dfp_done = NULL; + xfs_defer_create_intent(tp, dfp, false); + } + + if (error) + goto out; + } + + /* Done with the dfp, free it. */ + list_del(&dfp->dfp_list); + kmem_free(dfp); +out: + if (ops->finish_cleanup) + ops->finish_cleanup(tp, state, error); + return error; +} + +/* * Finish all the pending work. This involves logging intent items for * any work items that wandered in since the last transaction roll (if * one has even happened), rolling the transaction, and finishing the @@ -358,11 +419,7 @@ xfs_defer_finish_noroll( struct xfs_trans **tp) { struct xfs_defer_pending *dfp; - struct list_head *li; - struct list_head *n; - void *state; int error = 0; - const struct xfs_defer_op_type *ops; LIST_HEAD(dop_pending); ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -371,87 +428,30 @@ xfs_defer_finish_noroll( /* Until we run out of pending work to finish... */ while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) { - /* log intents and pull in intake items */ xfs_defer_create_intents(*tp); list_splice_tail_init(&(*tp)->t_dfops, &dop_pending); - /* - * Roll the transaction. - */ error = xfs_defer_trans_roll(tp); if (error) - goto out; + goto out_shutdown; - /* Log an intent-done item for the first pending item. */ dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, dfp_list); - ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp); - dfp->dfp_done = ops->create_done(*tp, dfp->dfp_intent, - dfp->dfp_count); - - /* Finish the work items. */ - state = NULL; - list_for_each_safe(li, n, &dfp->dfp_work) { - list_del(li); - dfp->dfp_count--; - error = ops->finish_item(*tp, li, dfp->dfp_done, - &state); - if (error == -EAGAIN) { - /* - * Caller wants a fresh transaction; - * put the work item back on the list - * and jump out. - */ - list_add(li, &dfp->dfp_work); - dfp->dfp_count++; - break; - } else if (error) { - /* - * Clean up after ourselves and jump out. - * xfs_defer_cancel will take care of freeing - * all these lists and stuff. - */ - if (ops->finish_cleanup) - ops->finish_cleanup(*tp, state, error); - goto out; - } - } - if (error == -EAGAIN) { - /* - * Caller wants a fresh transaction, so log a - * new log intent item to replace the old one - * and roll the transaction. See "Requesting - * a Fresh Transaction while Finishing - * Deferred Work" above. - */ - dfp->dfp_intent = ops->create_intent(*tp, - dfp->dfp_count); - dfp->dfp_done = NULL; - list_for_each(li, &dfp->dfp_work) - ops->log_item(*tp, dfp->dfp_intent, li); - } else { - /* Done with the dfp, free it. */ - list_del(&dfp->dfp_list); - kmem_free(dfp); - } - - if (ops->finish_cleanup) - ops->finish_cleanup(*tp, state, error); - } - -out: - if (error) { - xfs_defer_trans_abort(*tp, &dop_pending); - xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); - trace_xfs_defer_finish_error(*tp, error); - xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending); - xfs_defer_cancel(*tp); - return error; + error = xfs_defer_finish_one(*tp, dfp); + if (error && error != -EAGAIN) + goto out_shutdown; } trace_xfs_defer_finish_done(*tp, _RET_IP_); return 0; + +out_shutdown: + xfs_defer_trans_abort(*tp, &dop_pending); + xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); + trace_xfs_defer_finish_error(*tp, error); + xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending); + xfs_defer_cancel(*tp); + return error; } int diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 7c28d7608ac6..6b2ca580f2b0 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Copyright (C) 2016 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> @@ -6,6 +6,7 @@ #ifndef __XFS_DEFER_H__ #define __XFS_DEFER_H__ +struct xfs_btree_cur; struct xfs_defer_op_type; /* @@ -28,8 +29,8 @@ enum xfs_defer_ops_type { struct xfs_defer_pending { struct list_head dfp_list; /* pending items */ struct list_head dfp_work; /* work items */ - void *dfp_intent; /* log intent item */ - void *dfp_done; /* log done item */ + struct xfs_log_item *dfp_intent; /* log intent item */ + struct xfs_log_item *dfp_done; /* log done item */ unsigned int dfp_count; /* # extent items */ enum xfs_defer_ops_type dfp_type; }; @@ -43,15 +44,16 @@ void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp); /* Description of a deferred type. */ struct xfs_defer_op_type { - void (*abort_intent)(void *); - void *(*create_done)(struct xfs_trans *, void *, unsigned int); - int (*finish_item)(struct xfs_trans *, struct list_head *, void *, - void **); - void (*finish_cleanup)(struct xfs_trans *, void *, int); - void (*cancel_item)(struct list_head *); - int (*diff_items)(void *, struct list_head *, struct list_head *); - void *(*create_intent)(struct xfs_trans *, uint); - void (*log_item)(struct xfs_trans *, void *, struct list_head *); + struct xfs_log_item *(*create_intent)(struct xfs_trans *tp, + struct list_head *items, unsigned int count, bool sort); + void (*abort_intent)(struct xfs_log_item *intent); + struct xfs_log_item *(*create_done)(struct xfs_trans *tp, + struct xfs_log_item *intent, unsigned int count); + int (*finish_item)(struct xfs_trans *tp, struct xfs_log_item *done, + struct list_head *item, struct xfs_btree_cur **state); + void (*finish_cleanup)(struct xfs_trans *tp, + struct xfs_btree_cur *state, int error); + void (*cancel_item)(struct list_head *item); unsigned int max_items; }; diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index dd6fcaaea318..612a9c5e41b1 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -278,7 +278,7 @@ xfs_dir_createname( if (!inum) args->op_flags |= XFS_DA_OP_JUSTCHECK; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_addname(args); goto out_free; } @@ -373,7 +373,7 @@ xfs_dir_lookup( args->op_flags |= XFS_DA_OP_CILOOKUP; lock_mode = xfs_ilock_data_map_shared(dp); - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_lookup(args); goto out_check_rval; } @@ -443,7 +443,7 @@ xfs_dir_removename( args->whichfork = XFS_DATA_FORK; args->trans = tp; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_removename(args); goto out_free; } @@ -504,7 +504,7 @@ xfs_dir_replace( args->whichfork = XFS_DATA_FORK; args->trans = tp; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_replace(args); goto out_free; } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 033777e282f2..e55378640b05 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 1dbf2f980a26..5b59d3f7746b 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -1104,7 +1104,7 @@ xfs_dir2_sf_to_block( ASSERT(ifp->if_bytes == dp->i_d.di_size); ASSERT(ifp->if_u1.if_data != NULL); ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); - ASSERT(dp->i_d.di_nextents == 0); + ASSERT(dp->i_df.if_nextents == 0); /* * Copy the directory into a temporary buffer. diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 01ee0b926572..44c6a77cba05 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 7b7f6fb2ea3b..2463b5d73447 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -343,7 +343,7 @@ xfs_dir2_block_to_sf( */ ASSERT(dp->i_df.if_bytes == 0); xfs_init_local_fork(dp, XFS_DATA_FORK, sfp, size); - dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + dp->i_df.if_format = XFS_DINODE_FMT_LOCAL; dp->i_d.di_size = size; logflags |= XFS_ILOG_DDATA; @@ -710,11 +710,11 @@ xfs_dir2_sf_verify( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); struct xfs_dir2_sf_hdr *sfp; struct xfs_dir2_sf_entry *sfep; struct xfs_dir2_sf_entry *next_sfep; char *endp; - struct xfs_ifork *ifp; xfs_ino_t ino; int i; int i8count; @@ -723,9 +723,8 @@ xfs_dir2_sf_verify( int error; uint8_t filetype; - ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data; size = ifp->if_bytes; @@ -827,9 +826,9 @@ xfs_dir2_sf_create( * If it's currently a zero-length extent file, * convert it to local format. */ - if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) { + if (dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS) { dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */ - dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + dp->i_df.if_format = XFS_DINODE_FMT_LOCAL; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); dp->i_df.if_flags |= XFS_IFINLINE; } @@ -1027,7 +1026,7 @@ xfs_dir2_sf_replace_needblock( int newsize; struct xfs_dir2_sf_hdr *sfp; - if (dp->i_d.di_format != XFS_DINODE_FMT_LOCAL) + if (dp->i_df.if_format != XFS_DINODE_FMT_LOCAL) return false; sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 79e6c4fb1d8a..53b305dea381 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * Copyright (C) 2017 Oracle. @@ -55,7 +55,8 @@ #define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32 #define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33 #define XFS_ERRTAG_IUNLINK_FALLBACK 34 -#define XFS_ERRTAG_MAX 35 +#define XFS_ERRTAG_BUF_IOERROR 35 +#define XFS_ERRTAG_MAX 36 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -95,5 +96,6 @@ #define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 #define XFS_RANDOM_FORCE_SUMMARY_RECALC 1 #define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 045556e78ee2..b42a52bfa1e9 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. @@ -964,13 +964,12 @@ enum xfs_dinode_fmt { /* * Inode data & attribute fork sizes, per inode. */ -#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0) #define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3)) #define XFS_DFORK_DSIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? XFS_DFORK_BOFF(dip) : XFS_LITINO(mp)) + ((dip)->di_forkoff ? XFS_DFORK_BOFF(dip) : XFS_LITINO(mp)) #define XFS_DFORK_ASIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : 0) + ((dip)->di_forkoff ? XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : 0) #define XFS_DFORK_SIZE(dip,mp,w) \ ((w) == XFS_DATA_FORK ? \ XFS_DFORK_DSIZE(dip, mp) : \ @@ -1681,7 +1680,7 @@ struct xfs_acl_entry { struct xfs_acl { __be32 acl_cnt; - struct xfs_acl_entry acl_entry[0]; + struct xfs_acl_entry acl_entry[]; }; /* diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 245188e4f6d3..84bcffa87753 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: LGPL-2.1 +/* SPDX-License-Identifier: LGPL-2.1 */ /* * Copyright (c) 1995-2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 272005ac8c88..99e796256c5d 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Copyright (C) 2019 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 39c5a6e24915..6f84ea85fdd8 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -161,8 +161,7 @@ xfs_imap_to_bp( struct xfs_imap *imap, struct xfs_dinode **dipp, struct xfs_buf **bpp, - uint buf_flags, - uint iget_flags) + uint buf_flags) { struct xfs_buf *bp; int error; @@ -172,12 +171,7 @@ xfs_imap_to_bp( (int)imap->im_len, buf_flags, &bp, &xfs_inode_buf_ops); if (error) { - if (error == -EAGAIN) { - ASSERT(buf_flags & XBF_TRYLOCK); - return error; - } - xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", - __func__, error); + ASSERT(error != -EAGAIN || (buf_flags & XBF_TRYLOCK)); return error; } @@ -186,13 +180,36 @@ xfs_imap_to_bp( return 0; } -void +int xfs_inode_from_disk( struct xfs_inode *ip, struct xfs_dinode *from) { struct xfs_icdinode *to = &ip->i_d; struct inode *inode = VFS_I(ip); + int error; + xfs_failaddr_t fa; + + ASSERT(ip->i_cowfp == NULL); + ASSERT(ip->i_afp == NULL); + + fa = xfs_dinode_verify(ip->i_mount, ip->i_ino, from); + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, "dinode", from, + sizeof(*from), fa); + return -EFSCORRUPTED; + } + + /* + * First get the permanent information that is needed to allocate an + * inode. If the inode is unused, mode is zero and we shouldn't mess + * with the unitialized part of it. + */ + to->di_flushiter = be16_to_cpu(from->di_flushiter); + inode->i_generation = be32_to_cpu(from->di_gen); + inode->i_mode = be16_to_cpu(from->di_mode); + if (!inode->i_mode) + return 0; /* * Convert v1 inodes immediately to v2 inode format as this is the @@ -208,10 +225,8 @@ xfs_inode_from_disk( be16_to_cpu(from->di_projid_lo); } - to->di_format = from->di_format; i_uid_write(inode, be32_to_cpu(from->di_uid)); i_gid_write(inode, be32_to_cpu(from->di_gid)); - to->di_flushiter = be16_to_cpu(from->di_flushiter); /* * Time is signed, so need to convert to signed 32 bit before @@ -225,16 +240,11 @@ xfs_inode_from_disk( inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec); inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec); inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec); - inode->i_generation = be32_to_cpu(from->di_gen); - inode->i_mode = be16_to_cpu(from->di_mode); to->di_size = be64_to_cpu(from->di_size); to->di_nblocks = be64_to_cpu(from->di_nblocks); to->di_extsize = be32_to_cpu(from->di_extsize); - to->di_nextents = be32_to_cpu(from->di_nextents); - to->di_anextents = be16_to_cpu(from->di_anextents); to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; to->di_dmevmask = be32_to_cpu(from->di_dmevmask); to->di_dmstate = be16_to_cpu(from->di_dmstate); to->di_flags = be16_to_cpu(from->di_flags); @@ -247,6 +257,22 @@ xfs_inode_from_disk( to->di_flags2 = be64_to_cpu(from->di_flags2); to->di_cowextsize = be32_to_cpu(from->di_cowextsize); } + + error = xfs_iformat_data_fork(ip, from); + if (error) + return error; + if (from->di_forkoff) { + error = xfs_iformat_attr_fork(ip, from); + if (error) + goto out_destroy_data_fork; + } + if (xfs_is_reflink_inode(ip)) + xfs_ifork_init_cow(ip); + return 0; + +out_destroy_data_fork: + xfs_idestroy_fork(&ip->i_df); + return error; } void @@ -261,7 +287,7 @@ xfs_inode_to_disk( to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); to->di_onlink = 0; - to->di_format = from->di_format; + to->di_format = xfs_ifork_format(&ip->i_df); to->di_uid = cpu_to_be32(i_uid_read(inode)); to->di_gid = cpu_to_be32(i_gid_read(inode)); to->di_projid_lo = cpu_to_be16(from->di_projid & 0xffff); @@ -281,10 +307,10 @@ xfs_inode_to_disk( to->di_size = cpu_to_be64(from->di_size); to->di_nblocks = cpu_to_be64(from->di_nblocks); to->di_extsize = cpu_to_be32(from->di_extsize); - to->di_nextents = cpu_to_be32(from->di_nextents); - to->di_anextents = cpu_to_be16(from->di_anextents); + to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); + to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp)); to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; + to->di_aformat = xfs_ifork_format(ip->i_afp); to->di_dmevmask = cpu_to_be32(from->di_dmevmask); to->di_dmstate = cpu_to_be16(from->di_dmstate); to->di_flags = cpu_to_be16(from->di_flags); @@ -405,7 +431,7 @@ xfs_dinode_verify_forkoff( struct xfs_dinode *dip, struct xfs_mount *mp) { - if (!XFS_DFORK_Q(dip)) + if (!dip->di_forkoff) return NULL; switch (dip->di_format) { @@ -508,7 +534,7 @@ xfs_dinode_verify( return __this_address; } - if (XFS_DFORK_Q(dip)) { + if (dip->di_forkoff) { fa = xfs_dinode_verify_fork(dip, mp, XFS_ATTR_FORK); if (fa) return fa; @@ -585,122 +611,6 @@ xfs_dinode_calc_crc( } /* - * Read the disk inode attributes into the in-core inode structure. - * - * For version 5 superblocks, if we are initialising a new inode and we are not - * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new - * inode core with a random generation number. If we are keeping inodes around, - * we need to read the inode cluster to get the existing generation number off - * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode - * format) then log recovery is dependent on the di_flushiter field being - * initialised from the current on-disk value and hence we must also read the - * inode off disk. - */ -int -xfs_iread( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_inode_t *ip, - uint iget_flags) -{ - xfs_buf_t *bp; - xfs_dinode_t *dip; - xfs_failaddr_t fa; - int error; - - /* - * Fill in the location information in the in-core inode. - */ - error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); - if (error) - return error; - - /* shortcut IO on inode allocation if possible */ - if ((iget_flags & XFS_IGET_CREATE) && - xfs_sb_version_has_v3inode(&mp->m_sb) && - !(mp->m_flags & XFS_MOUNT_IKEEP)) { - VFS_I(ip)->i_generation = prandom_u32(); - return 0; - } - - /* - * Get pointers to the on-disk inode and the buffer containing it. - */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags); - if (error) - return error; - - /* even unallocated inodes are verified */ - fa = xfs_dinode_verify(mp, ip->i_ino, dip); - if (fa) { - xfs_inode_verifier_error(ip, -EFSCORRUPTED, "dinode", dip, - sizeof(*dip), fa); - error = -EFSCORRUPTED; - goto out_brelse; - } - - /* - * If the on-disk inode is already linked to a directory - * entry, copy all of the inode into the in-core inode. - * xfs_iformat_fork() handles copying in the inode format - * specific information. - * Otherwise, just get the truly permanent information. - */ - if (dip->di_mode) { - xfs_inode_from_disk(ip, dip); - error = xfs_iformat_fork(ip, dip); - if (error) { -#ifdef DEBUG - xfs_alert(mp, "%s: xfs_iformat() returned error %d", - __func__, error); -#endif /* DEBUG */ - goto out_brelse; - } - } else { - /* - * Partial initialisation of the in-core inode. Just the bits - * that xfs_ialloc won't overwrite or relies on being correct. - */ - VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen); - ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); - - /* - * Make sure to pull in the mode here as well in - * case the inode is released without being used. - * This ensures that xfs_inactive() will see that - * the inode is already free and not try to mess - * with the uninitialized part of it. - */ - VFS_I(ip)->i_mode = 0; - } - - ip->i_delayed_blks = 0; - - /* - * Mark the buffer containing the inode as something to keep - * around for a while. This helps to keep recently accessed - * meta-data in-core longer. - */ - xfs_buf_set_ref(bp, XFS_INO_REF); - - /* - * Use xfs_trans_brelse() to release the buffer containing the on-disk - * inode, because it was acquired with xfs_trans_read_buf() in - * xfs_imap_to_bp() above. If tp is NULL, this is just a normal - * brelse(). If we're within a transaction, then xfs_trans_brelse() - * will only release the buffer if it is not dirty within the - * transaction. It will be OK to release the buffer in this case, - * because inodes on disk are never destroyed and we will be locking the - * new in-core inode before putting it in the cache where other - * processes can find it. Thus we don't have to worry about the inode - * being changed just because we released the buffer. - */ - out_brelse: - xfs_trans_brelse(tp, bp); - return error; -} - -/* * Validate di_extsize hint. * * The rules are documented at xfs_ioctl_setattr_check_extsize(). diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 9b373dcf9e34..865ac493c72a 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -16,16 +16,12 @@ struct xfs_dinode; * format specific structures at the appropriate time. */ struct xfs_icdinode { - int8_t di_format; /* format of di_c data */ uint16_t di_flushiter; /* incremented on flush */ uint32_t di_projid; /* owner's project id */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ - xfs_extnum_t di_nextents; /* number of extents in data fork */ - xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ - int8_t di_aformat; /* format of attr fork's data */ uint32_t di_dmevmask; /* DMIG event mask */ uint16_t di_dmstate; /* DMIG state info */ uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ @@ -48,13 +44,11 @@ struct xfs_imap { int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, struct xfs_imap *, struct xfs_dinode **, - struct xfs_buf **, uint, uint); -int xfs_iread(struct xfs_mount *, struct xfs_trans *, - struct xfs_inode *, uint); + struct xfs_buf **, uint); void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to, xfs_lsn_t lsn); -void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); +int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, struct xfs_dinode *to); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 518c6f0ec3a6..28b366275ae0 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -26,110 +26,6 @@ kmem_zone_t *xfs_ifork_zone; -STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); -STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); -STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); - -/* - * Copy inode type and data and attr format specific information from the - * on-disk inode to the in-core inode and fork structures. For fifos, devices, - * and sockets this means set i_rdev to the proper value. For files, - * directories, and symlinks this means to bring in the in-line data or extent - * pointers as well as the attribute fork. For a fork in B-tree format, only - * the root is immediately brought in-core. The rest will be read in later when - * first referenced (see xfs_iread_extents()). - */ -int -xfs_iformat_fork( - struct xfs_inode *ip, - struct xfs_dinode *dip) -{ - struct inode *inode = VFS_I(ip); - struct xfs_attr_shortform *atp; - int size; - int error = 0; - xfs_fsize_t di_size; - - switch (inode->i_mode & S_IFMT) { - case S_IFIFO: - case S_IFCHR: - case S_IFBLK: - case S_IFSOCK: - ip->i_d.di_size = 0; - inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip)); - break; - - case S_IFREG: - case S_IFLNK: - case S_IFDIR: - switch (dip->di_format) { - case XFS_DINODE_FMT_LOCAL: - di_size = be64_to_cpu(dip->di_size); - size = (int)di_size; - error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); - break; - default: - xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, - dip, sizeof(*dip), __this_address); - return -EFSCORRUPTED; - } - break; - - default: - xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, - sizeof(*dip), __this_address); - return -EFSCORRUPTED; - } - if (error) - return error; - - if (xfs_is_reflink_inode(ip)) { - ASSERT(ip->i_cowfp == NULL); - xfs_ifork_init_cow(ip); - } - - if (!XFS_DFORK_Q(dip)) - return 0; - - ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS); - - switch (dip->di_aformat) { - case XFS_DINODE_FMT_LOCAL: - atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); - size = be16_to_cpu(atp->hdr.totsize); - - error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); - break; - default: - xfs_inode_verifier_error(ip, error, __func__, dip, - sizeof(*dip), __this_address); - error = -EFSCORRUPTED; - break; - } - if (error) { - kmem_cache_free(xfs_ifork_zone, ip->i_afp); - ip->i_afp = NULL; - if (ip->i_cowfp) - kmem_cache_free(xfs_ifork_zone, ip->i_cowfp); - ip->i_cowfp = NULL; - xfs_idestroy_fork(ip, XFS_DATA_FORK); - } - return error; -} - void xfs_init_local_fork( struct xfs_inode *ip, @@ -292,12 +188,11 @@ xfs_iformat_btree( * or the number of extents is greater than the number of * blocks. */ - if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork) || + if (unlikely(ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork) || nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > XFS_DFORK_SIZE(dip, mp, whichfork) || - XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) || + ifp->if_nextents > ip->i_d.di_nblocks) || level == 0 || level > XFS_BTREE_MAXLEVELS) { xfs_warn(mp, "corrupt inode %Lu (btree).", (unsigned long long) ip->i_ino); @@ -325,6 +220,110 @@ xfs_iformat_btree( return 0; } +int +xfs_iformat_data_fork( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct inode *inode = VFS_I(ip); + int error; + + /* + * Initialize the extent count early, as the per-format routines may + * depend on it. + */ + ip->i_df.if_format = dip->di_format; + ip->i_df.if_nextents = be32_to_cpu(dip->di_nextents); + + switch (inode->i_mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + ip->i_d.di_size = 0; + inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip)); + return 0; + case S_IFREG: + case S_IFLNK: + case S_IFDIR: + switch (ip->i_df.if_format) { + case XFS_DINODE_FMT_LOCAL: + error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, + be64_to_cpu(dip->di_size)); + if (!error) + error = xfs_ifork_verify_local_data(ip); + return error; + case XFS_DINODE_FMT_EXTENTS: + return xfs_iformat_extents(ip, dip, XFS_DATA_FORK); + case XFS_DINODE_FMT_BTREE: + return xfs_iformat_btree(ip, dip, XFS_DATA_FORK); + default: + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, + dip, sizeof(*dip), __this_address); + return -EFSCORRUPTED; + } + break; + default: + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, + sizeof(*dip), __this_address); + return -EFSCORRUPTED; + } +} + +static uint16_t +xfs_dfork_attr_shortform_size( + struct xfs_dinode *dip) +{ + struct xfs_attr_shortform *atp = + (struct xfs_attr_shortform *)XFS_DFORK_APTR(dip); + + return be16_to_cpu(atp->hdr.totsize); +} + +int +xfs_iformat_attr_fork( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + int error = 0; + + /* + * Initialize the extent count early, as the per-format routines may + * depend on it. + */ + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS); + ip->i_afp->if_format = dip->di_aformat; + if (unlikely(ip->i_afp->if_format == 0)) /* pre IRIX 6.2 file system */ + ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS; + ip->i_afp->if_nextents = be16_to_cpu(dip->di_anextents); + + switch (ip->i_afp->if_format) { + case XFS_DINODE_FMT_LOCAL: + error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, + xfs_dfork_attr_shortform_size(dip)); + if (!error) + error = xfs_ifork_verify_local_attr(ip); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); + break; + default: + xfs_inode_verifier_error(ip, error, __func__, dip, + sizeof(*dip), __this_address); + error = -EFSCORRUPTED; + break; + } + + if (error) { + kmem_cache_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + } + return error; +} + /* * Reallocate the space for if_broot based on the number of records * being added or deleted as indicated in rec_diff. Move the records @@ -504,38 +503,24 @@ xfs_idata_realloc( void xfs_idestroy_fork( - xfs_inode_t *ip, - int whichfork) + struct xfs_ifork *ifp) { - struct xfs_ifork *ifp; - - ifp = XFS_IFORK_PTR(ip, whichfork); if (ifp->if_broot != NULL) { kmem_free(ifp->if_broot); ifp->if_broot = NULL; } /* - * If the format is local, then we can't have an extents - * array so just look for an inline data array. If we're - * not local then we may or may not have an extents list, - * so check and free it up if we do. + * If the format is local, then we can't have an extents array so just + * look for an inline data array. If we're not local then we may or may + * not have an extents list, so check and free it up if we do. */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - if (ifp->if_u1.if_data != NULL) { - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = NULL; - } - } else if ((ifp->if_flags & XFS_IFEXTENTS) && ifp->if_height) { - xfs_iext_destroy(ifp); - } - - if (whichfork == XFS_ATTR_FORK) { - kmem_cache_free(xfs_ifork_zone, ip->i_afp); - ip->i_afp = NULL; - } else if (whichfork == XFS_COW_FORK) { - kmem_cache_free(xfs_ifork_zone, ip->i_cowfp); - ip->i_cowfp = NULL; + if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { + kmem_free(ifp->if_u1.if_data); + ifp->if_u1.if_data = NULL; + } else if (ifp->if_flags & XFS_IFEXTENTS) { + if (ifp->if_height) + xfs_iext_destroy(ifp); } } @@ -592,7 +577,7 @@ void xfs_iflush_fork( xfs_inode_t *ip, xfs_dinode_t *dip, - xfs_inode_log_item_t *iip, + struct xfs_inode_log_item *iip, int whichfork) { char *cp; @@ -618,7 +603,7 @@ xfs_iflush_fork( } cp = XFS_DFORK_PTR(dip, whichfork); mp = ip->i_mount; - switch (XFS_IFORK_FORMAT(ip, whichfork)) { + switch (ifp->if_format) { case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { @@ -633,7 +618,7 @@ xfs_iflush_fork( !(iip->ili_fields & extflag[whichfork])); if ((iip->ili_fields & extflag[whichfork]) && (ifp->if_bytes > 0)) { - ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); + ASSERT(ifp->if_nextents > 0); (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, whichfork); } @@ -691,48 +676,55 @@ xfs_ifork_init_cow( ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS); ip->i_cowfp->if_flags = XFS_IFEXTENTS; - ip->i_cformat = XFS_DINODE_FMT_EXTENTS; - ip->i_cnextents = 0; + ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS; } -/* Default fork content verifiers. */ -struct xfs_ifork_ops xfs_default_ifork_ops = { - .verify_attr = xfs_attr_shortform_verify, - .verify_dir = xfs_dir2_sf_verify, - .verify_symlink = xfs_symlink_shortform_verify, -}; - /* Verify the inline contents of the data fork of an inode. */ -xfs_failaddr_t -xfs_ifork_verify_data( - struct xfs_inode *ip, - struct xfs_ifork_ops *ops) +int +xfs_ifork_verify_local_data( + struct xfs_inode *ip) { - /* Non-local data fork, we're done. */ - if (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) - return NULL; + xfs_failaddr_t fa = NULL; - /* Check the inline data fork if there is one. */ switch (VFS_I(ip)->i_mode & S_IFMT) { case S_IFDIR: - return ops->verify_dir(ip); + fa = xfs_dir2_sf_verify(ip); + break; case S_IFLNK: - return ops->verify_symlink(ip); + fa = xfs_symlink_shortform_verify(ip); + break; default: - return NULL; + break; } + + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork", + ip->i_df.if_u1.if_data, ip->i_df.if_bytes, fa); + return -EFSCORRUPTED; + } + + return 0; } /* Verify the inline contents of the attr fork of an inode. */ -xfs_failaddr_t -xfs_ifork_verify_attr( - struct xfs_inode *ip, - struct xfs_ifork_ops *ops) +int +xfs_ifork_verify_local_attr( + struct xfs_inode *ip) { - /* There has to be an attr fork allocated if aformat is local. */ - if (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) - return NULL; - if (!XFS_IFORK_PTR(ip, XFS_ATTR_FORK)) - return __this_address; - return ops->verify_attr(ip); + struct xfs_ifork *ifp = ip->i_afp; + xfs_failaddr_t fa; + + if (!ifp) + fa = __this_address; + else + fa = xfs_attr_shortform_verify(ip); + + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", + ifp ? ifp->if_u1.if_data : NULL, + ifp ? ifp->if_bytes : 0, fa); + return -EFSCORRUPTED; + } + + return 0; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 668ee942be22..a4953e95c4f3 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -23,6 +23,8 @@ struct xfs_ifork { } if_u1; short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ + int8_t if_format; /* format of this fork */ + xfs_extnum_t if_nextents; /* # of extents in this fork */ }; /* @@ -55,43 +57,36 @@ struct xfs_ifork { ((w) == XFS_ATTR_FORK ? \ XFS_IFORK_ASIZE(ip) : \ 0)) -#define XFS_IFORK_FORMAT(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - (ip)->i_d.di_format : \ - ((w) == XFS_ATTR_FORK ? \ - (ip)->i_d.di_aformat : \ - (ip)->i_cformat)) -#define XFS_IFORK_FMT_SET(ip,w,n) \ - ((w) == XFS_DATA_FORK ? \ - ((ip)->i_d.di_format = (n)) : \ - ((w) == XFS_ATTR_FORK ? \ - ((ip)->i_d.di_aformat = (n)) : \ - ((ip)->i_cformat = (n)))) -#define XFS_IFORK_NEXTENTS(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - (ip)->i_d.di_nextents : \ - ((w) == XFS_ATTR_FORK ? \ - (ip)->i_d.di_anextents : \ - (ip)->i_cnextents)) -#define XFS_IFORK_NEXT_SET(ip,w,n) \ - ((w) == XFS_DATA_FORK ? \ - ((ip)->i_d.di_nextents = (n)) : \ - ((w) == XFS_ATTR_FORK ? \ - ((ip)->i_d.di_anextents = (n)) : \ - ((ip)->i_cnextents = (n)))) #define XFS_IFORK_MAXEXT(ip, w) \ (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) -#define xfs_ifork_has_extents(ip, w) \ - (XFS_IFORK_FORMAT((ip), (w)) == XFS_DINODE_FMT_EXTENTS || \ - XFS_IFORK_FORMAT((ip), (w)) == XFS_DINODE_FMT_BTREE) +static inline bool xfs_ifork_has_extents(struct xfs_ifork *ifp) +{ + return ifp->if_format == XFS_DINODE_FMT_EXTENTS || + ifp->if_format == XFS_DINODE_FMT_BTREE; +} + +static inline xfs_extnum_t xfs_ifork_nextents(struct xfs_ifork *ifp) +{ + if (!ifp) + return 0; + return ifp->if_nextents; +} + +static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp) +{ + if (!ifp) + return XFS_DINODE_FMT_EXTENTS; + return ifp->if_format; +} struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); -int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); +int xfs_iformat_data_fork(struct xfs_inode *, struct xfs_dinode *); +int xfs_iformat_attr_fork(struct xfs_inode *, struct xfs_dinode *); void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); -void xfs_idestroy_fork(struct xfs_inode *, int); +void xfs_idestroy_fork(struct xfs_ifork *ifp); void xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, int whichfork); void xfs_iroot_realloc(struct xfs_inode *, int, int); @@ -175,18 +170,7 @@ extern struct kmem_zone *xfs_ifork_zone; extern void xfs_ifork_init_cow(struct xfs_inode *ip); -typedef xfs_failaddr_t (*xfs_ifork_verifier_t)(struct xfs_inode *); - -struct xfs_ifork_ops { - xfs_ifork_verifier_t verify_symlink; - xfs_ifork_verifier_t verify_dir; - xfs_ifork_verifier_t verify_attr; -}; -extern struct xfs_ifork_ops xfs_default_ifork_ops; - -xfs_failaddr_t xfs_ifork_verify_data(struct xfs_inode *ip, - struct xfs_ifork_ops *ops); -xfs_failaddr_t xfs_ifork_verify_attr(struct xfs_inode *ip, - struct xfs_ifork_ops *ops); +int xfs_ifork_verify_local_data(struct xfs_inode *ip); +int xfs_ifork_verify_local_attr(struct xfs_inode *ip); #endif /* __XFS_INODE_FORK_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 3bf671637a91..641132d0e39d 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -7,6 +7,73 @@ #define __XFS_LOG_RECOVER_H__ /* + * Each log item type (XFS_LI_*) gets its own xlog_recover_item_ops to + * define how recovery should work for that type of log item. + */ +struct xlog_recover_item; + +/* Sorting hat for log items as they're read in. */ +enum xlog_recover_reorder { + XLOG_REORDER_BUFFER_LIST, + XLOG_REORDER_ITEM_LIST, + XLOG_REORDER_INODE_BUFFER_LIST, + XLOG_REORDER_CANCEL_LIST, +}; + +struct xlog_recover_item_ops { + uint16_t item_type; /* XFS_LI_* type code. */ + + /* + * Help sort recovered log items into the order required to replay them + * correctly. Log item types that always use XLOG_REORDER_ITEM_LIST do + * not have to supply a function here. See the comment preceding + * xlog_recover_reorder_trans for more details about what the return + * values mean. + */ + enum xlog_recover_reorder (*reorder)(struct xlog_recover_item *item); + + /* Start readahead for pass2, if provided. */ + void (*ra_pass2)(struct xlog *log, struct xlog_recover_item *item); + + /* Do whatever work we need to do for pass1, if provided. */ + int (*commit_pass1)(struct xlog *log, struct xlog_recover_item *item); + + /* + * This function should do whatever work is needed for pass2 of log + * recovery, if provided. + * + * If the recovered item is an intent item, this function should parse + * the recovered item to construct an in-core log intent item and + * insert it into the AIL. The in-core log intent item should have 1 + * refcount so that the item is freed either (a) when we commit the + * recovered log item for the intent-done item; (b) replay the work and + * log a new intent-done item; or (c) recovery fails and we have to + * abort. + * + * If the recovered item is an intent-done item, this function should + * parse the recovered item to find the id of the corresponding intent + * log item. Next, it should find the in-core log intent item in the + * AIL and release it. + */ + int (*commit_pass2)(struct xlog *log, struct list_head *buffer_list, + struct xlog_recover_item *item, xfs_lsn_t lsn); +}; + +extern const struct xlog_recover_item_ops xlog_icreate_item_ops; +extern const struct xlog_recover_item_ops xlog_buf_item_ops; +extern const struct xlog_recover_item_ops xlog_inode_item_ops; +extern const struct xlog_recover_item_ops xlog_dquot_item_ops; +extern const struct xlog_recover_item_ops xlog_quotaoff_item_ops; +extern const struct xlog_recover_item_ops xlog_bui_item_ops; +extern const struct xlog_recover_item_ops xlog_bud_item_ops; +extern const struct xlog_recover_item_ops xlog_efi_item_ops; +extern const struct xlog_recover_item_ops xlog_efd_item_ops; +extern const struct xlog_recover_item_ops xlog_rui_item_ops; +extern const struct xlog_recover_item_ops xlog_rud_item_ops; +extern const struct xlog_recover_item_ops xlog_cui_item_ops; +extern const struct xlog_recover_item_ops xlog_cud_item_ops; + +/* * Macros, structures, prototypes for internal log manager use. */ @@ -22,13 +89,13 @@ /* * item headers are in ri_buf[0]. Additional buffers follow. */ -typedef struct xlog_recover_item { +struct xlog_recover_item { struct list_head ri_list; - int ri_type; int ri_cnt; /* count of regions found */ int ri_total; /* total regions */ - xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ -} xlog_recover_item_t; + struct xfs_log_iovec *ri_buf; /* ptr to regions buffer */ + const struct xlog_recover_item_ops *ri_ops; +}; struct xlog_recover { struct hlist_node r_list; @@ -51,4 +118,12 @@ struct xlog_recover { #define XLOG_RECOVER_PASS1 1 #define XLOG_RECOVER_PASS2 2 +void xlog_buf_readahead(struct xlog *log, xfs_daddr_t blkno, uint len, + const struct xfs_buf_ops *ops); +bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len); +void xlog_recover_iodone(struct xfs_buf *bp); + +void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, + uint64_t intent_id); + #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index b2113b17e53c..56d9dd787e7b 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -100,7 +100,6 @@ typedef uint16_t xfs_qwarncnt_t; #define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ #define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ -#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ /* * flags to xfs_trans_mod_dquot to indicate which field needs to be diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index f42c74cb8be5..9498ced947be 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -66,7 +66,7 @@ xfs_rtbuf_get( ip = issum ? mp->m_rsumip : mp->m_rbmip; - error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK); + error = xfs_bmapi_read(ip, block, 1, &map, &nmap, 0); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index c526c5e5ab76..4df87546bd40 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -243,7 +243,7 @@ xfs_validate_sb_common( } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { xfs_notice(mp, -"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits."); +"Superblock earlier than Version 5 has XFS_{P|G}QUOTA_{ENFD|CHKD} bits."); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 3b8260ca7d1b..594bc447a7dd 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -204,16 +204,12 @@ xfs_failaddr_t xfs_symlink_shortform_verify( struct xfs_inode *ip) { - char *sfp; - char *endp; - struct xfs_ifork *ifp; - int size; - - ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL); - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - sfp = (char *)ifp->if_u1.if_data; - size = ifp->if_bytes; - endp = sfp + size; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + char *sfp = (char *)ifp->if_u1.if_data; + int size = ifp->if_bytes; + char *endp = sfp + size; + + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); /* * Zero length symlinks should never occur in memory as they are diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 2b8ccb5b975d..b5dfb6654842 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -27,7 +27,7 @@ xfs_trans_ijoin( struct xfs_inode *ip, uint lock_flags) { - xfs_inode_log_item_t *iip; + struct xfs_inode_log_item *iip; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (ip->i_itemp == NULL) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index add8598eacd5..7badd6dfe544 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -566,8 +566,9 @@ xchk_bmap_check_rmaps( struct xfs_scrub *sc, int whichfork) { - loff_t size; + struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork); xfs_agnumber_t agno; + bool zero_size; int error; if (!xfs_sb_version_hasrmapbt(&sc->mp->m_sb) || @@ -579,6 +580,8 @@ xchk_bmap_check_rmaps( if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK) return 0; + ASSERT(XFS_IFORK_PTR(sc->ip, whichfork) != NULL); + /* * Only do this for complex maps that are in btree format, or for * situations where we would seem to have a size but zero extents. @@ -586,19 +589,14 @@ xchk_bmap_check_rmaps( * to flag this bmap as corrupt if there are rmaps that need to be * reattached. */ - switch (whichfork) { - case XFS_DATA_FORK: - size = i_size_read(VFS_I(sc->ip)); - break; - case XFS_ATTR_FORK: - size = XFS_IFORK_Q(sc->ip); - break; - default: - size = 0; - break; - } - if (XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_BTREE && - (size == 0 || XFS_IFORK_NEXTENTS(sc->ip, whichfork) > 0)) + + if (whichfork == XFS_DATA_FORK) + zero_size = i_size_read(VFS_I(sc->ip)) == 0; + else + zero_size = false; + + if (ifp->if_format != XFS_DINODE_FMT_BTREE && + (zero_size || ifp->if_nextents > 0)) return 0; for (agno = 0; agno < sc->mp->m_sb.sb_agcount; agno++) { @@ -627,12 +625,14 @@ xchk_bmap( struct xchk_bmap_info info = { NULL }; struct xfs_mount *mp = sc->mp; struct xfs_inode *ip = sc->ip; - struct xfs_ifork *ifp; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_fileoff_t endoff; struct xfs_iext_cursor icur; int error = 0; - ifp = XFS_IFORK_PTR(ip, whichfork); + /* Non-existent forks can be ignored. */ + if (!ifp) + goto out; info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip); info.whichfork = whichfork; @@ -641,9 +641,6 @@ xchk_bmap( switch (whichfork) { case XFS_COW_FORK: - /* Non-existent CoW forks are ignorable. */ - if (!ifp) - goto out; /* No CoW forks on non-reflink inodes/filesystems. */ if (!xfs_is_reflink_inode(ip)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); @@ -651,8 +648,6 @@ xchk_bmap( } break; case XFS_ATTR_FORK: - if (!ifp) - goto out_check_rmap; if (!xfs_sb_version_hasattr(&mp->m_sb) && !xfs_sb_version_hasattr2(&mp->m_sb)) xchk_ino_set_corrupt(sc, sc->ip->i_ino); @@ -663,7 +658,7 @@ xchk_bmap( } /* Check the fork values */ - switch (XFS_IFORK_FORMAT(ip, whichfork)) { + switch (ifp->if_format) { case XFS_DINODE_FMT_UUID: case XFS_DINODE_FMT_DEV: case XFS_DINODE_FMT_LOCAL: @@ -717,7 +712,6 @@ xchk_bmap( goto out; } -out_check_rmap: error = xchk_bmap_check_rmaps(sc, whichfork); if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error)) goto out; diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 9a2e27ac1300..44b15015021f 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -468,7 +468,7 @@ xchk_da_btree( int error; /* Skip short format data structures; no btree to scan. */ - if (!xfs_ifork_has_extents(sc->ip, whichfork)) + if (!xfs_ifork_has_extents(XFS_IFORK_PTR(sc->ip, whichfork))) return 0; /* Set up initial da state. */ diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index fe2a6e030c8a..7c432997edad 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -635,7 +635,7 @@ xchk_directory_blocks( { struct xfs_bmbt_irec got; struct xfs_da_args args; - struct xfs_ifork *ifp; + struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK); struct xfs_mount *mp = sc->mp; xfs_fileoff_t leaf_lblk; xfs_fileoff_t free_lblk; @@ -647,11 +647,10 @@ xchk_directory_blocks( int error; /* Ignore local format directories. */ - if (sc->ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && - sc->ip->i_d.di_format != XFS_DINODE_FMT_BTREE) + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS && + ifp->if_format != XFS_DINODE_FMT_BTREE) return 0; - ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK); lblk = XFS_B_TO_FSB(mp, XFS_DIR2_DATA_OFFSET); leaf_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_LEAF_OFFSET); free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET); diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 64c217eb06a7..6517d67e8d51 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -278,8 +278,7 @@ xchk_iallocbt_check_cluster( &XFS_RMAP_OINFO_INODES); /* Grab the inode cluster buffer. */ - error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp, - 0, 0); + error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp, 0); if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error)) return error; diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 5705adc43a75..855aa8bcab64 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -90,7 +90,7 @@ xchk_parent_count_parent_dentries( * if there is one. */ lock_mode = xfs_ilock_data_map_shared(parent); - if (parent->i_d.di_nextents > 0) + if (parent->i_df.if_nextents > 0) error = xfs_dir3_data_readahead(parent, 0, 0); xfs_iunlock(parent, lock_mode); if (error) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 1fd4fb7a607c..b35611882ff9 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -382,7 +382,7 @@ xfs_map_blocks( */ retry: xfs_ilock(ip, XFS_ILOCK_SHARED); - ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); /* diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index c42f90e16b4f..bfad669e6b2f 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -367,7 +367,7 @@ xfs_attr_inactive( * removal below. */ if (xfs_inode_hasattr(dp) && - dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) { + dp->i_afp->if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_attr3_root_inactive(&trans, dp); if (error) goto out_cancel; @@ -388,8 +388,11 @@ out_cancel: xfs_trans_cancel(trans); out_destroy_fork: /* kill the in-core attr fork before we drop the inode lock */ - if (dp->i_afp) - xfs_idestroy_fork(dp, XFS_ATTR_FORK); + if (dp->i_afp) { + xfs_idestroy_fork(dp->i_afp); + kmem_cache_free(xfs_ifork_zone, dp->i_afp); + dp->i_afp = NULL; + } if (lock_mode) xfs_iunlock(dp, lock_mode); return error; diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 5ff1d929d3b5..e380bd1a9bfc 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -512,9 +512,9 @@ xfs_attr_list_ilocked( */ if (!xfs_inode_hasattr(dp)) return 0; - else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) + if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_list(context); - else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) return xfs_attr_leaf_list(context); return xfs_attr_node_list(context); } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index ee6f4229cebc..6736c5ab188f 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -22,16 +22,20 @@ #include "xfs_bmap_btree.h" #include "xfs_trans_space.h" #include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" kmem_zone_t *xfs_bui_zone; kmem_zone_t *xfs_bud_zone; +static const struct xfs_item_ops xfs_bui_item_ops; + static inline struct xfs_bui_log_item *BUI_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_bui_log_item, bui_item); } -void +STATIC void xfs_bui_item_free( struct xfs_bui_log_item *buip) { @@ -45,13 +49,13 @@ xfs_bui_item_free( * committed vs unpin operations in bulk insert operations. Hence the reference * count to ensure only the last caller frees the BUI. */ -void +STATIC void xfs_bui_release( struct xfs_bui_log_item *buip) { ASSERT(atomic_read(&buip->bui_refcount) > 0); if (atomic_dec_and_test(&buip->bui_refcount)) { - xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); xfs_bui_item_free(buip); } } @@ -124,17 +128,10 @@ xfs_bui_item_release( xfs_bui_release(BUI_ITEM(lip)); } -static const struct xfs_item_ops xfs_bui_item_ops = { - .iop_size = xfs_bui_item_size, - .iop_format = xfs_bui_item_format, - .iop_unpin = xfs_bui_item_unpin, - .iop_release = xfs_bui_item_release, -}; - /* * Allocate and initialize an bui item with the given number of extents. */ -struct xfs_bui_log_item * +STATIC struct xfs_bui_log_item * xfs_bui_init( struct xfs_mount *mp) @@ -278,27 +275,6 @@ xfs_bmap_update_diff_items( return ba->bi_owner->i_ino - bb->bi_owner->i_ino; } -/* Get an BUI. */ -STATIC void * -xfs_bmap_update_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_bui_log_item *buip; - - ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); - ASSERT(tp != NULL); - - buip = xfs_bui_init(tp->t_mountp); - ASSERT(buip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &buip->bui_item); - return buip; -} - /* Set the map extent flags for this mapping. */ static void xfs_trans_set_bmap_flags( @@ -326,16 +302,12 @@ xfs_trans_set_bmap_flags( STATIC void xfs_bmap_update_log_item( struct xfs_trans *tp, - void *intent, - struct list_head *item) + struct xfs_bui_log_item *buip, + struct xfs_bmap_intent *bmap) { - struct xfs_bui_log_item *buip = intent; - struct xfs_bmap_intent *bmap; uint next_extent; struct xfs_map_extent *map; - bmap = container_of(item, struct xfs_bmap_intent, bi_list); - tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); @@ -355,23 +327,44 @@ xfs_bmap_update_log_item( bmap->bi_bmap.br_state); } +static struct xfs_log_item * +xfs_bmap_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_bui_log_item *buip = xfs_bui_init(mp); + struct xfs_bmap_intent *bmap; + + ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); + + xfs_trans_add_item(tp, &buip->bui_item); + if (sort) + list_sort(mp, items, xfs_bmap_update_diff_items); + list_for_each_entry(bmap, items, bi_list) + xfs_bmap_update_log_item(tp, buip, bmap); + return &buip->bui_item; +} + /* Get an BUD so we can process all the deferred rmap updates. */ -STATIC void * +static struct xfs_log_item * xfs_bmap_update_create_done( struct xfs_trans *tp, - void *intent, + struct xfs_log_item *intent, unsigned int count) { - return xfs_trans_get_bud(tp, intent); + return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item; } /* Process a deferred rmap update. */ STATIC int xfs_bmap_update_finish_item( struct xfs_trans *tp, + struct xfs_log_item *done, struct list_head *item, - void *done_item, - void **state) + struct xfs_btree_cur **state) { struct xfs_bmap_intent *bmap; xfs_filblks_t count; @@ -379,7 +372,7 @@ xfs_bmap_update_finish_item( bmap = container_of(item, struct xfs_bmap_intent, bi_list); count = bmap->bi_bmap.br_blockcount; - error = xfs_trans_log_finish_bmap_update(tp, done_item, + error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bmap->bi_type, bmap->bi_owner, bmap->bi_whichfork, bmap->bi_bmap.br_startoff, @@ -398,9 +391,9 @@ xfs_bmap_update_finish_item( /* Abort all pending BUIs. */ STATIC void xfs_bmap_update_abort_intent( - void *intent) + struct xfs_log_item *intent) { - xfs_bui_release(intent); + xfs_bui_release(BUI_ITEM(intent)); } /* Cancel a deferred rmap update. */ @@ -416,10 +409,8 @@ xfs_bmap_update_cancel_item( const struct xfs_defer_op_type xfs_bmap_update_defer_type = { .max_items = XFS_BUI_MAX_FAST_EXTENTS, - .diff_items = xfs_bmap_update_diff_items, .create_intent = xfs_bmap_update_create_intent, .abort_intent = xfs_bmap_update_abort_intent, - .log_item = xfs_bmap_update_log_item, .create_done = xfs_bmap_update_create_done, .finish_item = xfs_bmap_update_finish_item, .cancel_item = xfs_bmap_update_cancel_item, @@ -429,32 +420,30 @@ const struct xfs_defer_op_type xfs_bmap_update_defer_type = { * Process a bmap update intent item that was recovered from the log. * We need to update some inode's bmbt. */ -int -xfs_bui_recover( - struct xfs_trans *parent_tp, - struct xfs_bui_log_item *buip) +STATIC int +xfs_bui_item_recover( + struct xfs_log_item *lip, + struct xfs_trans *parent_tp) { - int error = 0; - unsigned int bui_type; + struct xfs_bmbt_irec irec; + struct xfs_bui_log_item *buip = BUI_ITEM(lip); + struct xfs_trans *tp; + struct xfs_inode *ip = NULL; + struct xfs_mount *mp = parent_tp->t_mountp; struct xfs_map_extent *bmap; + struct xfs_bud_log_item *budp; xfs_fsblock_t startblock_fsb; xfs_fsblock_t inode_fsb; xfs_filblks_t count; - bool op_ok; - struct xfs_bud_log_item *budp; + xfs_exntst_t state; enum xfs_bmap_intent_type type; + bool op_ok; + unsigned int bui_type; int whichfork; - xfs_exntst_t state; - struct xfs_trans *tp; - struct xfs_inode *ip = NULL; - struct xfs_bmbt_irec irec; - struct xfs_mount *mp = parent_tp->t_mountp; - - ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); + int error = 0; /* Only one mapping operation per BUI... */ if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { - set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); xfs_bui_release(buip); return -EFSCORRUPTED; } @@ -488,7 +477,6 @@ xfs_bui_recover( * This will pull the BUI from the AIL and * free the memory associated with it. */ - set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); xfs_bui_release(buip); return -EFSCORRUPTED; } @@ -546,7 +534,6 @@ xfs_bui_recover( xfs_bmap_unmap_extent(tp, ip, &irec); } - set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); xfs_defer_move(parent_tp, tp); error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -563,3 +550,121 @@ err_inode: } return error; } + +STATIC bool +xfs_bui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return BUI_ITEM(lip)->bui_format.bui_id == intent_id; +} + +static const struct xfs_item_ops xfs_bui_item_ops = { + .iop_size = xfs_bui_item_size, + .iop_format = xfs_bui_item_format, + .iop_unpin = xfs_bui_item_unpin, + .iop_release = xfs_bui_item_release, + .iop_recover = xfs_bui_item_recover, + .iop_match = xfs_bui_item_match, +}; + +/* + * Copy an BUI format buffer from the given buf, and into the destination + * BUI format structure. The BUI/BUD items were designed not to need any + * special alignment handling. + */ +static int +xfs_bui_copy_format( + struct xfs_log_iovec *buf, + struct xfs_bui_log_format *dst_bui_fmt) +{ + struct xfs_bui_log_format *src_bui_fmt; + uint len; + + src_bui_fmt = buf->i_addr; + len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents); + + if (buf->i_len == len) { + memcpy(dst_bui_fmt, src_bui_fmt, len); + return 0; + } + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + return -EFSCORRUPTED; +} + +/* + * This routine is called to create an in-core extent bmap update + * item from the bui format structure which was logged on disk. + * It allocates an in-core bui, copies the extents from the format + * structure into it, and adds the bui to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_bui_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_bui_log_item *buip; + struct xfs_bui_log_format *bui_formatp; + + bui_formatp = item->ri_buf[0].i_addr; + + if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + buip = xfs_bui_init(mp); + error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format); + if (error) { + xfs_bui_item_free(buip); + return error; + } + atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); + /* + * Insert the intent into the AIL directly and drop one reference so + * that finishing or canceling the work will drop the other. + */ + xfs_trans_ail_insert(log->l_ailp, &buip->bui_item, lsn); + xfs_bui_release(buip); + return 0; +} + +const struct xlog_recover_item_ops xlog_bui_item_ops = { + .item_type = XFS_LI_BUI, + .commit_pass2 = xlog_recover_bui_commit_pass2, +}; + +/* + * This routine is called when an BUD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding BUI if it + * was still in the log. To do this it searches the AIL for the BUI with an id + * equal to that in the BUD format structure. If we find it we drop the BUD + * reference, which removes the BUI from the AIL and frees it. + */ +STATIC int +xlog_recover_bud_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_bud_log_format *bud_formatp; + + bud_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + + xlog_recover_release_intent(log, XFS_LI_BUI, bud_formatp->bud_bui_id); + return 0; +} + +const struct xlog_recover_item_ops xlog_bud_item_ops = { + .item_type = XFS_LI_BUD, + .commit_pass2 = xlog_recover_bud_commit_pass2, +}; diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h index ad479cc73de8..b9be62f8bd52 100644 --- a/fs/xfs/xfs_bmap_item.h +++ b/fs/xfs/xfs_bmap_item.h @@ -33,11 +33,6 @@ struct kmem_zone; #define XFS_BUI_MAX_FAST_EXTENTS 1 /* - * Define BUI flag bits. Manipulated by set/clear/test_bit operators. - */ -#define XFS_BUI_RECOVERED 1 - -/* * This is the "bmap update intent" log item. It is used to log the fact that * some reverse mappings need to change. It is used in conjunction with the * "bmap update done" log item described below. @@ -49,7 +44,6 @@ struct xfs_bui_log_item { struct xfs_log_item bui_item; atomic_t bui_refcount; atomic_t bui_next_extent; - unsigned long bui_flags; /* misc flags */ struct xfs_bui_log_format bui_format; }; @@ -74,9 +68,4 @@ struct xfs_bud_log_item { extern struct kmem_zone *xfs_bui_zone; extern struct kmem_zone *xfs_bud_zone; -struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *); -void xfs_bui_item_free(struct xfs_bui_log_item *); -void xfs_bui_release(struct xfs_bui_log_item *); -int xfs_bui_recover(struct xfs_trans *parent_tp, struct xfs_bui_log_item *buip); - #endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 4f800f7fe888..f37f5cc4b19f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -223,7 +223,7 @@ xfs_bmap_count_blocks( if (!ifp) return 0; - switch (XFS_IFORK_FORMAT(ip, whichfork)) { + switch (ifp->if_format) { case XFS_DINODE_FMT_BTREE: if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(tp, ip, whichfork); @@ -449,7 +449,7 @@ xfs_getbmap( break; } - switch (XFS_IFORK_FORMAT(ip, whichfork)) { + switch (ifp->if_format) { case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: break; @@ -1210,17 +1210,26 @@ xfs_swap_extents_check_format( struct xfs_inode *ip, /* target inode */ struct xfs_inode *tip) /* tmp inode */ { + struct xfs_ifork *ifp = &ip->i_df; + struct xfs_ifork *tifp = &tip->i_df; + + /* User/group/project quota ids must match if quotas are enforced. */ + if (XFS_IS_QUOTA_ON(ip->i_mount) && + (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) || + !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) || + ip->i_d.di_projid != tip->i_d.di_projid)) + return -EINVAL; /* Should never get a local format */ - if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || - tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) + if (ifp->if_format == XFS_DINODE_FMT_LOCAL || + tifp->if_format == XFS_DINODE_FMT_LOCAL) return -EINVAL; /* * if the target inode has less extents that then temporary inode then * why did userspace call us? */ - if (ip->i_d.di_nextents < tip->i_d.di_nextents) + if (ifp->if_nextents < tifp->if_nextents) return -EINVAL; /* @@ -1235,20 +1244,18 @@ xfs_swap_extents_check_format( * form then we will end up with the target inode in the wrong format * as we already know there are less extents in the temp inode. */ - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && + tifp->if_format == XFS_DINODE_FMT_BTREE) return -EINVAL; /* Check temp in extent form to max in target */ - if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > - XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + if (tifp->if_format == XFS_DINODE_FMT_EXTENTS && + tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) return -EINVAL; /* Check target in extent form to max in temp */ - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > - XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && + ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) return -EINVAL; /* @@ -1260,22 +1267,20 @@ xfs_swap_extents_check_format( * (a common defrag case) which will occur when the temp inode is in * extent format... */ - if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (tifp->if_format == XFS_DINODE_FMT_BTREE) { if (XFS_IFORK_Q(ip) && - XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip)) + XFS_BMAP_BMDR_SPACE(tifp->if_broot) > XFS_IFORK_BOFF(ip)) return -EINVAL; - if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= - XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) return -EINVAL; } /* Reciprocal target->temp btree format checks */ - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { if (XFS_IFORK_Q(tip) && XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip)) return -EINVAL; - if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= - XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) return -EINVAL; } @@ -1427,15 +1432,15 @@ xfs_swap_extent_forks( /* * Count the number of extended attribute blocks */ - if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && - (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + if (XFS_IFORK_Q(ip) && ip->i_afp->if_nextents > 0 && + ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk, &aforkblks); if (error) return error; } - if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && - (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + if (XFS_IFORK_Q(tip) && tip->i_afp->if_nextents > 0 && + tip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk, &taforkblks); if (error) @@ -1450,9 +1455,9 @@ xfs_swap_extent_forks( * bmbt scan as the last step. */ if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) + if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE) (*target_log_flags) |= XFS_ILOG_DOWNER; - if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE) (*src_log_flags) |= XFS_ILOG_DOWNER; } @@ -1468,9 +1473,6 @@ xfs_swap_extent_forks( ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks; tip->i_d.di_nblocks = tmp + taforkblks - aforkblks; - swap(ip->i_d.di_nextents, tip->i_d.di_nextents); - swap(ip->i_d.di_format, tip->i_d.di_format); - /* * The extents in the source inode could still contain speculative * preallocation beyond EOF (e.g. the file is open but not modified @@ -1484,7 +1486,7 @@ xfs_swap_extent_forks( tip->i_delayed_blks = ip->i_delayed_blks; ip->i_delayed_blks = 0; - switch (ip->i_d.di_format) { + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: (*src_log_flags) |= XFS_ILOG_DEXT; break; @@ -1495,7 +1497,7 @@ xfs_swap_extent_forks( break; } - switch (tip->i_d.di_format) { + switch (tip->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: (*target_log_flags) |= XFS_ILOG_DEXT; break; @@ -1606,7 +1608,7 @@ xfs_swap_extents( if (xfs_inode_has_cow_data(tip)) { error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true); if (error) - return error; + goto out_unlock; } /* @@ -1615,9 +1617,9 @@ xfs_swap_extents( * performed with log redo items! */ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { - int w = XFS_DATA_FORK; - uint32_t ipnext = XFS_IFORK_NEXTENTS(ip, w); - uint32_t tipnext = XFS_IFORK_NEXTENTS(tip, w); + int w = XFS_DATA_FORK; + uint32_t ipnext = ip->i_df.if_nextents; + uint32_t tipnext = tip->i_df.if_nextents; /* * Conceptually this shouldn't affect the shape of either bmbt, @@ -1717,10 +1719,11 @@ xfs_swap_extents( /* Swap the cow forks. */ if (xfs_sb_version_hasreflink(&mp->m_sb)) { - ASSERT(ip->i_cformat == XFS_DINODE_FMT_EXTENTS); - ASSERT(tip->i_cformat == XFS_DINODE_FMT_EXTENTS); + ASSERT(!ip->i_cowfp || + ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS); + ASSERT(!tip->i_cowfp || + tip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS); - swap(ip->i_cnextents, tip->i_cnextents); swap(ip->i_cowfp, tip->i_cowfp); if (ip->i_cowfp && ip->i_cowfp->if_bytes) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 65538d18e64f..20b748f7e186 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1197,8 +1197,10 @@ xfs_buf_ioend( bp->b_ops->verify_read(bp); } - if (!bp->b_error) + if (!bp->b_error) { + bp->b_flags &= ~XBF_WRITE_FAIL; bp->b_flags |= XBF_DONE; + } if (bp->b_iodone) (*(bp->b_iodone))(bp); @@ -1242,10 +1244,26 @@ xfs_buf_ioerror_alert( struct xfs_buf *bp, xfs_failaddr_t func) { - xfs_alert_ratelimited(bp->b_mount, -"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", - func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length, - -bp->b_error); + xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error", + "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", + func, (uint64_t)XFS_BUF_ADDR(bp), + bp->b_length, -bp->b_error); +} + +/* + * To simulate an I/O failure, the buffer must be locked and held with at least + * three references. The LRU reference is dropped by the stale call. The buf + * item reference is dropped via ioend processing. The third reference is owned + * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC. + */ +void +xfs_buf_ioend_fail( + struct xfs_buf *bp) +{ + bp->b_flags &= ~XBF_DONE; + xfs_buf_stale(bp); + xfs_buf_ioerror(bp, -EIO); + xfs_buf_ioend(bp); } int @@ -1258,7 +1276,7 @@ xfs_bwrite( bp->b_flags |= XBF_WRITE; bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | - XBF_WRITE_FAIL | XBF_DONE); + XBF_DONE); error = xfs_buf_submit(bp); if (error) @@ -1272,6 +1290,11 @@ xfs_buf_bio_end_io( { struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private; + if (!bio->bi_status && + (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && + XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) + bio->bi_status = BLK_STS_IOERR; + /* * don't overwrite existing errors - otherwise we can lose errors on * buffers that require multiple bios to complete. @@ -1480,10 +1503,7 @@ __xfs_buf_submit( /* on shutdown we stale and complete the buffer immediately */ if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { - xfs_buf_ioerror(bp, -EIO); - bp->b_flags &= ~XBF_DONE; - xfs_buf_stale(bp); - xfs_buf_ioend(bp); + xfs_buf_ioend_fail(bp); return -EIO; } @@ -1642,7 +1662,8 @@ xfs_wait_buftarg( struct xfs_buftarg *btp) { LIST_HEAD(dispose); - int loop = 0; + int loop = 0; + bool write_fail = false; /* * First wait on the buftarg I/O count for all in-flight buffers to be @@ -1670,17 +1691,29 @@ xfs_wait_buftarg( bp = list_first_entry(&dispose, struct xfs_buf, b_lru); list_del_init(&bp->b_lru); if (bp->b_flags & XBF_WRITE_FAIL) { - xfs_alert(btp->bt_mount, + write_fail = true; + xfs_buf_alert_ratelimited(bp, + "XFS: Corruption Alert", "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!", (long long)bp->b_bn); - xfs_alert(btp->bt_mount, -"Please run xfs_repair to determine the extent of the problem."); } xfs_buf_rele(bp); } if (loop++ != 0) delay(100); } + + /* + * If one or more failed buffers were freed, that means dirty metadata + * was thrown away. This should only ever happen after I/O completion + * handling has elevated I/O error(s) to permanent failures and shuts + * down the fs. + */ + if (write_fail) { + ASSERT(XFS_FORCED_SHUTDOWN(btp->bt_mount)); + xfs_alert(btp->bt_mount, + "Please run xfs_repair to determine the extent of the problem."); + } } static enum lru_status @@ -1813,6 +1846,13 @@ xfs_alloc_buftarg( btp->bt_bdev = bdev; btp->bt_daxdev = dax_dev; + /* + * Buffer IO error rate limiting. Limit it to no more than 10 messages + * per 30 seconds so as to not spam logs too much on repeated errors. + */ + ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, + DEFAULT_RATELIMIT_BURST); + if (xfs_setsize_buftarg_early(btp, bdev)) goto error_free; @@ -1983,7 +2023,7 @@ xfs_buf_delwri_submit_buffers( * synchronously. Otherwise, drop the buffer from the delwri * queue and submit async. */ - bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL); + bp->b_flags &= ~_XBF_DELWRI_Q; bp->b_flags |= XBF_WRITE; if (wait_list) { bp->b_flags &= ~XBF_ASYNC; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 9a04c53c2488..050c53b739e2 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -91,6 +91,7 @@ typedef struct xfs_buftarg { struct list_lru bt_lru; struct percpu_counter bt_io_count; + struct ratelimit_state bt_ioerror_rl; } xfs_buftarg_t; struct xfs_buf; @@ -263,6 +264,7 @@ extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa); +void xfs_buf_ioend_fail(struct xfs_buf *); extern int __xfs_buf_submit(struct xfs_buf *bp, bool); static inline int xfs_buf_submit(struct xfs_buf *bp) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 1545657c3ca0..9e75e8d6042e 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -410,7 +410,6 @@ xfs_buf_item_unpin( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); xfs_buf_t *bp = bip->bli_buf; - struct xfs_ail *ailp = lip->li_ailp; int stale = bip->bli_flags & XFS_BLI_STALE; int freed; @@ -452,10 +451,10 @@ xfs_buf_item_unpin( } /* - * If we get called here because of an IO error, we may - * or may not have the item on the AIL. xfs_trans_ail_delete() - * will take care of that situation. - * xfs_trans_ail_delete() drops the AIL lock. + * If we get called here because of an IO error, we may or may + * not have the item on the AIL. xfs_trans_ail_delete() will + * take care of that situation. xfs_trans_ail_delete() drops + * the AIL lock. */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { xfs_buf_do_callbacks(bp); @@ -463,47 +462,23 @@ xfs_buf_item_unpin( list_del_init(&bp->b_li_list); bp->b_iodone = NULL; } else { - spin_lock(&ailp->ail_lock); - xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); ASSERT(bp->b_log_item == NULL); } xfs_buf_relse(bp); } else if (freed && remove) { /* - * There are currently two references to the buffer - the active - * LRU reference and the buf log item. What we are about to do - * here - simulate a failed IO completion - requires 3 - * references. - * - * The LRU reference is removed by the xfs_buf_stale() call. The - * buf item reference is removed by the xfs_buf_iodone() - * callback that is run by xfs_buf_do_callbacks() during ioend - * processing (via the bp->b_iodone callback), and then finally - * the ioend processing will drop the IO reference if the buffer - * is marked XBF_ASYNC. - * - * Hence we need to take an additional reference here so that IO - * completion processing doesn't free the buffer prematurely. + * The buffer must be locked and held by the caller to simulate + * an async I/O failure. */ xfs_buf_lock(bp); xfs_buf_hold(bp); bp->b_flags |= XBF_ASYNC; - xfs_buf_ioerror(bp, -EIO); - bp->b_flags &= ~XBF_DONE; - xfs_buf_stale(bp); - xfs_buf_ioend(bp); + xfs_buf_ioend_fail(bp); } } -/* - * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30 - * seconds so as to not spam logs too much on repeated detection of the same - * buffer being bad.. - */ - -static DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10); - STATIC uint xfs_buf_item_push( struct xfs_log_item *lip, @@ -533,11 +508,10 @@ xfs_buf_item_push( trace_xfs_buf_item_push(bip); /* has a previous flush failed due to IO errors? */ - if ((bp->b_flags & XBF_WRITE_FAIL) && - ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) { - xfs_warn(bp->b_mount, -"Failing async write on buffer block 0x%llx. Retrying async write.", - (long long)bp->b_bn); + if (bp->b_flags & XBF_WRITE_FAIL) { + xfs_buf_alert_ratelimited(bp, "XFS: Failing async write", + "Failing async write on buffer block 0x%llx. Retrying async write.", + (long long)bp->b_bn); } if (!xfs_buf_delwri_queue(bp, buffer_list)) @@ -584,7 +558,7 @@ xfs_buf_item_put( * state. */ if (aborted) - xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(lip, 0); xfs_buf_item_relse(bip->bli_buf); return true; } @@ -1229,61 +1203,19 @@ xfs_buf_iodone( struct xfs_buf *bp, struct xfs_log_item *lip) { - struct xfs_ail *ailp = lip->li_ailp; - ASSERT(BUF_ITEM(lip)->bli_buf == bp); xfs_buf_rele(bp); /* - * If we are forcibly shutting down, this may well be - * off the AIL already. That's because we simulate the - * log-committed callbacks to unpin these buffers. Or we may never - * have put this item on AIL because of the transaction was - * aborted forcibly. xfs_trans_ail_delete() takes care of these. + * If we are forcibly shutting down, this may well be off the AIL + * already. That's because we simulate the log-committed callbacks to + * unpin these buffers. Or we may never have put this item on AIL + * because of the transaction was aborted forcibly. + * xfs_trans_ail_delete() takes care of these. * * Either way, AIL is useless if we're forcing a shutdown. */ - spin_lock(&ailp->ail_lock); - xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); + xfs_trans_ail_delete(lip, SHUTDOWN_CORRUPT_INCORE); xfs_buf_item_free(BUF_ITEM(lip)); } - -/* - * Requeue a failed buffer for writeback. - * - * We clear the log item failed state here as well, but we have to be careful - * about reference counts because the only active reference counts on the buffer - * may be the failed log items. Hence if we clear the log item failed state - * before queuing the buffer for IO we can release all active references to - * the buffer and free it, leading to use after free problems in - * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which - * order we process them in - the buffer is locked, and we own the buffer list - * so nothing on them is going to change while we are performing this action. - * - * Hence we can safely queue the buffer for IO before we clear the failed log - * item state, therefore always having an active reference to the buffer and - * avoiding the transient zero-reference state that leads to use-after-free. - * - * Return true if the buffer was added to the buffer list, false if it was - * already on the buffer list. - */ -bool -xfs_buf_resubmit_failed_buffers( - struct xfs_buf *bp, - struct list_head *buffer_list) -{ - struct xfs_log_item *lip; - bool ret; - - ret = xfs_buf_delwri_queue(bp, buffer_list); - - /* - * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this - * function already have it acquired - */ - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) - xfs_clear_li_failed(lip); - - return ret; -} diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 30114b510332..c9c57e2da932 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -59,8 +59,6 @@ void xfs_buf_attach_iodone(struct xfs_buf *, struct xfs_log_item *); void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); -bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, - struct list_head *); bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); extern kmem_zone_t *xfs_buf_item_zone; diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c new file mode 100644 index 000000000000..04faa7310c4f --- /dev/null +++ b/fs/xfs/xfs_buf_item_recover.c @@ -0,0 +1,984 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" +#include "xfs_error.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_quota.h" + +/* + * This structure is used during recovery to record the buf log items which + * have been canceled and should not be replayed. + */ +struct xfs_buf_cancel { + xfs_daddr_t bc_blkno; + uint bc_len; + int bc_refcount; + struct list_head bc_list; +}; + +static struct xfs_buf_cancel * +xlog_find_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len) +{ + struct list_head *bucket; + struct xfs_buf_cancel *bcp; + + if (!log->l_buf_cancel_table) + return NULL; + + bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == blkno && bcp->bc_len == len) + return bcp; + } + + return NULL; +} + +static bool +xlog_add_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len) +{ + struct xfs_buf_cancel *bcp; + + /* + * If we find an existing cancel record, this indicates that the buffer + * was cancelled multiple times. To ensure that during pass 2 we keep + * the record in the table until we reach its last occurrence in the + * log, a reference count is kept to tell how many times we expect to + * see this record during the second pass. + */ + bcp = xlog_find_buffer_cancelled(log, blkno, len); + if (bcp) { + bcp->bc_refcount++; + return false; + } + + bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0); + bcp->bc_blkno = blkno; + bcp->bc_len = len; + bcp->bc_refcount = 1; + list_add_tail(&bcp->bc_list, XLOG_BUF_CANCEL_BUCKET(log, blkno)); + return true; +} + +/* + * Check if there is and entry for blkno, len in the buffer cancel record table. + */ +bool +xlog_is_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len) +{ + return xlog_find_buffer_cancelled(log, blkno, len) != NULL; +} + +/* + * Check if there is and entry for blkno, len in the buffer cancel record table, + * and decremented the reference count on it if there is one. + * + * Remove the cancel record once the refcount hits zero, so that if the same + * buffer is re-used again after its last cancellation we actually replay the + * changes made at that point. + */ +static bool +xlog_put_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len) +{ + struct xfs_buf_cancel *bcp; + + bcp = xlog_find_buffer_cancelled(log, blkno, len); + if (!bcp) { + ASSERT(0); + return false; + } + + if (--bcp->bc_refcount == 0) { + list_del(&bcp->bc_list); + kmem_free(bcp); + } + return true; +} + +/* log buffer item recovery */ + +/* + * Sort buffer items for log recovery. Most buffer items should end up on the + * buffer list and are recovered first, with the following exceptions: + * + * 1. XFS_BLF_CANCEL buffers must be processed last because some log items + * might depend on the incor ecancellation record, and replaying a cancelled + * buffer item can remove the incore record. + * + * 2. XFS_BLF_INODE_BUF buffers are handled after most regular items so that + * we replay di_next_unlinked only after flushing the inode 'free' state + * to the inode buffer. + * + * See xlog_recover_reorder_trans for more details. + */ +STATIC enum xlog_recover_reorder +xlog_recover_buf_reorder( + struct xlog_recover_item *item) +{ + struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + + if (buf_f->blf_flags & XFS_BLF_CANCEL) + return XLOG_REORDER_CANCEL_LIST; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) + return XLOG_REORDER_INODE_BUFFER_LIST; + return XLOG_REORDER_BUFFER_LIST; +} + +STATIC void +xlog_recover_buf_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + + xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL); +} + +/* + * Build up the table of buf cancel records so that we don't replay cancelled + * data in the second pass. + */ +static int +xlog_recover_buf_commit_pass1( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_buf_log_format *bf = item->ri_buf[0].i_addr; + + if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { + xfs_err(log->l_mp, "bad buffer log item size (%d)", + item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + if (!(bf->blf_flags & XFS_BLF_CANCEL)) + trace_xfs_log_recover_buf_not_cancel(log, bf); + else if (xlog_add_buffer_cancelled(log, bf->blf_blkno, bf->blf_len)) + trace_xfs_log_recover_buf_cancel_add(log, bf); + else + trace_xfs_log_recover_buf_cancel_ref_inc(log, bf); + return 0; +} + +/* + * Validate the recovered buffer is of the correct type and attach the + * appropriate buffer operations to them for writeback. Magic numbers are in a + * few places: + * the first 16 bits of the buffer (inode buffer, dquot buffer), + * the first 32 bits of the buffer (most blocks), + * inside a struct xfs_da_blkinfo at the start of the buffer. + */ +static void +xlog_recover_validate_buf_type( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f, + xfs_lsn_t current_lsn) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + uint32_t magic32; + uint16_t magic16; + uint16_t magicda; + char *warnmsg = NULL; + + /* + * We can only do post recovery validation on items on CRC enabled + * fielsystems as we need to know when the buffer was written to be able + * to determine if we should have replayed the item. If we replay old + * metadata over a newer buffer, then it will enter a temporarily + * inconsistent state resulting in verification failures. Hence for now + * just avoid the verification stage for non-crc filesystems + */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); + magic16 = be16_to_cpu(*(__be16*)bp->b_addr); + magicda = be16_to_cpu(info->magic); + switch (xfs_blft_from_flags(buf_f)) { + case XFS_BLFT_BTREE_BUF: + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTB_MAGIC: + bp->b_ops = &xfs_bnobt_buf_ops; + break; + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTC_MAGIC: + bp->b_ops = &xfs_cntbt_buf_ops; + break; + case XFS_IBT_CRC_MAGIC: + case XFS_IBT_MAGIC: + bp->b_ops = &xfs_inobt_buf_ops; + break; + case XFS_FIBT_CRC_MAGIC: + case XFS_FIBT_MAGIC: + bp->b_ops = &xfs_finobt_buf_ops; + break; + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: + bp->b_ops = &xfs_bmbt_buf_ops; + break; + case XFS_RMAP_CRC_MAGIC: + bp->b_ops = &xfs_rmapbt_buf_ops; + break; + case XFS_REFC_CRC_MAGIC: + bp->b_ops = &xfs_refcountbt_buf_ops; + break; + default: + warnmsg = "Bad btree block magic!"; + break; + } + break; + case XFS_BLFT_AGF_BUF: + if (magic32 != XFS_AGF_MAGIC) { + warnmsg = "Bad AGF block magic!"; + break; + } + bp->b_ops = &xfs_agf_buf_ops; + break; + case XFS_BLFT_AGFL_BUF: + if (magic32 != XFS_AGFL_MAGIC) { + warnmsg = "Bad AGFL block magic!"; + break; + } + bp->b_ops = &xfs_agfl_buf_ops; + break; + case XFS_BLFT_AGI_BUF: + if (magic32 != XFS_AGI_MAGIC) { + warnmsg = "Bad AGI block magic!"; + break; + } + bp->b_ops = &xfs_agi_buf_ops; + break; + case XFS_BLFT_UDQUOT_BUF: + case XFS_BLFT_PDQUOT_BUF: + case XFS_BLFT_GDQUOT_BUF: +#ifdef CONFIG_XFS_QUOTA + if (magic16 != XFS_DQUOT_MAGIC) { + warnmsg = "Bad DQUOT block magic!"; + break; + } + bp->b_ops = &xfs_dquot_buf_ops; +#else + xfs_alert(mp, + "Trying to recover dquots without QUOTA support built in!"); + ASSERT(0); +#endif + break; + case XFS_BLFT_DINO_BUF: + if (magic16 != XFS_DINODE_MAGIC) { + warnmsg = "Bad INODE block magic!"; + break; + } + bp->b_ops = &xfs_inode_buf_ops; + break; + case XFS_BLFT_SYMLINK_BUF: + if (magic32 != XFS_SYMLINK_MAGIC) { + warnmsg = "Bad symlink block magic!"; + break; + } + bp->b_ops = &xfs_symlink_buf_ops; + break; + case XFS_BLFT_DIR_BLOCK_BUF: + if (magic32 != XFS_DIR2_BLOCK_MAGIC && + magic32 != XFS_DIR3_BLOCK_MAGIC) { + warnmsg = "Bad dir block magic!"; + break; + } + bp->b_ops = &xfs_dir3_block_buf_ops; + break; + case XFS_BLFT_DIR_DATA_BUF: + if (magic32 != XFS_DIR2_DATA_MAGIC && + magic32 != XFS_DIR3_DATA_MAGIC) { + warnmsg = "Bad dir data magic!"; + break; + } + bp->b_ops = &xfs_dir3_data_buf_ops; + break; + case XFS_BLFT_DIR_FREE_BUF: + if (magic32 != XFS_DIR2_FREE_MAGIC && + magic32 != XFS_DIR3_FREE_MAGIC) { + warnmsg = "Bad dir3 free magic!"; + break; + } + bp->b_ops = &xfs_dir3_free_buf_ops; + break; + case XFS_BLFT_DIR_LEAF1_BUF: + if (magicda != XFS_DIR2_LEAF1_MAGIC && + magicda != XFS_DIR3_LEAF1_MAGIC) { + warnmsg = "Bad dir leaf1 magic!"; + break; + } + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + break; + case XFS_BLFT_DIR_LEAFN_BUF: + if (magicda != XFS_DIR2_LEAFN_MAGIC && + magicda != XFS_DIR3_LEAFN_MAGIC) { + warnmsg = "Bad dir leafn magic!"; + break; + } + bp->b_ops = &xfs_dir3_leafn_buf_ops; + break; + case XFS_BLFT_DA_NODE_BUF: + if (magicda != XFS_DA_NODE_MAGIC && + magicda != XFS_DA3_NODE_MAGIC) { + warnmsg = "Bad da node magic!"; + break; + } + bp->b_ops = &xfs_da3_node_buf_ops; + break; + case XFS_BLFT_ATTR_LEAF_BUF: + if (magicda != XFS_ATTR_LEAF_MAGIC && + magicda != XFS_ATTR3_LEAF_MAGIC) { + warnmsg = "Bad attr leaf magic!"; + break; + } + bp->b_ops = &xfs_attr3_leaf_buf_ops; + break; + case XFS_BLFT_ATTR_RMT_BUF: + if (magic32 != XFS_ATTR3_RMT_MAGIC) { + warnmsg = "Bad attr remote magic!"; + break; + } + bp->b_ops = &xfs_attr3_rmt_buf_ops; + break; + case XFS_BLFT_SB_BUF: + if (magic32 != XFS_SB_MAGIC) { + warnmsg = "Bad SB block magic!"; + break; + } + bp->b_ops = &xfs_sb_buf_ops; + break; +#ifdef CONFIG_XFS_RT + case XFS_BLFT_RTBITMAP_BUF: + case XFS_BLFT_RTSUMMARY_BUF: + /* no magic numbers for verification of RT buffers */ + bp->b_ops = &xfs_rtbuf_ops; + break; +#endif /* CONFIG_XFS_RT */ + default: + xfs_warn(mp, "Unknown buffer type %d!", + xfs_blft_from_flags(buf_f)); + break; + } + + /* + * Nothing else to do in the case of a NULL current LSN as this means + * the buffer is more recent than the change in the log and will be + * skipped. + */ + if (current_lsn == NULLCOMMITLSN) + return; + + if (warnmsg) { + xfs_warn(mp, warnmsg); + ASSERT(0); + } + + /* + * We must update the metadata LSN of the buffer as it is written out to + * ensure that older transactions never replay over this one and corrupt + * the buffer. This can occur if log recovery is interrupted at some + * point after the current transaction completes, at which point a + * subsequent mount starts recovery from the beginning. + * + * Write verifiers update the metadata LSN from log items attached to + * the buffer. Therefore, initialize a bli purely to carry the LSN to + * the verifier. We'll clean it up in our ->iodone() callback. + */ + if (bp->b_ops) { + struct xfs_buf_log_item *bip; + + ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_item_init(bp, mp); + bip = bp->b_log_item; + bip->bli_item.li_lsn = current_lsn; + } +} + +/* + * Perform a 'normal' buffer recovery. Each logged region of the + * buffer should be copied over the corresponding region in the + * given buffer. The bitmap in the buf log format structure indicates + * where to place the logged data. + */ +STATIC void +xlog_recover_do_reg_buffer( + struct xfs_mount *mp, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f, + xfs_lsn_t current_lsn) +{ + int i; + int bit; + int nbits; + xfs_failaddr_t fa; + const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot); + + trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); + + bit = 0; + i = 1; /* 0 is the buf format structure */ + while (1) { + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + if (bit == -1) + break; + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + ASSERT(nbits > 0); + ASSERT(item->ri_buf[i].i_addr != NULL); + ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); + ASSERT(BBTOB(bp->b_length) >= + ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); + + /* + * The dirty regions logged in the buffer, even though + * contiguous, may span multiple chunks. This is because the + * dirty region may span a physical page boundary in a buffer + * and hence be split into two separate vectors for writing into + * the log. Hence we need to trim nbits back to the length of + * the current region being copied out of the log. + */ + if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) + nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; + + /* + * Do a sanity check if this is a dquot buffer. Just checking + * the first dquot in the buffer should do. XXXThis is + * probably a good thing to do for other buf types also. + */ + fa = NULL; + if (buf_f->blf_flags & + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { + if (item->ri_buf[i].i_addr == NULL) { + xfs_alert(mp, + "XFS: NULL dquot in %s.", __func__); + goto next; + } + if (item->ri_buf[i].i_len < size_disk_dquot) { + xfs_alert(mp, + "XFS: dquot too small (%d) in %s.", + item->ri_buf[i].i_len, __func__); + goto next; + } + fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, + -1, 0); + if (fa) { + xfs_alert(mp, + "dquot corrupt at %pS trying to replay into block 0x%llx", + fa, bp->b_bn); + goto next; + } + } + + memcpy(xfs_buf_offset(bp, + (uint)bit << XFS_BLF_SHIFT), /* dest */ + item->ri_buf[i].i_addr, /* source */ + nbits<<XFS_BLF_SHIFT); /* length */ + next: + i++; + bit += nbits; + } + + /* Shouldn't be any more regions */ + ASSERT(i == item->ri_total); + + xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn); +} + +/* + * Perform a dquot buffer recovery. + * Simple algorithm: if we have found a QUOTAOFF log item of the same type + * (ie. USR or GRP), then just toss this buffer away; don't recover it. + * Else, treat it as a regular buffer and do recovery. + * + * Return false if the buffer was tossed and true if we recovered the buffer to + * indicate to the caller if the buffer needs writing. + */ +STATIC bool +xlog_recover_do_dquot_buffer( + struct xfs_mount *mp, + struct xlog *log, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f) +{ + uint type; + + trace_xfs_log_recover_buf_dquot_buf(log, buf_f); + + /* + * Filesystems are required to send in quota flags at mount time. + */ + if (!mp->m_qflags) + return false; + + type = 0; + if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) + type |= XFS_DQ_USER; + if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) + type |= XFS_DQ_PROJ; + if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) + type |= XFS_DQ_GROUP; + /* + * This type of quotas was turned off, so ignore this buffer + */ + if (log->l_quotaoffs_flag & type) + return false; + + xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN); + return true; +} + +/* + * Perform recovery for a buffer full of inodes. In these buffers, the only + * data which should be recovered is that which corresponds to the + * di_next_unlinked pointers in the on disk inode structures. The rest of the + * data for the inodes is always logged through the inodes themselves rather + * than the inode buffer and is recovered in xlog_recover_inode_pass2(). + * + * The only time when buffers full of inodes are fully recovered is when the + * buffer is full of newly allocated inodes. In this case the buffer will + * not be marked as an inode buffer and so will be sent to + * xlog_recover_do_reg_buffer() below during recovery. + */ +STATIC int +xlog_recover_do_inode_buffer( + struct xfs_mount *mp, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f) +{ + int i; + int item_index = 0; + int bit = 0; + int nbits = 0; + int reg_buf_offset = 0; + int reg_buf_bytes = 0; + int next_unlinked_offset; + int inodes_per_buf; + xfs_agino_t *logged_nextp; + xfs_agino_t *buffer_nextp; + + trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); + + /* + * Post recovery validation only works properly on CRC enabled + * filesystems. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + bp->b_ops = &xfs_inode_buf_ops; + + inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog; + for (i = 0; i < inodes_per_buf; i++) { + next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + + offsetof(xfs_dinode_t, di_next_unlinked); + + while (next_unlinked_offset >= + (reg_buf_offset + reg_buf_bytes)) { + /* + * The next di_next_unlinked field is beyond + * the current logged region. Find the next + * logged region that contains or is beyond + * the current di_next_unlinked field. + */ + bit += nbits; + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + + /* + * If there are no more logged regions in the + * buffer, then we're done. + */ + if (bit == -1) + return 0; + + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + ASSERT(nbits > 0); + reg_buf_offset = bit << XFS_BLF_SHIFT; + reg_buf_bytes = nbits << XFS_BLF_SHIFT; + item_index++; + } + + /* + * If the current logged region starts after the current + * di_next_unlinked field, then move on to the next + * di_next_unlinked field. + */ + if (next_unlinked_offset < reg_buf_offset) + continue; + + ASSERT(item->ri_buf[item_index].i_addr != NULL); + ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); + ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length)); + + /* + * The current logged region contains a copy of the + * current di_next_unlinked field. Extract its value + * and copy it to the buffer copy. + */ + logged_nextp = item->ri_buf[item_index].i_addr + + next_unlinked_offset - reg_buf_offset; + if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) { + xfs_alert(mp, + "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). " + "Trying to replay bad (0) inode di_next_unlinked field.", + item, bp); + return -EFSCORRUPTED; + } + + buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset); + *buffer_nextp = *logged_nextp; + + /* + * If necessary, recalculate the CRC in the on-disk inode. We + * have to leave the inode in a consistent state for whoever + * reads it next.... + */ + xfs_dinode_calc_crc(mp, + xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); + + } + + return 0; +} + +/* + * V5 filesystems know the age of the buffer on disk being recovered. We can + * have newer objects on disk than we are replaying, and so for these cases we + * don't want to replay the current change as that will make the buffer contents + * temporarily invalid on disk. + * + * The magic number might not match the buffer type we are going to recover + * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence + * extract the LSN of the existing object in the buffer based on it's current + * magic number. If we don't recognise the magic number in the buffer, then + * return a LSN of -1 so that the caller knows it was an unrecognised block and + * so can recover the buffer. + * + * Note: we cannot rely solely on magic number matches to determine that the + * buffer has a valid LSN - we also need to verify that it belongs to this + * filesystem, so we need to extract the object's LSN and compare it to that + * which we read from the superblock. If the UUIDs don't match, then we've got a + * stale metadata block from an old filesystem instance that we need to recover + * over the top of. + */ +static xfs_lsn_t +xlog_recover_get_buf_lsn( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + uint32_t magic32; + uint16_t magic16; + uint16_t magicda; + void *blk = bp->b_addr; + uuid_t *uuid; + xfs_lsn_t lsn = -1; + + /* v4 filesystems always recover immediately */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + goto recover_immediately; + + magic32 = be32_to_cpu(*(__be32 *)blk); + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + case XFS_RMAP_CRC_MAGIC: + case XFS_REFC_CRC_MAGIC: + case XFS_IBT_CRC_MAGIC: + case XFS_IBT_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); + uuid = &btb->bb_u.s.bb_uuid; + break; + } + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); + uuid = &btb->bb_u.l.bb_uuid; + break; + } + case XFS_AGF_MAGIC: + lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); + uuid = &((struct xfs_agf *)blk)->agf_uuid; + break; + case XFS_AGFL_MAGIC: + lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); + uuid = &((struct xfs_agfl *)blk)->agfl_uuid; + break; + case XFS_AGI_MAGIC: + lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); + uuid = &((struct xfs_agi *)blk)->agi_uuid; + break; + case XFS_SYMLINK_MAGIC: + lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); + uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; + break; + case XFS_DIR3_BLOCK_MAGIC: + case XFS_DIR3_DATA_MAGIC: + case XFS_DIR3_FREE_MAGIC: + lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); + uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; + break; + case XFS_ATTR3_RMT_MAGIC: + /* + * Remote attr blocks are written synchronously, rather than + * being logged. That means they do not contain a valid LSN + * (i.e. transactionally ordered) in them, and hence any time we + * see a buffer to replay over the top of a remote attribute + * block we should simply do so. + */ + goto recover_immediately; + case XFS_SB_MAGIC: + /* + * superblock uuids are magic. We may or may not have a + * sb_meta_uuid on disk, but it will be set in the in-core + * superblock. We set the uuid pointer for verification + * according to the superblock feature mask to ensure we check + * the relevant UUID in the superblock. + */ + lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); + if (xfs_sb_version_hasmetauuid(&mp->m_sb)) + uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid; + else + uuid = &((struct xfs_dsb *)blk)->sb_uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); + switch (magicda) { + case XFS_DIR3_LEAF1_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + case XFS_DA3_NODE_MAGIC: + lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); + uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + /* + * We do individual object checks on dquot and inode buffers as they + * have their own individual LSN records. Also, we could have a stale + * buffer here, so we have to at least recognise these buffer types. + * + * A notd complexity here is inode unlinked list processing - it logs + * the inode directly in the buffer, but we don't know which inodes have + * been modified, and there is no global buffer LSN. Hence we need to + * recover all inode buffer types immediately. This problem will be + * fixed by logical logging of the unlinked list modifications. + */ + magic16 = be16_to_cpu(*(__be16 *)blk); + switch (magic16) { + case XFS_DQUOT_MAGIC: + case XFS_DINODE_MAGIC: + goto recover_immediately; + default: + break; + } + + /* unknown buffer contents, recover immediately */ + +recover_immediately: + return (xfs_lsn_t)-1; + +} + +/* + * This routine replays a modification made to a buffer at runtime. + * There are actually two types of buffer, regular and inode, which + * are handled differently. Inode buffers are handled differently + * in that we only recover a specific set of data from them, namely + * the inode di_next_unlinked fields. This is because all other inode + * data is actually logged via inode records and any data we replay + * here which overlaps that may be stale. + * + * When meta-data buffers are freed at run time we log a buffer item + * with the XFS_BLF_CANCEL bit set to indicate that previous copies + * of the buffer in the log should not be replayed at recovery time. + * This is so that if the blocks covered by the buffer are reused for + * file data before we crash we don't end up replaying old, freed + * meta-data into a user's file. + * + * To handle the cancellation of buffer log items, we make two passes + * over the log during recovery. During the first we build a table of + * those buffers which have been cancelled, and during the second we + * only replay those buffers which do not have corresponding cancel + * records in the table. See xlog_recover_buf_pass[1,2] above + * for more details on the implementation of the table of cancel records. + */ +STATIC int +xlog_recover_buf_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) +{ + struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_mount *mp = log->l_mp; + struct xfs_buf *bp; + int error; + uint buf_flags; + xfs_lsn_t lsn; + + /* + * In this pass we only want to recover all the buffers which have + * not been cancelled and are not cancellation buffers themselves. + */ + if (buf_f->blf_flags & XFS_BLF_CANCEL) { + if (xlog_put_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len)) + goto cancelled; + } else { + + if (xlog_is_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len)) + goto cancelled; + } + + trace_xfs_log_recover_buf_recover(log, buf_f); + + buf_flags = 0; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) + buf_flags |= XBF_UNMAPPED; + + error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, + buf_flags, &bp, NULL); + if (error) + return error; + + /* + * Recover the buffer only if we get an LSN from it and it's less than + * the lsn of the transaction we are replaying. + * + * Note that we have to be extremely careful of readahead here. + * Readahead does not attach verfiers to the buffers so if we don't + * actually do any replay after readahead because of the LSN we found + * in the buffer if more recent than that current transaction then we + * need to attach the verifier directly. Failure to do so can lead to + * future recovery actions (e.g. EFI and unlinked list recovery) can + * operate on the buffers and they won't get the verifier attached. This + * can lead to blocks on disk having the correct content but a stale + * CRC. + * + * It is safe to assume these clean buffers are currently up to date. + * If the buffer is dirtied by a later transaction being replayed, then + * the verifier will be reset to match whatever recover turns that + * buffer into. + */ + lsn = xlog_recover_get_buf_lsn(mp, bp); + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + trace_xfs_log_recover_buf_skip(log, buf_f); + xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); + goto out_release; + } + + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); + if (error) + goto out_release; + } else if (buf_f->blf_flags & + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { + bool dirty; + + dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); + if (!dirty) + goto out_release; + } else { + xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); + } + + /* + * Perform delayed write on the buffer. Asynchronous writes will be + * slower when taking into account all the buffers to be flushed. + * + * Also make sure that only inode buffers with good sizes stay in + * the buffer cache. The kernel moves inodes in buffers of 1 block + * or inode_cluster_size bytes, whichever is bigger. The inode + * buffers in the log can be a different size if the log was generated + * by an older kernel using unclustered inode buffers or a newer kernel + * running with a different inode cluster size. Regardless, if the + * the inode buffer size isn't max(blocksize, inode_cluster_size) + * for *our* value of inode_cluster_size, then we need to keep + * the buffer out of the buffer cache so that the buffer won't + * overlap with future reads of those inodes. + */ + if (XFS_DINODE_MAGIC == + be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && + (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) { + xfs_buf_stale(bp); + error = xfs_bwrite(bp); + } else { + ASSERT(bp->b_mount == mp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); + } + +out_release: + xfs_buf_relse(bp); + return error; +cancelled: + trace_xfs_log_recover_buf_cancel(log, buf_f); + return 0; +} + +const struct xlog_recover_item_ops xlog_buf_item_ops = { + .item_type = XFS_LI_BUF, + .reorder = xlog_recover_buf_reorder, + .ra_pass2 = xlog_recover_buf_ra_pass2, + .commit_pass1 = xlog_recover_buf_commit_pass1, + .commit_pass2 = xlog_recover_buf_commit_pass2, +}; diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 871ec22c9aee..66deddd5e296 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -524,7 +524,7 @@ xfs_readdir( args.geo = dp->i_mount->m_dir_geo; args.trans = tp; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_getdents(&args, ctx); else if ((rval = xfs_dir2_isblock(&args, &v))) ; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index af2c8e5ceea0..d5b7f03e93c8 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -75,7 +75,7 @@ xfs_qm_adjust_dqlimits( int prealloc = 0; ASSERT(d->d_id); - defq = xfs_get_defquota(dq, q); + defq = xfs_get_defquota(q, xfs_dquot_type(dq)); if (defq->bsoftlimit && !d->d_blk_softlimit) { d->d_blk_softlimit = cpu_to_be64(defq->bsoftlimit); @@ -114,9 +114,14 @@ xfs_qm_adjust_dqlimits( void xfs_qm_adjust_dqtimers( struct xfs_mount *mp, - struct xfs_disk_dquot *d) + struct xfs_dquot *dq) { + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct xfs_disk_dquot *d = &dq->q_core; + struct xfs_def_quota *defq; + ASSERT(d->d_id); + defq = xfs_get_defquota(qi, xfs_dquot_type(dq)); #ifdef DEBUG if (d->d_blk_hardlimit) @@ -138,7 +143,7 @@ xfs_qm_adjust_dqtimers( (be64_to_cpu(d->d_bcount) > be64_to_cpu(d->d_blk_hardlimit)))) { d->d_btimer = cpu_to_be32(ktime_get_real_seconds() + - mp->m_quotainfo->qi_btimelimit); + defq->btimelimit); } else { d->d_bwarns = 0; } @@ -161,7 +166,7 @@ xfs_qm_adjust_dqtimers( (be64_to_cpu(d->d_icount) > be64_to_cpu(d->d_ino_hardlimit)))) { d->d_itimer = cpu_to_be32(ktime_get_real_seconds() + - mp->m_quotainfo->qi_itimelimit); + defq->itimelimit); } else { d->d_iwarns = 0; } @@ -184,7 +189,7 @@ xfs_qm_adjust_dqtimers( (be64_to_cpu(d->d_rtbcount) > be64_to_cpu(d->d_rtb_hardlimit)))) { d->d_rtbtimer = cpu_to_be32(ktime_get_real_seconds() + - mp->m_quotainfo->qi_rtbtimelimit); + defq->rtbtimelimit); } else { d->d_rtbwarns = 0; } @@ -205,16 +210,18 @@ xfs_qm_adjust_dqtimers( */ STATIC void xfs_qm_init_dquot_blk( - xfs_trans_t *tp, - xfs_mount_t *mp, - xfs_dqid_t id, - uint type, - xfs_buf_t *bp) + struct xfs_trans *tp, + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + struct xfs_buf *bp) { struct xfs_quotainfo *q = mp->m_quotainfo; - xfs_dqblk_t *d; - xfs_dqid_t curid; - int i; + struct xfs_dqblk *d; + xfs_dqid_t curid; + unsigned int qflag; + unsigned int blftype; + int i; ASSERT(tp); ASSERT(xfs_buf_islocked(bp)); @@ -238,11 +245,39 @@ xfs_qm_init_dquot_blk( } } - xfs_trans_dquot_buf(tp, bp, - (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : - ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : - XFS_BLF_GDQUOT_BUF))); - xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); + if (type & XFS_DQ_USER) { + qflag = XFS_UQUOTA_CHKD; + blftype = XFS_BLF_UDQUOT_BUF; + } else if (type & XFS_DQ_PROJ) { + qflag = XFS_PQUOTA_CHKD; + blftype = XFS_BLF_PDQUOT_BUF; + } else { + qflag = XFS_GQUOTA_CHKD; + blftype = XFS_BLF_GDQUOT_BUF; + } + + xfs_trans_dquot_buf(tp, bp, blftype); + + /* + * quotacheck uses delayed writes to update all the dquots on disk in an + * efficient manner instead of logging the individual dquot changes as + * they are made. However if we log the buffer allocated here and crash + * after quotacheck while the logged initialisation is still in the + * active region of the log, log recovery can replay the dquot buffer + * initialisation over the top of the checked dquots and corrupt quota + * accounting. + * + * To avoid this problem, quotacheck cannot log the initialised buffer. + * We must still dirty the buffer and write it back before the + * allocation transaction clears the log. Therefore, mark the buffer as + * ordered instead of logging it directly. This is safe for quotacheck + * because it detects and repairs allocated but initialized dquot blocks + * in the quota inodes. + */ + if (!(mp->m_qflags & qflag)) + xfs_trans_ordered_buf(tp, bp); + else + xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } /* @@ -1021,6 +1056,7 @@ xfs_qm_dqflush_done( struct xfs_dq_logitem *qip = (struct xfs_dq_logitem *)lip; struct xfs_dquot *dqp = qip->qli_dquot; struct xfs_ail *ailp = lip->li_ailp; + xfs_lsn_t tail_lsn; /* * We only want to pull the item from the AIL if its @@ -1034,10 +1070,11 @@ xfs_qm_dqflush_done( ((lip->li_lsn == qip->qli_flush_lsn) || test_bit(XFS_LI_FAILED, &lip->li_flags))) { - /* xfs_trans_ail_delete() drops the AIL lock. */ spin_lock(&ailp->ail_lock); if (lip->li_lsn == qip->qli_flush_lsn) { - xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); + /* xfs_ail_update_finish() drops the AIL lock */ + tail_lsn = xfs_ail_delete_one(ailp, lip); + xfs_ail_update_finish(ailp, tail_lsn); } else { /* * Clear the failed state since we are about to drop the @@ -1068,6 +1105,7 @@ xfs_qm_dqflush( struct xfs_buf **bpp) { struct xfs_mount *mp = dqp->q_mount; + struct xfs_log_item *lip = &dqp->q_logitem.qli_item; struct xfs_buf *bp; struct xfs_dqblk *dqb; struct xfs_disk_dquot *ddqp; @@ -1084,31 +1122,15 @@ xfs_qm_dqflush( xfs_qm_dqunpin_wait(dqp); /* - * This may have been unpinned because the filesystem is shutting - * down forcibly. If that's the case we must not write this dquot - * to disk, because the log record didn't make it to disk. - * - * We also have to remove the log item from the AIL in this case, - * as we wait for an emptry AIL as part of the unmount process. - */ - if (XFS_FORCED_SHUTDOWN(mp)) { - struct xfs_log_item *lip = &dqp->q_logitem.qli_item; - dqp->dq_flags &= ~XFS_DQ_DIRTY; - - xfs_trans_ail_remove(lip, SHUTDOWN_CORRUPT_INCORE); - - error = -EIO; - goto out_unlock; - } - - /* * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK, &bp, &xfs_dquot_buf_ops); - if (error) + if (error == -EAGAIN) goto out_unlock; + if (error) + goto out_abort; /* * Calculate the location of the dquot inside the buffer. @@ -1116,17 +1138,15 @@ xfs_qm_dqflush( dqb = bp->b_addr + dqp->q_bufoffset; ddqp = &dqb->dd_diskdq; - /* - * A simple sanity check in case we got a corrupted dquot. - */ - fa = xfs_dqblk_verify(mp, dqb, be32_to_cpu(ddqp->d_id), 0); + /* sanity check the in-core structure before we flush */ + fa = xfs_dquot_verify(mp, &dqp->q_core, be32_to_cpu(dqp->q_core.d_id), + 0); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", - be32_to_cpu(ddqp->d_id), fa); + be32_to_cpu(dqp->q_core.d_id), fa); xfs_buf_relse(bp); - xfs_dqfunlock(dqp); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return -EFSCORRUPTED; + error = -EFSCORRUPTED; + goto out_abort; } /* This is the only portion of data that needs to persist */ @@ -1175,6 +1195,10 @@ xfs_qm_dqflush( *bpp = bp; return 0; +out_abort: + dqp->dq_flags &= ~XFS_DQ_DIRTY; + xfs_trans_ail_delete(lip, 0); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); out_unlock: xfs_dqfunlock(dqp); return error; diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index fe3e46df604b..71e36c85e20b 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -154,7 +154,7 @@ void xfs_qm_dqdestroy(struct xfs_dquot *dqp); int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp); void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp); void xfs_qm_adjust_dqtimers(struct xfs_mount *mp, - struct xfs_disk_dquot *d); + struct xfs_dquot *d); void xfs_qm_adjust_dqlimits(struct xfs_mount *mp, struct xfs_dquot *d); xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, uint type); diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index baad1748d0d1..349c92d26570 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -145,21 +145,6 @@ xfs_qm_dquot_logitem_push( if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; - /* - * The buffer containing this item failed to be written back - * previously. Resubmit the buffer for IO - */ - if (test_bit(XFS_LI_FAILED, &lip->li_flags)) { - if (!xfs_buf_trylock(bp)) - return XFS_ITEM_LOCKED; - - if (!xfs_buf_resubmit_failed_buffers(bp, buffer_list)) - rval = XFS_ITEM_FLUSHING; - - xfs_buf_unlock(bp); - return rval; - } - if (!xfs_dqlock_nowait(dqp)) return XFS_ITEM_LOCKED; @@ -358,7 +343,7 @@ xfs_qm_qoff_logitem_relse( ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags) || test_bit(XFS_LI_ABORTED, &lip->li_flags) || XFS_FORCED_SHUTDOWN(lip->li_mountp)); - xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(lip, 0); kmem_free(lip->li_lv_shadow); kmem_free(qoff); } diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c new file mode 100644 index 000000000000..3400be4c88f0 --- /dev/null +++ b/fs/xfs/xfs_dquot_item_recover.c @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_qm.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" + +STATIC void +xlog_recover_dquot_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_disk_dquot *recddq; + struct xfs_dq_logformat *dq_f; + uint type; + + if (mp->m_qflags == 0) + return; + + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) + return; + if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) + return; + + type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + ASSERT(type); + if (log->l_quotaoffs_flag & type) + return; + + dq_f = item->ri_buf[0].i_addr; + ASSERT(dq_f); + ASSERT(dq_f->qlf_len == 1); + + xlog_buf_readahead(log, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), + &xfs_dquot_buf_ra_ops); +} + +/* + * Recover a dquot record + */ +STATIC int +xlog_recover_dquot_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_buf *bp; + struct xfs_disk_dquot *ddq, *recddq; + struct xfs_dq_logformat *dq_f; + xfs_failaddr_t fa; + int error; + uint type; + + /* + * Filesystems are required to send in quota flags at mount time. + */ + if (mp->m_qflags == 0) + return 0; + + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) { + xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); + return -EFSCORRUPTED; + } + if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) { + xfs_alert(log->l_mp, "dquot too small (%d) in %s.", + item->ri_buf[1].i_len, __func__); + return -EFSCORRUPTED; + } + + /* + * This type of quotas was turned off, so ignore this record. + */ + type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + ASSERT(type); + if (log->l_quotaoffs_flag & type) + return 0; + + /* + * At this point we know that quota was _not_ turned off. + * Since the mount flags are not indicating to us otherwise, this + * must mean that quota is on, and the dquot needs to be replayed. + * Remember that we may not have fully recovered the superblock yet, + * so we can't do the usual trick of looking at the SB quota bits. + * + * The other possibility, of course, is that the quota subsystem was + * removed since the last mount - ENOSYS. + */ + dq_f = item->ri_buf[0].i_addr; + ASSERT(dq_f); + fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0); + if (fa) { + xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS", + dq_f->qlf_id, fa); + return -EFSCORRUPTED; + } + ASSERT(dq_f->qlf_len == 1); + + /* + * At this point we are assuming that the dquots have been allocated + * and hence the buffer has valid dquots stamped in it. It should, + * therefore, pass verifier validation. If the dquot is bad, then the + * we'll return an error here, so we don't need to specifically check + * the dquot in the buffer after the verifier has run. + */ + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, + &xfs_dquot_buf_ops); + if (error) + return error; + + ASSERT(bp); + ddq = xfs_buf_offset(bp, dq_f->qlf_boffset); + + /* + * If the dquot has an LSN in it, recover the dquot only if it's less + * than the lsn of the transaction we are replaying. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; + xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + goto out_release; + } + } + + memcpy(ddq, recddq, item->ri_buf[1].i_len); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + + ASSERT(dq_f->qlf_size == 2); + ASSERT(bp->b_mount == mp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); + +out_release: + xfs_buf_relse(bp); + return 0; +} + +const struct xlog_recover_item_ops xlog_dquot_item_ops = { + .item_type = XFS_LI_DQUOT, + .ra_pass2 = xlog_recover_dquot_ra_pass2, + .commit_pass2 = xlog_recover_dquot_commit_pass2, +}; + +/* + * Recover QUOTAOFF records. We simply make a note of it in the xlog + * structure, so that we know not to do any dquot item or dquot buffer recovery, + * of that type. + */ +STATIC int +xlog_recover_quotaoff_commit_pass1( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_qoff_logformat *qoff_f = item->ri_buf[0].i_addr; + ASSERT(qoff_f); + + /* + * The logitem format's flag tells us if this was user quotaoff, + * group/project quotaoff or both. + */ + if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_USER; + if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_PROJ; + if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_GROUP; + + return 0; +} + +const struct xlog_recover_item_ops xlog_quotaoff_item_ops = { + .item_type = XFS_LI_QUOTAOFF, + .commit_pass1 = xlog_recover_quotaoff_commit_pass1, + /* nothing to commit in pass2 */ +}; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index a21e9cc6516a..7f6e20899473 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -53,6 +53,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_FORCE_SCRUB_REPAIR, XFS_RANDOM_FORCE_SUMMARY_RECALC, XFS_RANDOM_IUNLINK_FALLBACK, + XFS_RANDOM_BUF_IOERROR, }; struct xfs_errortag_attr { @@ -162,6 +163,7 @@ XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC); XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK); +XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -199,6 +201,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(force_repair), XFS_ERRORTAG_ATTR_LIST(bad_summary), XFS_ERRORTAG_ATTR_LIST(iunlink_fallback), + XFS_ERRORTAG_ATTR_LIST(buf_ioerror), NULL, }; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 6ea847f6e298..b9c333bae0a1 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -22,16 +22,20 @@ #include "xfs_bmap.h" #include "xfs_trace.h" #include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" kmem_zone_t *xfs_efi_zone; kmem_zone_t *xfs_efd_zone; +static const struct xfs_item_ops xfs_efi_item_ops; + static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_efi_log_item, efi_item); } -void +STATIC void xfs_efi_item_free( struct xfs_efi_log_item *efip) { @@ -49,13 +53,13 @@ xfs_efi_item_free( * committed vs unpin operations in bulk insert operations. Hence the reference * count to ensure only the last caller frees the EFI. */ -void +STATIC void xfs_efi_release( struct xfs_efi_log_item *efip) { ASSERT(atomic_read(&efip->efi_refcount) > 0); if (atomic_dec_and_test(&efip->efi_refcount)) { - xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); xfs_efi_item_free(efip); } } @@ -139,18 +143,10 @@ xfs_efi_item_release( xfs_efi_release(EFI_ITEM(lip)); } -static const struct xfs_item_ops xfs_efi_item_ops = { - .iop_size = xfs_efi_item_size, - .iop_format = xfs_efi_item_format, - .iop_unpin = xfs_efi_item_unpin, - .iop_release = xfs_efi_item_release, -}; - - /* * Allocate and initialize an efi item with the given number of extents. */ -struct xfs_efi_log_item * +STATIC struct xfs_efi_log_item * xfs_efi_init( struct xfs_mount *mp, uint nextents) @@ -161,7 +157,7 @@ xfs_efi_init( ASSERT(nextents > 0); if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { - size = (uint)(sizeof(xfs_efi_log_item_t) + + size = (uint)(sizeof(struct xfs_efi_log_item) + ((nextents - 1) * sizeof(xfs_extent_t))); efip = kmem_zalloc(size, 0); } else { @@ -184,7 +180,7 @@ xfs_efi_init( * one of which will be the native format for this kernel. * It will handle the conversion of formats if necessary. */ -int +STATIC int xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) { xfs_efi_log_format_t *src_efi_fmt = buf->i_addr; @@ -412,41 +408,16 @@ xfs_extent_free_diff_items( XFS_FSB_TO_AGNO(mp, rb->xefi_startblock); } -/* Get an EFI. */ -STATIC void * -xfs_extent_free_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_efi_log_item *efip; - - ASSERT(tp != NULL); - ASSERT(count > 0); - - efip = xfs_efi_init(tp->t_mountp, count); - ASSERT(efip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &efip->efi_item); - return efip; -} - /* Log a free extent to the intent item. */ STATIC void xfs_extent_free_log_item( struct xfs_trans *tp, - void *intent, - struct list_head *item) + struct xfs_efi_log_item *efip, + struct xfs_extent_free_item *free) { - struct xfs_efi_log_item *efip = intent; - struct xfs_extent_free_item *free; uint next_extent; struct xfs_extent *extp; - free = container_of(item, struct xfs_extent_free_item, xefi_list); - tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); @@ -462,29 +433,50 @@ xfs_extent_free_log_item( extp->ext_len = free->xefi_blockcount; } +static struct xfs_log_item * +xfs_extent_free_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_efi_log_item *efip = xfs_efi_init(mp, count); + struct xfs_extent_free_item *free; + + ASSERT(count > 0); + + xfs_trans_add_item(tp, &efip->efi_item); + if (sort) + list_sort(mp, items, xfs_extent_free_diff_items); + list_for_each_entry(free, items, xefi_list) + xfs_extent_free_log_item(tp, efip, free); + return &efip->efi_item; +} + /* Get an EFD so we can process all the free extents. */ -STATIC void * +static struct xfs_log_item * xfs_extent_free_create_done( struct xfs_trans *tp, - void *intent, + struct xfs_log_item *intent, unsigned int count) { - return xfs_trans_get_efd(tp, intent, count); + return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item; } /* Process a free extent. */ STATIC int xfs_extent_free_finish_item( struct xfs_trans *tp, + struct xfs_log_item *done, struct list_head *item, - void *done_item, - void **state) + struct xfs_btree_cur **state) { struct xfs_extent_free_item *free; int error; free = container_of(item, struct xfs_extent_free_item, xefi_list); - error = xfs_trans_free_extent(tp, done_item, + error = xfs_trans_free_extent(tp, EFD_ITEM(done), free->xefi_startblock, free->xefi_blockcount, &free->xefi_oinfo, free->xefi_skip_discard); @@ -495,9 +487,9 @@ xfs_extent_free_finish_item( /* Abort all pending EFIs. */ STATIC void xfs_extent_free_abort_intent( - void *intent) + struct xfs_log_item *intent) { - xfs_efi_release(intent); + xfs_efi_release(EFI_ITEM(intent)); } /* Cancel a free extent. */ @@ -513,10 +505,8 @@ xfs_extent_free_cancel_item( const struct xfs_defer_op_type xfs_extent_free_defer_type = { .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .diff_items = xfs_extent_free_diff_items, .create_intent = xfs_extent_free_create_intent, .abort_intent = xfs_extent_free_abort_intent, - .log_item = xfs_extent_free_log_item, .create_done = xfs_extent_free_create_done, .finish_item = xfs_extent_free_finish_item, .cancel_item = xfs_extent_free_cancel_item, @@ -529,12 +519,12 @@ const struct xfs_defer_op_type xfs_extent_free_defer_type = { STATIC int xfs_agfl_free_finish_item( struct xfs_trans *tp, + struct xfs_log_item *done, struct list_head *item, - void *done_item, - void **state) + struct xfs_btree_cur **state) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_efd_log_item *efdp = done_item; + struct xfs_efd_log_item *efdp = EFD_ITEM(done); struct xfs_extent_free_item *free; struct xfs_extent *extp; struct xfs_buf *agbp; @@ -579,10 +569,8 @@ xfs_agfl_free_finish_item( /* sub-type with special handling for AGFL deferred frees */ const struct xfs_defer_op_type xfs_agfl_free_defer_type = { .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .diff_items = xfs_extent_free_diff_items, .create_intent = xfs_extent_free_create_intent, .abort_intent = xfs_extent_free_abort_intent, - .log_item = xfs_extent_free_log_item, .create_done = xfs_extent_free_create_done, .finish_item = xfs_agfl_free_finish_item, .cancel_item = xfs_extent_free_cancel_item, @@ -592,19 +580,19 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = { * Process an extent free intent item that was recovered from * the log. We need to free the extents that it describes. */ -int -xfs_efi_recover( - struct xfs_mount *mp, - struct xfs_efi_log_item *efip) +STATIC int +xfs_efi_item_recover( + struct xfs_log_item *lip, + struct xfs_trans *parent_tp) { - struct xfs_efd_log_item *efdp; - struct xfs_trans *tp; - int i; - int error = 0; - xfs_extent_t *extp; - xfs_fsblock_t startblock_fsb; - - ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_efd_log_item *efdp; + struct xfs_trans *tp; + struct xfs_extent *extp; + xfs_fsblock_t startblock_fsb; + int i; + int error = 0; /* * First check the validity of the extents described by the @@ -623,7 +611,6 @@ xfs_efi_recover( * This will pull the EFI from the AIL and * free the memory associated with it. */ - set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); xfs_efi_release(efip); return -EFSCORRUPTED; } @@ -644,7 +631,6 @@ xfs_efi_recover( } - set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); error = xfs_trans_commit(tp); return error; @@ -652,3 +638,93 @@ abort_error: xfs_trans_cancel(tp); return error; } + +STATIC bool +xfs_efi_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return EFI_ITEM(lip)->efi_format.efi_id == intent_id; +} + +static const struct xfs_item_ops xfs_efi_item_ops = { + .iop_size = xfs_efi_item_size, + .iop_format = xfs_efi_item_format, + .iop_unpin = xfs_efi_item_unpin, + .iop_release = xfs_efi_item_release, + .iop_recover = xfs_efi_item_recover, + .iop_match = xfs_efi_item_match, +}; + +/* + * This routine is called to create an in-core extent free intent + * item from the efi format structure which was logged on disk. + * It allocates an in-core efi, copies the extents from the format + * structure into it, and adds the efi to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_efi_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_efi_log_item *efip; + struct xfs_efi_log_format *efi_formatp; + int error; + + efi_formatp = item->ri_buf[0].i_addr; + + efip = xfs_efi_init(mp, efi_formatp->efi_nextents); + error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); + if (error) { + xfs_efi_item_free(efip); + return error; + } + atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); + /* + * Insert the intent into the AIL directly and drop one reference so + * that finishing or canceling the work will drop the other. + */ + xfs_trans_ail_insert(log->l_ailp, &efip->efi_item, lsn); + xfs_efi_release(efip); + return 0; +} + +const struct xlog_recover_item_ops xlog_efi_item_ops = { + .item_type = XFS_LI_EFI, + .commit_pass2 = xlog_recover_efi_commit_pass2, +}; + +/* + * This routine is called when an EFD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding EFI if it + * was still in the log. To do this it searches the AIL for the EFI with an id + * equal to that in the EFD format structure. If we find it we drop the EFD + * reference, which removes the EFI from the AIL and frees it. + */ +STATIC int +xlog_recover_efd_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_efd_log_format *efd_formatp; + + efd_formatp = item->ri_buf[0].i_addr; + ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || + (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); + + xlog_recover_release_intent(log, XFS_LI_EFI, efd_formatp->efd_efi_id); + return 0; +} + +const struct xlog_recover_item_ops xlog_efd_item_ops = { + .item_type = XFS_LI_EFD, + .commit_pass2 = xlog_recover_efd_commit_pass2, +}; diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 16aaab06d4ec..cd2860c875bf 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -17,11 +17,6 @@ struct kmem_zone; #define XFS_EFI_MAX_FAST_EXTENTS 16 /* - * Define EFI flag bits. Manipulated by set/clear/test_bit operators. - */ -#define XFS_EFI_RECOVERED 1 - -/* * This is the "extent free intention" log item. It is used to log the fact * that some extents need to be free. It is used in conjunction with the * "extent free done" log item described below. @@ -50,25 +45,24 @@ struct kmem_zone; * of commit failure or log I/O errors. Note that the EFD is not inserted in the * AIL, so at this point both the EFI and EFD are freed. */ -typedef struct xfs_efi_log_item { +struct xfs_efi_log_item { struct xfs_log_item efi_item; atomic_t efi_refcount; atomic_t efi_next_extent; - unsigned long efi_flags; /* misc flags */ xfs_efi_log_format_t efi_format; -} xfs_efi_log_item_t; +}; /* * This is the "extent free done" log item. It is used to log * the fact that some extents earlier mentioned in an efi item * have been freed. */ -typedef struct xfs_efd_log_item { +struct xfs_efd_log_item { struct xfs_log_item efd_item; - xfs_efi_log_item_t *efd_efip; + struct xfs_efi_log_item *efd_efip; uint efd_next_extent; xfs_efd_log_format_t efd_format; -} xfs_efd_log_item_t; +}; /* * Max number of extents in fast allocation path. @@ -78,13 +72,4 @@ typedef struct xfs_efd_log_item { extern struct kmem_zone *xfs_efi_zone; extern struct kmem_zone *xfs_efd_zone; -xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint); -int xfs_efi_copy_format(xfs_log_iovec_t *buf, - xfs_efi_log_format_t *dst_efi_fmt); -void xfs_efi_item_free(xfs_efi_log_item_t *); -void xfs_efi_release(struct xfs_efi_log_item *); - -int xfs_efi_recover(struct xfs_mount *mp, - struct xfs_efi_log_item *efip); - #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4b8bdecc3863..403c90309a8f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1102,7 +1102,7 @@ xfs_dir_open( * certain to have the next operation be a read there. */ mode = xfs_ilock_data_map_shared(ip); - if (ip->i_d.di_nextents > 0) + if (ip->i_df.if_nextents > 0) error = xfs_dir3_data_readahead(ip, 0, 0); xfs_iunlock(ip, mode); return error; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 3e61d0cc23f8..ef1d5bb88b93 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -504,10 +504,7 @@ xfs_do_force_shutdown( } else if (logerror) { xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, "Log I/O Error Detected. Shutting down filesystem"); - } else if (flags & SHUTDOWN_DEVICE_REQ) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "All device paths lost. Shutting down filesystem"); - } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { + } else { xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, "I/O Error Detected. Shutting down filesystem"); } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 8bf1d15be3f6..0a5ac6f9a583 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -22,6 +22,7 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_reflink.h" +#include "xfs_ialloc.h" #include <linux/iversion.h> @@ -62,8 +63,6 @@ xfs_inode_alloc( memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); ip->i_afp = NULL; ip->i_cowfp = NULL; - ip->i_cnextents = 0; - ip->i_cformat = XFS_DINODE_FMT_EXTENTS; memset(&ip->i_df, 0, sizeof(ip->i_df)); ip->i_flags = 0; ip->i_delayed_blks = 0; @@ -88,15 +87,18 @@ xfs_inode_free_callback( case S_IFREG: case S_IFDIR: case S_IFLNK: - xfs_idestroy_fork(ip, XFS_DATA_FORK); + xfs_idestroy_fork(&ip->i_df); break; } - if (ip->i_afp) - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - if (ip->i_cowfp) - xfs_idestroy_fork(ip, XFS_COW_FORK); - + if (ip->i_afp) { + xfs_idestroy_fork(ip->i_afp); + kmem_cache_free(xfs_ifork_zone, ip->i_afp); + } + if (ip->i_cowfp) { + xfs_idestroy_fork(ip->i_cowfp); + kmem_cache_free(xfs_ifork_zone, ip->i_cowfp); + } if (ip->i_itemp) { ASSERT(!test_bit(XFS_LI_IN_AIL, &ip->i_itemp->ili_item.li_flags)); @@ -423,6 +425,7 @@ xfs_iget_cache_hit( spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); error = xfs_reinit_inode(mp, inode); if (error) { bool wake; @@ -456,9 +459,6 @@ xfs_iget_cache_hit( ip->i_sick = 0; ip->i_checked = 0; - ASSERT(!rwsem_is_locked(&inode->i_rwsem)); - init_rwsem(&inode->i_rwsem); - spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); } else { @@ -510,18 +510,42 @@ xfs_iget_cache_miss( if (!ip) return -ENOMEM; - error = xfs_iread(mp, tp, ip, flags); + error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags); if (error) goto out_destroy; - if (!xfs_inode_verify_forks(ip)) { - error = -EFSCORRUPTED; - goto out_destroy; + /* + * For version 5 superblocks, if we are initialising a new inode and we + * are not utilising the XFS_MOUNT_IKEEP inode cluster mode, we can + * simply build the new inode core with a random generation number. + * + * For version 4 (and older) superblocks, log recovery is dependent on + * the di_flushiter field being initialised from the current on-disk + * value and hence we must also read the inode off disk even when + * initializing new inodes. + */ + if (xfs_sb_version_has_v3inode(&mp->m_sb) && + (flags & XFS_IGET_CREATE) && !(mp->m_flags & XFS_MOUNT_IKEEP)) { + VFS_I(ip)->i_generation = prandom_u32(); + } else { + struct xfs_dinode *dip; + struct xfs_buf *bp; + + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0); + if (error) + goto out_destroy; + + error = xfs_inode_from_disk(ip, dip); + if (!error) + xfs_buf_set_ref(bp, XFS_INO_REF); + xfs_trans_brelse(tp, bp); + + if (error) + goto out_destroy; } trace_xfs_iget_miss(ip); - /* * Check the inode free state is valid. This also detects lookup * racing with unlinks. @@ -737,13 +761,18 @@ xfs_icache_inode_is_allocated( */ #define XFS_LOOKUP_BATCH 32 -STATIC int -xfs_inode_ag_walk_grab( +/* + * Decide if the given @ip is eligible to be a part of the inode walk, and + * grab it if so. Returns true if it's ready to go or false if we should just + * ignore it. + */ +STATIC bool +xfs_inode_walk_ag_grab( struct xfs_inode *ip, int flags) { struct inode *inode = VFS_I(ip); - bool newinos = !!(flags & XFS_AGITER_INEW_WAIT); + bool newinos = !!(flags & XFS_INODE_WALK_INEW_WAIT); ASSERT(rcu_read_lock_held()); @@ -768,39 +797,41 @@ xfs_inode_ag_walk_grab( /* nothing to sync during shutdown */ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EFSCORRUPTED; + return false; /* If we can't grab the inode, it must on it's way to reclaim. */ if (!igrab(inode)) - return -ENOENT; + return false; /* inode is valid */ - return 0; + return true; out_unlock_noent: spin_unlock(&ip->i_flags_lock); - return -ENOENT; + return false; } +/* + * For a given per-AG structure @pag, grab, @execute, and rele all incore + * inodes with the given radix tree @tag. + */ STATIC int -xfs_inode_ag_walk( - struct xfs_mount *mp, +xfs_inode_walk_ag( struct xfs_perag *pag, - int (*execute)(struct xfs_inode *ip, int flags, - void *args), - int flags, + int iter_flags, + int (*execute)(struct xfs_inode *ip, void *args), void *args, - int tag, - int iter_flags) + int tag) { + struct xfs_mount *mp = pag->pag_mount; uint32_t first_index; int last_error = 0; int skipped; - int done; + bool done; int nr_found; restart: - done = 0; + done = false; skipped = 0; first_index = 0; nr_found = 0; @@ -811,7 +842,7 @@ restart: rcu_read_lock(); - if (tag == -1) + if (tag == XFS_ICI_NO_TAG) nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)batch, first_index, XFS_LOOKUP_BATCH); @@ -833,7 +864,7 @@ restart: for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i]; - if (done || xfs_inode_ag_walk_grab(ip, iter_flags)) + if (done || !xfs_inode_walk_ag_grab(ip, iter_flags)) batch[i] = NULL; /* @@ -852,7 +883,7 @@ restart: continue; first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - done = 1; + done = true; } /* unlock now we've grabbed the inodes. */ @@ -861,10 +892,10 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - if ((iter_flags & XFS_AGITER_INEW_WAIT) && + if ((iter_flags & XFS_INODE_WALK_INEW_WAIT) && xfs_iflags_test(batch[i], XFS_INEW)) xfs_inew_wait(batch[i]); - error = execute(batch[i], flags, args); + error = execute(batch[i], args); xfs_irele(batch[i]); if (error == -EAGAIN) { skipped++; @@ -889,6 +920,49 @@ restart: return last_error; } +/* Fetch the next (possibly tagged) per-AG structure. */ +static inline struct xfs_perag * +xfs_inode_walk_get_perag( + struct xfs_mount *mp, + xfs_agnumber_t agno, + int tag) +{ + if (tag == XFS_ICI_NO_TAG) + return xfs_perag_get(mp, agno); + return xfs_perag_get_tag(mp, agno, tag); +} + +/* + * Call the @execute function on all incore inodes matching the radix tree + * @tag. + */ +int +xfs_inode_walk( + struct xfs_mount *mp, + int iter_flags, + int (*execute)(struct xfs_inode *ip, void *args), + void *args, + int tag) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + + ag = 0; + while ((pag = xfs_inode_walk_get_perag(mp, ag, tag))) { + ag = pag->pag_agno + 1; + error = xfs_inode_walk_ag(pag, iter_flags, execute, args, tag); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == -EFSCORRUPTED) + break; + } + } + return last_error; +} + /* * Background scanning to trim post-EOF preallocated space. This is queued * based on the 'speculative_prealloc_lifetime' tunable (5m by default). @@ -952,75 +1026,6 @@ xfs_cowblocks_worker( xfs_queue_cowblocks(mp); } -int -xfs_inode_ag_iterator_flags( - struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, int flags, - void *args), - int flags, - void *args, - int iter_flags) -{ - struct xfs_perag *pag; - int error = 0; - int last_error = 0; - xfs_agnumber_t ag; - - ag = 0; - while ((pag = xfs_perag_get(mp, ag))) { - ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1, - iter_flags); - xfs_perag_put(pag); - if (error) { - last_error = error; - if (error == -EFSCORRUPTED) - break; - } - } - return last_error; -} - -int -xfs_inode_ag_iterator( - struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, int flags, - void *args), - int flags, - void *args) -{ - return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0); -} - -int -xfs_inode_ag_iterator_tag( - struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, int flags, - void *args), - int flags, - void *args, - int tag) -{ - struct xfs_perag *pag; - int error = 0; - int last_error = 0; - xfs_agnumber_t ag; - - ag = 0; - while ((pag = xfs_perag_get_tag(mp, ag, tag))) { - ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag, - 0); - xfs_perag_put(pag); - if (error) { - last_error = error; - if (error == -EFSCORRUPTED) - break; - } - } - return last_error; -} - /* * Grab the inode for reclaim exclusively. * Return 0 if we grabbed it, non-zero otherwise. @@ -1128,7 +1133,7 @@ restart: if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); /* xfs_iflush_abort() drops the flush lock */ - xfs_iflush_abort(ip, false); + xfs_iflush_abort(ip); goto reclaim; } if (xfs_ipincount(ip)) { @@ -1419,59 +1424,90 @@ xfs_reclaim_inodes_count( return reclaimable; } -STATIC int +STATIC bool xfs_inode_match_id( struct xfs_inode *ip, struct xfs_eofblocks *eofb) { if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) - return 0; + return false; if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) - return 0; + return false; if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && ip->i_d.di_projid != eofb->eof_prid) - return 0; + return false; - return 1; + return true; } /* * A union-based inode filtering algorithm. Process the inode if any of the * criteria match. This is for global/internal scans only. */ -STATIC int +STATIC bool xfs_inode_match_id_union( struct xfs_inode *ip, struct xfs_eofblocks *eofb) { if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) - return 1; + return true; if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) - return 1; + return true; if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && ip->i_d.di_projid == eofb->eof_prid) - return 1; + return true; - return 0; + return false; +} + +/* + * Is this inode @ip eligible for eof/cow block reclamation, given some + * filtering parameters @eofb? The inode is eligible if @eofb is null or + * if the predicate functions match. + */ +static bool +xfs_inode_matches_eofb( + struct xfs_inode *ip, + struct xfs_eofblocks *eofb) +{ + bool match; + + if (!eofb) + return true; + + if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) + match = xfs_inode_match_id_union(ip, eofb); + else + match = xfs_inode_match_id(ip, eofb); + if (!match) + return false; + + /* skip the inode if the file size is too small */ + if ((eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) && + XFS_ISIZE(ip) < eofb->eof_min_file_size) + return false; + + return true; } STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, - int flags, void *args) { - int ret = 0; - struct xfs_eofblocks *eofb = args; - int match; + struct xfs_eofblocks *eofb = args; + bool wait; + int ret; + + wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); if (!xfs_can_free_eofblocks(ip, false)) { /* inode could be preallocated or append-only */ @@ -1484,62 +1520,34 @@ xfs_inode_free_eofblocks( * If the mapping is dirty the operation can block and wait for some * time. Unless we are waiting, skip it. */ - if (!(flags & SYNC_WAIT) && - mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) + if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) return 0; - if (eofb) { - if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) - match = xfs_inode_match_id_union(ip, eofb); - else - match = xfs_inode_match_id(ip, eofb); - if (!match) - return 0; - - /* skip the inode if the file size is too small */ - if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && - XFS_ISIZE(ip) < eofb->eof_min_file_size) - return 0; - } + if (!xfs_inode_matches_eofb(ip, eofb)) + return 0; /* * If the caller is waiting, return -EAGAIN to keep the background * scanner moving and revisit the inode in a subsequent pass. */ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { - if (flags & SYNC_WAIT) - ret = -EAGAIN; - return ret; + if (wait) + return -EAGAIN; + return 0; } + ret = xfs_free_eofblocks(ip); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } -static int -__xfs_icache_free_eofblocks( - struct xfs_mount *mp, - struct xfs_eofblocks *eofb, - int (*execute)(struct xfs_inode *ip, int flags, - void *args), - int tag) -{ - int flags = SYNC_TRYLOCK; - - if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) - flags = SYNC_WAIT; - - return xfs_inode_ag_iterator_tag(mp, execute, flags, - eofb, tag); -} - int xfs_icache_free_eofblocks( struct xfs_mount *mp, struct xfs_eofblocks *eofb) { - return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks, + return xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb, XFS_ICI_EOFBLOCKS_TAG); } @@ -1756,29 +1764,16 @@ xfs_prep_free_cowblocks( STATIC int xfs_inode_free_cowblocks( struct xfs_inode *ip, - int flags, void *args) { struct xfs_eofblocks *eofb = args; - int match; int ret = 0; if (!xfs_prep_free_cowblocks(ip)) return 0; - if (eofb) { - if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) - match = xfs_inode_match_id_union(ip, eofb); - else - match = xfs_inode_match_id(ip, eofb); - if (!match) - return 0; - - /* skip the inode if the file size is too small */ - if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && - XFS_ISIZE(ip) < eofb->eof_min_file_size) - return 0; - } + if (!xfs_inode_matches_eofb(ip, eofb)) + return 0; /* Free the CoW blocks */ xfs_ilock(ip, XFS_IOLOCK_EXCL); @@ -1802,7 +1797,7 @@ xfs_icache_free_cowblocks( struct xfs_mount *mp, struct xfs_eofblocks *eofb) { - return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks, + return xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb, XFS_ICI_COWBLOCKS_TAG); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 48f1fd2bb6ad..93b54e7d55f0 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -24,7 +24,7 @@ struct xfs_eofblocks { * tags for inode radix tree */ #define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup - in xfs_inode_ag_iterator */ + in xfs_inode_walk */ #define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ #define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ #define XFS_ICI_COWBLOCKS_TAG 2 /* inode can have cow blocks to gc */ @@ -40,7 +40,7 @@ struct xfs_eofblocks { /* * flags for AG inode iterator */ -#define XFS_AGITER_INEW_WAIT 0x1 /* wait on new inodes */ +#define XFS_INODE_WALK_INEW_WAIT 0x1 /* wait on new inodes */ int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp); @@ -71,50 +71,9 @@ int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip); void xfs_cowblocks_worker(struct work_struct *); void xfs_queue_cowblocks(struct xfs_mount *); -int xfs_inode_ag_iterator(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, int flags, void *args), - int flags, void *args); -int xfs_inode_ag_iterator_flags(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, int flags, void *args), - int flags, void *args, int iter_flags); -int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, int flags, void *args), - int flags, void *args, int tag); - -static inline int -xfs_fs_eofblocks_from_user( - struct xfs_fs_eofblocks *src, - struct xfs_eofblocks *dst) -{ - if (src->eof_version != XFS_EOFBLOCKS_VERSION) - return -EINVAL; - - if (src->eof_flags & ~XFS_EOF_FLAGS_VALID) - return -EINVAL; - - if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) || - memchr_inv(src->pad64, 0, sizeof(src->pad64))) - return -EINVAL; - - dst->eof_flags = src->eof_flags; - dst->eof_prid = src->eof_prid; - dst->eof_min_file_size = src->eof_min_file_size; - - dst->eof_uid = INVALID_UID; - if (src->eof_flags & XFS_EOF_FLAGS_UID) { - dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid); - if (!uid_valid(dst->eof_uid)) - return -EINVAL; - } - - dst->eof_gid = INVALID_GID; - if (src->eof_flags & XFS_EOF_FLAGS_GID) { - dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid); - if (!gid_valid(dst->eof_gid)) - return -EINVAL; - } - return 0; -} +int xfs_inode_walk(struct xfs_mount *mp, int iter_flags, + int (*execute)(struct xfs_inode *ip, void *args), + void *args, int tag); int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, bool *inuse); diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 490fee22b878..287a9e5c7d75 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -6,11 +6,19 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_format.h" #include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_icreate_item.h" #include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" +#include "xfs_ialloc.h" +#include "xfs_trace.h" kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ @@ -107,3 +115,147 @@ xfs_icreate_log( tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &icp->ic_item.li_flags); } + +static enum xlog_recover_reorder +xlog_recover_icreate_reorder( + struct xlog_recover_item *item) +{ + /* + * Inode allocation buffers must be replayed before subsequent inode + * items try to modify those buffers. ICREATE items are the logical + * equivalent of logging a newly initialized inode buffer, so recover + * these at the same time that we recover logged buffers. + */ + return XLOG_REORDER_BUFFER_LIST; +} + +/* + * This routine is called when an inode create format structure is found in a + * committed transaction in the log. It's purpose is to initialise the inodes + * being allocated on disk. This requires us to get inode cluster buffers that + * match the range to be initialised, stamped with inode templates and written + * by delayed write so that subsequent modifications will hit the cached buffer + * and only need writing out at the end of recovery. + */ +STATIC int +xlog_recover_icreate_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_icreate_log *icl; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agnumber_t agno; + xfs_agblock_t agbno; + unsigned int count; + unsigned int isize; + xfs_agblock_t length; + int bb_per_cluster; + int cancel_count; + int nbufs; + int i; + + icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; + if (icl->icl_type != XFS_LI_ICREATE) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); + return -EINVAL; + } + + if (icl->icl_size != 1) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); + return -EINVAL; + } + + agno = be32_to_cpu(icl->icl_ag); + if (agno >= mp->m_sb.sb_agcount) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); + return -EINVAL; + } + agbno = be32_to_cpu(icl->icl_agbno); + if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); + return -EINVAL; + } + isize = be32_to_cpu(icl->icl_isize); + if (isize != mp->m_sb.sb_inodesize) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); + return -EINVAL; + } + count = be32_to_cpu(icl->icl_count); + if (!count) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); + return -EINVAL; + } + length = be32_to_cpu(icl->icl_length); + if (!length || length >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); + return -EINVAL; + } + + /* + * The inode chunk is either full or sparse and we only support + * m_ino_geo.ialloc_min_blks sized sparse allocations at this time. + */ + if (length != igeo->ialloc_blks && + length != igeo->ialloc_min_blks) { + xfs_warn(log->l_mp, + "%s: unsupported chunk length", __FUNCTION__); + return -EINVAL; + } + + /* verify inode count is consistent with extent length */ + if ((count >> mp->m_sb.sb_inopblog) != length) { + xfs_warn(log->l_mp, + "%s: inconsistent inode count and chunk length", + __FUNCTION__); + return -EINVAL; + } + + /* + * The icreate transaction can cover multiple cluster buffers and these + * buffers could have been freed and reused. Check the individual + * buffers for cancellation so we don't overwrite anything written after + * a cancellation. + */ + bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster); + nbufs = length / igeo->blocks_per_cluster; + for (i = 0, cancel_count = 0; i < nbufs; i++) { + xfs_daddr_t daddr; + + daddr = XFS_AGB_TO_DADDR(mp, agno, + agbno + i * igeo->blocks_per_cluster); + if (xlog_is_buffer_cancelled(log, daddr, bb_per_cluster)) + cancel_count++; + } + + /* + * We currently only use icreate for a single allocation at a time. This + * means we should expect either all or none of the buffers to be + * cancelled. Be conservative and skip replay if at least one buffer is + * cancelled, but warn the user that something is awry if the buffers + * are not consistent. + * + * XXX: This must be refined to only skip cancelled clusters once we use + * icreate for multiple chunk allocations. + */ + ASSERT(!cancel_count || cancel_count == nbufs); + if (cancel_count) { + if (cancel_count != nbufs) + xfs_warn(mp, + "WARNING: partial inode chunk cancellation, skipped icreate."); + trace_xfs_log_recover_icreate_cancel(log, icl); + return 0; + } + + trace_xfs_log_recover_icreate_recover(log, icl); + return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, + length, be32_to_cpu(icl->icl_gen)); +} + +const struct xlog_recover_item_ops xlog_icreate_item_ops = { + .item_type = XFS_LI_ICREATE, + .reorder = xlog_recover_icreate_reorder, + .commit_pass2 = xlog_recover_icreate_commit_pass2, +}; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d1772786af29..64f5f9a440ae 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -112,7 +112,7 @@ xfs_ilock_data_map_shared( { uint lock_mode = XFS_ILOCK_SHARED; - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE && (ip->i_df.if_flags & XFS_IFEXTENTS) == 0) lock_mode = XFS_ILOCK_EXCL; xfs_ilock(ip, lock_mode); @@ -125,7 +125,8 @@ xfs_ilock_attr_map_shared( { uint lock_mode = XFS_ILOCK_SHARED; - if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE && + if (ip->i_afp && + ip->i_afp->if_format == XFS_DINODE_FMT_BTREE && (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0) lock_mode = XFS_ILOCK_EXCL; xfs_ilock(ip, lock_mode); @@ -825,7 +826,7 @@ xfs_ialloc( inode->i_mode &= ~S_ISGID; ip->i_d.di_size = 0; - ip->i_d.di_nextents = 0; + ip->i_df.if_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); tv = current_time(inode); @@ -851,7 +852,7 @@ xfs_ialloc( case S_IFCHR: case S_IFBLK: case S_IFSOCK: - ip->i_d.di_format = XFS_DINODE_FMT_DEV; + ip->i_df.if_format = XFS_DINODE_FMT_DEV; ip->i_df.if_flags = 0; flags |= XFS_ILOG_DEV; break; @@ -907,7 +908,7 @@ xfs_ialloc( } /* FALLTHROUGH */ case S_IFLNK: - ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; ip->i_df.if_flags = XFS_IFEXTENTS; ip->i_df.if_bytes = 0; ip->i_df.if_u1.if_root = NULL; @@ -915,11 +916,6 @@ xfs_ialloc( default: ASSERT(0); } - /* - * Attribute fork settings for new inode. - */ - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - ip->i_d.di_anextents = 0; /* * Log the new values stuffed into the inode. @@ -1686,7 +1682,7 @@ xfs_inactive_truncate( if (error) goto error_trans_cancel; - ASSERT(ip->i_d.di_nextents == 0); + ASSERT(ip->i_df.if_nextents == 0); error = xfs_trans_commit(tp); if (error) @@ -1836,7 +1832,7 @@ xfs_inactive( if (S_ISREG(VFS_I(ip)->i_mode) && (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || - ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) + ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) truncate = 1; error = xfs_qm_dqattach(ip); @@ -1862,7 +1858,6 @@ xfs_inactive( } ASSERT(!ip->i_afp); - ASSERT(ip->i_d.di_anextents == 0); ASSERT(ip->i_d.di_forkoff == 0); /* @@ -2172,7 +2167,7 @@ xfs_iunlink_update_inode( ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0); if (error) return error; @@ -2302,7 +2297,7 @@ xfs_iunlink_map_ino( return error; } - error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0); + error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0); if (error) { xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", __func__, error); @@ -2602,7 +2597,7 @@ xfs_ifree_cluster( xfs_daddr_t blkno; xfs_buf_t *bp; xfs_inode_t *ip; - xfs_inode_log_item_t *iip; + struct xfs_inode_log_item *iip; struct xfs_log_item *lip; struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); @@ -2662,7 +2657,7 @@ xfs_ifree_cluster( */ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { if (lip->li_type == XFS_LI_INODE) { - iip = (xfs_inode_log_item_t *)lip; + iip = (struct xfs_inode_log_item *)lip; ASSERT(iip->ili_logged == 1); lip->li_cb = xfs_istale_done; xfs_trans_ail_copy_lsn(mp->m_ail, @@ -2712,24 +2707,6 @@ xfs_ifree_cluster( } /* - * Free any local-format buffers sitting around before we reset to - * extents format. - */ -static inline void -xfs_ifree_local_data( - struct xfs_inode *ip, - int whichfork) -{ - struct xfs_ifork *ifp; - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) - return; - - ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); -} - -/* * This is called to return an inode to the inode free list. * The inode should already be truncated to 0 length and have * no pages associated with it. This routine also assumes that @@ -2749,8 +2726,7 @@ xfs_ifree( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(VFS_I(ip)->i_nlink == 0); - ASSERT(ip->i_d.di_nextents == 0); - ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_df.if_nextents == 0); ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); ASSERT(ip->i_d.di_nblocks == 0); @@ -2765,16 +2741,23 @@ xfs_ifree( if (error) return error; - xfs_ifree_local_data(ip, XFS_DATA_FORK); - xfs_ifree_local_data(ip, XFS_ATTR_FORK); + /* + * Free any local-format data sitting around before we reset the + * data fork to extents format. Note that the attr fork data has + * already been freed by xfs_attr_inactive. + */ + if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { + kmem_free(ip->i_df.if_u1.if_data); + ip->i_df.if_u1.if_data = NULL; + ip->i_df.if_bytes = 0; + } VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; ip->i_d.di_flags2 = 0; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ - ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; /* Don't attempt to replay owner changes for a deleted inode */ ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER); @@ -3496,6 +3479,7 @@ xfs_iflush_cluster( struct xfs_inode **cilist; struct xfs_inode *cip; struct xfs_ino_geometry *igeo = M_IGEO(mp); + int error = 0; int nr_found; int clcount = 0; int i; @@ -3588,11 +3572,10 @@ xfs_iflush_cluster( * re-check that it's dirty before flushing. */ if (!xfs_inode_clean(cip)) { - int error; error = xfs_iflush_int(cip, bp); if (error) { xfs_iunlock(cip, XFS_ILOCK_SHARED); - goto cluster_corrupt_out; + goto out_free; } clcount++; } else { @@ -3611,37 +3594,7 @@ out_free: kmem_free(cilist); out_put: xfs_perag_put(pag); - return 0; - - -cluster_corrupt_out: - /* - * Corruption detected in the clustering loop. Invalidate the - * inode buffer and shut down the filesystem. - */ - rcu_read_unlock(); - - /* - * We'll always have an inode attached to the buffer for completion - * process by the time we are called from xfs_iflush(). Hence we have - * always need to do IO completion processing to abort the inodes - * attached to the buffer. handle them just like the shutdown case in - * xfs_buf_submit(). - */ - ASSERT(bp->b_iodone); - bp->b_flags |= XBF_ASYNC; - bp->b_flags &= ~XBF_DONE; - xfs_buf_stale(bp); - xfs_buf_ioerror(bp, -EIO); - xfs_buf_ioend(bp); - - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - - /* abort the corrupt inode, as it was not attached to the buffer */ - xfs_iflush_abort(cip, false); - kmem_free(cilist); - xfs_perag_put(pag); - return -EFSCORRUPTED; + return error; } /* @@ -3667,8 +3620,8 @@ xfs_iflush( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); - ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); + ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || + ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); *bpp = NULL; @@ -3688,42 +3641,20 @@ xfs_iflush( } /* - * This may have been unpinned because the filesystem is shutting - * down forcibly. If that's the case we must not write this inode - * to disk, because the log record didn't make it to disk. - * - * We also have to remove the log item from the AIL in this case, - * as we wait for an empty AIL as part of the unmount process. - */ - if (XFS_FORCED_SHUTDOWN(mp)) { - error = -EIO; - goto abort_out; - } - - /* * Get the buffer containing the on-disk inode. We are doing a try-lock - * operation here, so we may get an EAGAIN error. In that case, we - * simply want to return with the inode still dirty. + * operation here, so we may get an EAGAIN error. In that case, return + * leaving the inode dirty. * * If we get any other error, we effectively have a corruption situation - * and we cannot flush the inode, so we treat it the same as failing - * xfs_iflush_int(). + * and we cannot flush the inode. Abort the flush and shut down. */ - error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, - 0); + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK); if (error == -EAGAIN) { xfs_ifunlock(ip); return error; } if (error) - goto corrupt_out; - - /* - * First flush out the inode that xfs_iflush was called with. - */ - error = xfs_iflush_int(ip, bp); - if (error) - goto corrupt_out; + goto abort; /* * If the buffer is pinned then push on the log now so we won't @@ -3733,61 +3664,32 @@ xfs_iflush( xfs_log_force(mp, 0); /* - * inode clustering: try to gather other inodes into this write + * Flush the provided inode then attempt to gather others from the + * cluster into the write. * - * Note: Any error during clustering will result in the filesystem - * being shut down and completion callbacks run on the cluster buffer. - * As we have already flushed and attached this inode to the buffer, - * it has already been aborted and released by xfs_iflush_cluster() and - * so we have no further error handling to do here. + * Note: Once we attempt to flush an inode, we must run buffer + * completion callbacks on any failure. If this fails, simulate an I/O + * failure on the buffer and shut down. */ - error = xfs_iflush_cluster(ip, bp); - if (error) - return error; + error = xfs_iflush_int(ip, bp); + if (!error) + error = xfs_iflush_cluster(ip, bp); + if (error) { + bp->b_flags |= XBF_ASYNC; + xfs_buf_ioend_fail(bp); + goto shutdown; + } *bpp = bp; return 0; -corrupt_out: - if (bp) - xfs_buf_relse(bp); +abort: + xfs_iflush_abort(ip); +shutdown: xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); -abort_out: - /* abort the corrupt inode, as it was not attached to the buffer */ - xfs_iflush_abort(ip, false); return error; } -/* - * If there are inline format data / attr forks attached to this inode, - * make sure they're not corrupt. - */ -bool -xfs_inode_verify_forks( - struct xfs_inode *ip) -{ - struct xfs_ifork *ifp; - xfs_failaddr_t fa; - - fa = xfs_ifork_verify_data(ip, &xfs_default_ifork_ops); - if (fa) { - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork", - ifp->if_u1.if_data, ifp->if_bytes, fa); - return false; - } - - fa = xfs_ifork_verify_attr(ip, &xfs_default_ifork_ops); - if (fa) { - ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK); - xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", - ifp ? ifp->if_u1.if_data : NULL, - ifp ? ifp->if_bytes : 0, fa); - return false; - } - return true; -} - STATIC int xfs_iflush_int( struct xfs_inode *ip, @@ -3796,61 +3698,68 @@ xfs_iflush_int( struct xfs_inode_log_item *iip = ip->i_itemp; struct xfs_dinode *dip; struct xfs_mount *mp = ip->i_mount; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); - ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); + ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || + ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); ASSERT(iip != NULL && iip->ili_fields != 0); - /* set *dip = inode's place in the buffer */ dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); + /* + * We don't flush the inode if any of the following checks fail, but we + * do still update the log item and attach to the backing buffer as if + * the flush happened. This is a formality to facilitate predictable + * error handling as the caller will shutdown and fail the buffer. + */ + error = -EFSCORRUPTED; if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); - goto corrupt_out; + goto flush_out; } if (S_ISREG(VFS_I(ip)->i_mode)) { if (XFS_TEST_ERROR( - (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && - (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), + ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE, mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad regular inode %Lu, ptr "PTR_FMT, __func__, ip->i_ino, ip); - goto corrupt_out; + goto flush_out; } } else if (S_ISDIR(VFS_I(ip)->i_mode)) { if (XFS_TEST_ERROR( - (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && - (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && - (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), + ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE && + ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad directory inode %Lu, ptr "PTR_FMT, __func__, ip->i_ino, ip); - goto corrupt_out; + goto flush_out; } } - if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > + if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) > ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: detected corrupt incore inode %Lu, " "total extents = %d, nblocks = %Ld, ptr "PTR_FMT, __func__, ip->i_ino, - ip->i_d.di_nextents + ip->i_d.di_anextents, + ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp), ip->i_d.di_nblocks, ip); - goto corrupt_out; + goto flush_out; } if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_d.di_forkoff, ip); - goto corrupt_out; + goto flush_out; } /* @@ -3865,9 +3774,16 @@ xfs_iflush_int( if (!xfs_sb_version_has_v3inode(&mp->m_sb)) ip->i_d.di_flushiter++; - /* Check the inline fork data before we write out. */ - if (!xfs_inode_verify_forks(ip)) - goto corrupt_out; + /* + * If there are inline format data / attr forks attached to this inode, + * make sure they are not corrupt. + */ + if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL && + xfs_ifork_verify_local_data(ip)) + goto flush_out; + if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL && + xfs_ifork_verify_local_attr(ip)) + goto flush_out; /* * Copy the dirty parts of the inode into the on-disk inode. We always @@ -3910,6 +3826,8 @@ xfs_iflush_int( * need the AIL lock, because it is a 64 bit value that cannot be read * atomically. */ + error = 0; +flush_out: iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; @@ -3919,10 +3837,10 @@ xfs_iflush_int( &iip->ili_item.li_lsn); /* - * Attach the function xfs_iflush_done to the inode's - * buffer. This will remove the inode from the AIL - * and unlock the inode's flush lock when the inode is - * completely written to disk. + * Attach the inode item callback to the buffer whether the flush + * succeeded or not. If not, the caller will shut down and fail I/O + * completion on the buffer to remove the inode from the AIL and release + * the flush lock. */ xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); @@ -3931,10 +3849,7 @@ xfs_iflush_int( ASSERT(!list_empty(&bp->b_li_list)); ASSERT(bp->b_iodone != NULL); - return 0; - -corrupt_out: - return -EFSCORRUPTED; + return error; } /* Release an inode. */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c6a63f6764a6..dadcf1945896 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -57,9 +57,6 @@ typedef struct xfs_inode { struct xfs_icdinode i_d; /* most of ondisk inode */ - xfs_extnum_t i_cnextents; /* # of extents in cow fork */ - unsigned int i_cformat; /* format of cow fork */ - /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ @@ -467,6 +464,7 @@ int xfs_break_layouts(struct inode *inode, uint *iolock, /* from xfs_iops.c */ extern void xfs_setup_inode(struct xfs_inode *ip); extern void xfs_setup_iops(struct xfs_inode *ip); +extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); /* * When setting up a newly allocated inode, we need to call @@ -497,8 +495,6 @@ extern struct kmem_zone *xfs_inode_zone; /* The default CoW extent size hint. */ #define XFS_DEFAULT_COWEXTSZ_HINT 32 -bool xfs_inode_verify_forks(struct xfs_inode *ip); - int xfs_iunlink_init(struct xfs_perag *pag); void xfs_iunlink_destroy(struct xfs_perag *pag); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index f779cca2346f..ba47bf65b772 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -36,10 +36,10 @@ xfs_inode_item_data_fork_size( { struct xfs_inode *ip = iip->ili_inode; - switch (ip->i_d.di_format) { + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_DEXT) && - ip->i_d.di_nextents > 0 && + ip->i_df.if_nextents > 0 && ip->i_df.if_bytes > 0) { /* worst case, doesn't subtract delalloc extents */ *nbytes += XFS_IFORK_DSIZE(ip); @@ -77,10 +77,10 @@ xfs_inode_item_attr_fork_size( { struct xfs_inode *ip = iip->ili_inode; - switch (ip->i_d.di_aformat) { + switch (ip->i_afp->if_format) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_AEXT) && - ip->i_d.di_anextents > 0 && + ip->i_afp->if_nextents > 0 && ip->i_afp->if_bytes > 0) { /* worst case, doesn't subtract unused space */ *nbytes += XFS_IFORK_ASIZE(ip); @@ -142,13 +142,13 @@ xfs_inode_item_format_data_fork( struct xfs_inode *ip = iip->ili_inode; size_t data_bytes; - switch (ip->i_d.di_format) { + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEV); if ((iip->ili_fields & XFS_ILOG_DEXT) && - ip->i_d.di_nextents > 0 && + ip->i_df.if_nextents > 0 && ip->i_df.if_bytes > 0) { struct xfs_bmbt_rec *p; @@ -227,18 +227,18 @@ xfs_inode_item_format_attr_fork( struct xfs_inode *ip = iip->ili_inode; size_t data_bytes; - switch (ip->i_d.di_aformat) { + switch (ip->i_afp->if_format) { case XFS_DINODE_FMT_EXTENTS: iip->ili_fields &= ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); if ((iip->ili_fields & XFS_ILOG_AEXT) && - ip->i_d.di_anextents > 0 && + ip->i_afp->if_nextents > 0 && ip->i_afp->if_bytes > 0) { struct xfs_bmbt_rec *p; ASSERT(xfs_iext_count(ip->i_afp) == - ip->i_d.di_anextents); + ip->i_afp->if_nextents); p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT); data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK); @@ -305,7 +305,7 @@ xfs_inode_to_log_dinode( struct inode *inode = VFS_I(ip); to->di_magic = XFS_DINODE_MAGIC; - to->di_format = from->di_format; + to->di_format = xfs_ifork_format(&ip->i_df); to->di_uid = i_uid_read(inode); to->di_gid = i_gid_read(inode); to->di_projid_lo = from->di_projid & 0xffff; @@ -326,10 +326,10 @@ xfs_inode_to_log_dinode( to->di_size = from->di_size; to->di_nblocks = from->di_nblocks; to->di_extsize = from->di_extsize; - to->di_nextents = from->di_nextents; - to->di_anextents = from->di_anextents; + to->di_nextents = xfs_ifork_nextents(&ip->i_df); + to->di_anextents = xfs_ifork_nextents(ip->i_afp); to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; + to->di_aformat = xfs_ifork_format(ip->i_afp); to->di_dmevmask = from->di_dmevmask; to->di_dmstate = from->di_dmstate; to->di_flags = from->di_flags; @@ -497,21 +497,6 @@ xfs_inode_item_push( if (xfs_ipincount(ip) > 0) return XFS_ITEM_PINNED; - /* - * The buffer containing this item failed to be written back - * previously. Resubmit the buffer for IO. - */ - if (test_bit(XFS_LI_FAILED, &lip->li_flags)) { - if (!xfs_buf_trylock(bp)) - return XFS_ITEM_LOCKED; - - if (!xfs_buf_resubmit_failed_buffers(bp, buffer_list)) - rval = XFS_ITEM_FLUSHING; - - xfs_buf_unlock(bp); - return rval; - } - if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) return XFS_ITEM_LOCKED; @@ -777,17 +762,12 @@ xfs_iflush_done( */ void xfs_iflush_abort( - xfs_inode_t *ip, - bool stale) + struct xfs_inode *ip) { - xfs_inode_log_item_t *iip = ip->i_itemp; + struct xfs_inode_log_item *iip = ip->i_itemp; if (iip) { - if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)) { - xfs_trans_ail_remove(&iip->ili_item, - stale ? SHUTDOWN_LOG_IO_ERROR : - SHUTDOWN_CORRUPT_INCORE); - } + xfs_trans_ail_delete(&iip->ili_item, 0); iip->ili_logged = 0; /* * Clear the ili_last_fields bits now that we know that the @@ -812,7 +792,7 @@ xfs_istale_done( struct xfs_buf *bp, struct xfs_log_item *lip) { - xfs_iflush_abort(INODE_ITEM(lip)->ili_inode, true); + xfs_iflush_abort(INODE_ITEM(lip)->ili_inode); } /* diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 07a60e74c39c..60b34bb66e8e 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -13,7 +13,7 @@ struct xfs_bmbt_rec; struct xfs_inode; struct xfs_mount; -typedef struct xfs_inode_log_item { +struct xfs_inode_log_item { struct xfs_log_item ili_item; /* common portion */ struct xfs_inode *ili_inode; /* inode ptr */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ @@ -23,7 +23,7 @@ typedef struct xfs_inode_log_item { unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ unsigned int ili_fsync_fields; /* logged since last fsync */ -} xfs_inode_log_item_t; +}; static inline int xfs_inode_clean(xfs_inode_t *ip) { @@ -34,7 +34,7 @@ extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *); extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); -extern void xfs_iflush_abort(struct xfs_inode *, bool); +extern void xfs_iflush_abort(struct xfs_inode *); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, struct xfs_inode_log_format *); diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c new file mode 100644 index 000000000000..dc3e26ff16c9 --- /dev/null +++ b/fs/xfs/xfs_inode_item_recover.c @@ -0,0 +1,394 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_trace.h" +#include "xfs_trans_priv.h" +#include "xfs_buf_item.h" +#include "xfs_log.h" +#include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" +#include "xfs_icache.h" +#include "xfs_bmap_btree.h" + +STATIC void +xlog_recover_inode_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { + struct xfs_inode_log_format *ilfp = item->ri_buf[0].i_addr; + + xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, + &xfs_inode_buf_ra_ops); + } else { + struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].i_addr; + + xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, + &xfs_inode_buf_ra_ops); + } +} + +/* + * Inode fork owner changes + * + * If we have been told that we have to reparent the inode fork, it's because an + * extent swap operation on a CRC enabled filesystem has been done and we are + * replaying it. We need to walk the BMBT of the appropriate fork and change the + * owners of it. + * + * The complexity here is that we don't have an inode context to work with, so + * after we've replayed the inode we need to instantiate one. This is where the + * fun begins. + * + * We are in the middle of log recovery, so we can't run transactions. That + * means we cannot use cache coherent inode instantiation via xfs_iget(), as + * that will result in the corresponding iput() running the inode through + * xfs_inactive(). If we've just replayed an inode core that changes the link + * count to zero (i.e. it's been unlinked), then xfs_inactive() will run + * transactions (bad!). + * + * So, to avoid this, we instantiate an inode directly from the inode core we've + * just recovered. We have the buffer still locked, and all we really need to + * instantiate is the inode core and the forks being modified. We can do this + * manually, then run the inode btree owner change, and then tear down the + * xfs_inode without having to run any transactions at all. + * + * Also, because we don't have a transaction context available here but need to + * gather all the buffers we modify for writeback so we pass the buffer_list + * instead for the operation to use. + */ + +STATIC int +xfs_recover_inode_owner_change( + struct xfs_mount *mp, + struct xfs_dinode *dip, + struct xfs_inode_log_format *in_f, + struct list_head *buffer_list) +{ + struct xfs_inode *ip; + int error; + + ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); + + ip = xfs_inode_alloc(mp, in_f->ilf_ino); + if (!ip) + return -ENOMEM; + + /* instantiate the inode */ + ASSERT(dip->di_version >= 3); + + error = xfs_inode_from_disk(ip, dip); + if (error) + goto out_free_ip; + + if (in_f->ilf_fields & XFS_ILOG_DOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + + if (in_f->ilf_fields & XFS_ILOG_AOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + +out_free_ip: + xfs_inode_free(ip); + return error; +} + +STATIC int +xlog_recover_inode_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) +{ + struct xfs_inode_log_format *in_f; + struct xfs_mount *mp = log->l_mp; + struct xfs_buf *bp; + struct xfs_dinode *dip; + int len; + char *src; + char *dest; + int error; + int attr_index; + uint fields; + struct xfs_log_dinode *ldip; + uint isize; + int need_free = 0; + + if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { + in_f = item->ri_buf[0].i_addr; + } else { + in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0); + need_free = 1; + error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); + if (error) + goto error; + } + + /* + * Inode buffers can be freed, look out for it, + * and do not replay the inode. + */ + if (xlog_is_buffer_cancelled(log, in_f->ilf_blkno, in_f->ilf_len)) { + error = 0; + trace_xfs_log_recover_inode_cancel(log, in_f); + goto error; + } + trace_xfs_log_recover_inode_recover(log, in_f); + + error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, + 0, &bp, &xfs_inode_buf_ops); + if (error) + goto error; + ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); + dip = xfs_buf_offset(bp, in_f->ilf_boffset); + + /* + * Make sure the place we're flushing out to really looks + * like an inode! + */ + if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) { + xfs_alert(mp, + "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", + __func__, dip, bp, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } + ldip = item->ri_buf[1].i_addr; + if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) { + xfs_alert(mp, + "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld", + __func__, item, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } + + /* + * If the inode has an LSN in it, recover the inode only if it's less + * than the lsn of the transaction we are replaying. Note: we still + * need to replay an owner change even though the inode is more recent + * than the transaction as there is no guarantee that all the btree + * blocks are more recent than this transaction, too. + */ + if (dip->di_version >= 3) { + xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_owner_change; + } + } + + /* + * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes + * are transactional and if ordering is necessary we can determine that + * more accurately by the LSN field in the V3 inode core. Don't trust + * the inode versions we might be changing them here - use the + * superblock flag to determine whether we need to look at di_flushiter + * to skip replay when the on disk inode is newer than the log one + */ + if (!xfs_sb_version_has_v3inode(&mp->m_sb) && + ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + /* + * Deal with the wrap case, DI_MAX_FLUSH is less + * than smaller numbers + */ + if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && + ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { + /* do nothing */ + } else { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_release; + } + } + + /* Take the opportunity to reset the flush iteration count */ + ldip->di_flushiter = 0; + + if (unlikely(S_ISREG(ldip->di_mode))) { + if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && + (ldip->di_format != XFS_DINODE_FMT_BTREE)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); + xfs_alert(mp, + "%s: Bad regular inode log record, rec ptr "PTR_FMT", " + "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", + __func__, item, dip, bp, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } + } else if (unlikely(S_ISDIR(ldip->di_mode))) { + if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && + (ldip->di_format != XFS_DINODE_FMT_BTREE) && + (ldip->di_format != XFS_DINODE_FMT_LOCAL)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); + xfs_alert(mp, + "%s: Bad dir inode log record, rec ptr "PTR_FMT", " + "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", + __func__, item, dip, bp, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } + } + if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){ + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); + xfs_alert(mp, + "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " + "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld", + __func__, item, dip, bp, in_f->ilf_ino, + ldip->di_nextents + ldip->di_anextents, + ldip->di_nblocks); + error = -EFSCORRUPTED; + goto out_release; + } + if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); + xfs_alert(mp, + "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " + "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__, + item, dip, bp, in_f->ilf_ino, ldip->di_forkoff); + error = -EFSCORRUPTED; + goto out_release; + } + isize = xfs_log_dinode_size(mp); + if (unlikely(item->ri_buf[1].i_len > isize)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); + xfs_alert(mp, + "%s: Bad inode log record length %d, rec ptr "PTR_FMT, + __func__, item->ri_buf[1].i_len, item); + error = -EFSCORRUPTED; + goto out_release; + } + + /* recover the log dinode inode into the on disk inode */ + xfs_log_dinode_to_disk(ldip, dip); + + fields = in_f->ilf_fields; + if (fields & XFS_ILOG_DEV) + xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); + + if (in_f->ilf_size == 2) + goto out_owner_change; + len = item->ri_buf[2].i_len; + src = item->ri_buf[2].i_addr; + ASSERT(in_f->ilf_size <= 4); + ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); + ASSERT(!(fields & XFS_ILOG_DFORK) || + (len == in_f->ilf_dsize)); + + switch (fields & XFS_ILOG_DFORK) { + case XFS_ILOG_DDATA: + case XFS_ILOG_DEXT: + memcpy(XFS_DFORK_DPTR(dip), src, len); + break; + + case XFS_ILOG_DBROOT: + xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, + (struct xfs_bmdr_block *)XFS_DFORK_DPTR(dip), + XFS_DFORK_DSIZE(dip, mp)); + break; + + default: + /* + * There are no data fork flags set. + */ + ASSERT((fields & XFS_ILOG_DFORK) == 0); + break; + } + + /* + * If we logged any attribute data, recover it. There may or + * may not have been any other non-core data logged in this + * transaction. + */ + if (in_f->ilf_fields & XFS_ILOG_AFORK) { + if (in_f->ilf_fields & XFS_ILOG_DFORK) { + attr_index = 3; + } else { + attr_index = 2; + } + len = item->ri_buf[attr_index].i_len; + src = item->ri_buf[attr_index].i_addr; + ASSERT(len == in_f->ilf_asize); + + switch (in_f->ilf_fields & XFS_ILOG_AFORK) { + case XFS_ILOG_ADATA: + case XFS_ILOG_AEXT: + dest = XFS_DFORK_APTR(dip); + ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); + memcpy(dest, src, len); + break; + + case XFS_ILOG_ABROOT: + dest = XFS_DFORK_APTR(dip); + xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, + len, (struct xfs_bmdr_block *)dest, + XFS_DFORK_ASIZE(dip, mp)); + break; + + default: + xfs_warn(log->l_mp, "%s: Invalid flag", __func__); + ASSERT(0); + error = -EFSCORRUPTED; + goto out_release; + } + } + +out_owner_change: + /* Recover the swapext owner change unless inode has been deleted */ + if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) && + (dip->di_mode != 0)) + error = xfs_recover_inode_owner_change(mp, dip, in_f, + buffer_list); + /* re-generate the checksum. */ + xfs_dinode_calc_crc(log->l_mp, dip); + + ASSERT(bp->b_mount == mp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); + +out_release: + xfs_buf_relse(bp); +error: + if (need_free) + kmem_free(in_f); + return error; +} + +const struct xlog_recover_item_ops xlog_inode_item_ops = { + .item_type = XFS_LI_INODE, + .ra_pass2 = xlog_recover_inode_ra_pass2, + .commit_pass2 = xlog_recover_inode_commit_pass2, +}; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 309958186d33..a40f88cf3ab7 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1104,26 +1104,17 @@ xfs_fill_fsxattr( bool attr, struct fsxattr *fa) { + struct xfs_ifork *ifp = attr ? ip->i_afp : &ip->i_df; + simple_fill_fsxattr(fa, xfs_ip2xflags(ip)); fa->fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; fa->fsx_cowextsize = ip->i_d.di_cowextsize << ip->i_mount->m_sb.sb_blocklog; fa->fsx_projid = ip->i_d.di_projid; - - if (attr) { - if (ip->i_afp) { - if (ip->i_afp->if_flags & XFS_IFEXTENTS) - fa->fsx_nextents = xfs_iext_count(ip->i_afp); - else - fa->fsx_nextents = ip->i_d.di_anextents; - } else - fa->fsx_nextents = 0; - } else { - if (ip->i_df.if_flags & XFS_IFEXTENTS) - fa->fsx_nextents = xfs_iext_count(&ip->i_df); - else - fa->fsx_nextents = ip->i_d.di_nextents; - } + if (ifp && (ifp->if_flags & XFS_IFEXTENTS)) + fa->fsx_nextents = xfs_iext_count(ifp); + else + fa->fsx_nextents = xfs_ifork_nextents(ifp); } STATIC int @@ -1201,37 +1192,6 @@ xfs_flags2diflags2( return di_flags2; } -STATIC void -xfs_diflags_to_linux( - struct xfs_inode *ip) -{ - struct inode *inode = VFS_I(ip); - unsigned int xflags = xfs_ip2xflags(ip); - - if (xflags & FS_XFLAG_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= ~S_IMMUTABLE; - if (xflags & FS_XFLAG_APPEND) - inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; - if (xflags & FS_XFLAG_SYNC) - inode->i_flags |= S_SYNC; - else - inode->i_flags &= ~S_SYNC; - if (xflags & FS_XFLAG_NOATIME) - inode->i_flags |= S_NOATIME; - else - inode->i_flags &= ~S_NOATIME; -#if 0 /* disabled until the flag switching races are sorted out */ - if (xflags & FS_XFLAG_DAX) - inode->i_flags |= S_DAX; - else - inode->i_flags &= ~S_DAX; -#endif -} - static int xfs_ioctl_setattr_xflags( struct xfs_trans *tp, @@ -1242,7 +1202,7 @@ xfs_ioctl_setattr_xflags( uint64_t di_flags2; /* Can't change realtime flag if any extents are allocated. */ - if ((ip->i_d.di_nextents || ip->i_delayed_blks) && + if ((ip->i_df.if_nextents || ip->i_delayed_blks) && XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME)) return -EINVAL; @@ -1269,7 +1229,7 @@ xfs_ioctl_setattr_xflags( ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); ip->i_d.di_flags2 = di_flags2; - xfs_diflags_to_linux(ip); + xfs_diflags_to_iflags(ip, false); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); @@ -1420,7 +1380,7 @@ xfs_ioctl_setattr_check_extsize( xfs_extlen_t size; xfs_fsblock_t extsize_fsb; - if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_d.di_nextents && + if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents && ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize)) return -EINVAL; @@ -1513,7 +1473,6 @@ xfs_ioctl_setattr( struct fsxattr old_fa; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; - struct xfs_dquot *udqp = NULL; struct xfs_dquot *pdqp = NULL; struct xfs_dquot *olddquot = NULL; int code; @@ -1536,7 +1495,7 @@ xfs_ioctl_setattr( if (XFS_IS_QUOTA_ON(mp)) { code = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid, VFS_I(ip)->i_gid, fa->fsx_projid, - XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); + XFS_QMOPT_PQUOTA, NULL, NULL, &pdqp); if (code) return code; } @@ -1560,7 +1519,7 @@ xfs_ioctl_setattr( if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && ip->i_d.di_projid != fa->fsx_projid) { - code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp, + code = xfs_qm_vop_chown_reserve(tp, ip, NULL, NULL, pdqp, capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); if (code) /* out of quota */ goto error_trans_cancel; @@ -1626,7 +1585,6 @@ xfs_ioctl_setattr( * Release any dquot(s) the inode had kept before chown. */ xfs_qm_dqrele(olddquot); - xfs_qm_dqrele(udqp); xfs_qm_dqrele(pdqp); return code; @@ -1634,7 +1592,6 @@ xfs_ioctl_setattr( error_trans_cancel: xfs_trans_cancel(tp); error_free_dquots: - xfs_qm_dqrele(udqp); xfs_qm_dqrele(pdqp); return code; } @@ -2082,6 +2039,41 @@ out: return error; } +static inline int +xfs_fs_eofblocks_from_user( + struct xfs_fs_eofblocks *src, + struct xfs_eofblocks *dst) +{ + if (src->eof_version != XFS_EOFBLOCKS_VERSION) + return -EINVAL; + + if (src->eof_flags & ~XFS_EOF_FLAGS_VALID) + return -EINVAL; + + if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) || + memchr_inv(src->pad64, 0, sizeof(src->pad64))) + return -EINVAL; + + dst->eof_flags = src->eof_flags; + dst->eof_prid = src->eof_prid; + dst->eof_min_file_size = src->eof_min_file_size; + + dst->eof_uid = INVALID_UID; + if (src->eof_flags & XFS_EOF_FLAGS_UID) { + dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid); + if (!uid_valid(dst->eof_uid)) + return -EINVAL; + } + + dst->eof_gid = INVALID_GID; + if (src->eof_flags & XFS_EOF_FLAGS_GID) { + dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid); + if (!gid_valid(dst->eof_gid)) + return -EINVAL; + } + return 0; +} + /* * Note: some of the ioctl's return positive numbers as a * byte count indicating success, such as readlink_by_handle. diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index bb590a267a7f..b9a8c3798e08 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -352,22 +352,10 @@ xfs_quota_calc_throttle( } /* - * If we are doing a write at the end of the file and there are no allocations - * past this one, then extend the allocation out to the file system's write - * iosize. - * * If we don't have a user specified preallocation size, dynamically increase * the preallocation size as the size of the file grows. Cap the maximum size * at a single extent or less if the filesystem is near full. The closer the - * filesystem is to full, the smaller the maximum prealocation. - * - * As an exception we don't do any preallocation at all if the file is smaller - * than the minimum preallocation and we are using the default dynamic - * preallocation scheme, as it is likely this is the only write to the file that - * is going to be done. - * - * We clean up any extra space left over when the file is closed in - * xfs_inactive(). + * filesystem is to being full, the smaller the maximum preallocation. */ STATIC xfs_fsblock_t xfs_iomap_prealloc_size( @@ -377,63 +365,70 @@ xfs_iomap_prealloc_size( loff_t count, struct xfs_iext_cursor *icur) { + struct xfs_iext_cursor ncur = *icur; + struct xfs_bmbt_irec prev, got; struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); - struct xfs_bmbt_irec prev; - int shift = 0; int64_t freesp; xfs_fsblock_t qblocks; - int qshift = 0; xfs_fsblock_t alloc_blocks = 0; + xfs_extlen_t plen; + int shift = 0; + int qshift = 0; - if (offset + count <= XFS_ISIZE(ip)) - return 0; - - if (!(mp->m_flags & XFS_MOUNT_ALLOCSIZE) && - (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks))) + /* + * As an exception we don't do any preallocation at all if the file is + * smaller than the minimum preallocation and we are using the default + * dynamic preallocation scheme, as it is likely this is the only write + * to the file that is going to be done. + */ + if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks)) return 0; /* - * If an explicit allocsize is set, the file is small, or we - * are writing behind a hole, then use the minimum prealloc: + * Use the minimum preallocation size for small files or if we are + * writing right after a hole. */ - if ((mp->m_flags & XFS_MOUNT_ALLOCSIZE) || - XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || - !xfs_iext_peek_prev_extent(ifp, icur, &prev) || + if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || + !xfs_iext_prev_extent(ifp, &ncur, &prev) || prev.br_startoff + prev.br_blockcount < offset_fsb) return mp->m_allocsize_blocks; /* - * Determine the initial size of the preallocation. We are beyond the - * current EOF here, but we need to take into account whether this is - * a sparse write or an extending write when determining the - * preallocation size. Hence we need to look up the extent that ends - * at the current write offset and use the result to determine the - * preallocation size. - * - * If the extent is a hole, then preallocation is essentially disabled. - * Otherwise we take the size of the preceding data extent as the basis - * for the preallocation size. If the size of the extent is greater than - * half the maximum extent length, then use the current offset as the - * basis. This ensures that for large files the preallocation size - * always extends to MAXEXTLEN rather than falling short due to things - * like stripe unit/width alignment of real extents. + * Take the size of the preceding data extents as the basis for the + * preallocation size. Note that we don't care if the previous extents + * are written or not. */ - if (prev.br_blockcount <= (MAXEXTLEN >> 1)) - alloc_blocks = prev.br_blockcount << 1; - else + plen = prev.br_blockcount; + while (xfs_iext_prev_extent(ifp, &ncur, &got)) { + if (plen > MAXEXTLEN / 2 || + isnullstartblock(got.br_startblock) || + got.br_startoff + got.br_blockcount != prev.br_startoff || + got.br_startblock + got.br_blockcount != prev.br_startblock) + break; + plen += got.br_blockcount; + prev = got; + } + + /* + * If the size of the extents is greater than half the maximum extent + * length, then use the current offset as the basis. This ensures that + * for large files the preallocation size always extends to MAXEXTLEN + * rather than falling short due to things like stripe unit/width + * alignment of real extents. + */ + alloc_blocks = plen * 2; + if (alloc_blocks > MAXEXTLEN) alloc_blocks = XFS_B_TO_FSB(mp, offset); - if (!alloc_blocks) - goto check_writeio; qblocks = alloc_blocks; /* * MAXEXTLEN is not a power of two value but we round the prealloc down * to the nearest power of two value after throttling. To prevent the - * round down from unconditionally reducing the maximum supported prealloc - * size, we round up first, apply appropriate throttling, round down and - * cap the value to MAXEXTLEN. + * round down from unconditionally reducing the maximum supported + * prealloc size, we round up first, apply appropriate throttling, + * round down and cap the value to MAXEXTLEN. */ alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), alloc_blocks); @@ -494,7 +489,6 @@ xfs_iomap_prealloc_size( */ while (alloc_blocks && alloc_blocks >= freesp) alloc_blocks >>= 4; -check_writeio: if (alloc_blocks < mp->m_allocsize_blocks) alloc_blocks = mp->m_allocsize_blocks; trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift, @@ -563,7 +557,7 @@ xfs_iomap_write_unwritten( xfs_trans_ijoin(tp, ip, 0); error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, - XFS_QMOPT_RES_REGBLKS); + XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES); if (error) goto error_on_bmapi_transaction; @@ -856,7 +850,7 @@ xfs_buffered_write_iomap_begin( xfs_ilock(ip, XFS_ILOCK_EXCL); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, XFS_DATA_FORK)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { error = -EFSCORRUPTED; goto out_unlock; @@ -961,9 +955,16 @@ xfs_buffered_write_iomap_begin( if (error) goto out_unlock; - if (eof) { - prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork, offset, - count, &icur); + if (eof && offset + count > XFS_ISIZE(ip)) { + /* + * Determine the initial size of the preallocation. + * We clean up any extra preallocation when the file is closed. + */ + if (mp->m_flags & XFS_MOUNT_ALLOCSIZE) + prealloc_blocks = mp->m_allocsize_blocks; + else + prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork, + offset, count, &icur); if (prealloc_blocks) { xfs_extlen_t align; xfs_off_t end_offset; @@ -1258,12 +1259,12 @@ xfs_xattr_iomap_begin( lockmode = xfs_ilock_attr_map_shared(ip); /* if there are no attribute fork or extents, return ENOENT */ - if (!XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) { + if (!XFS_IFORK_Q(ip) || !ip->i_afp->if_nextents) { error = -ENOENT; goto out_unlock; } - ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL); + ASSERT(ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f7a99b3bbcf7..d66528fa3657 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -738,12 +738,7 @@ xfs_setattr_nonsize( if (error) /* out of quota */ goto out_cancel; } - } - /* - * Change file ownership. Must be the owner or privileged. - */ - if (mask & (ATTR_UID|ATTR_GID)) { /* * CAP_FSETID overrides the following restrictions: * @@ -877,7 +872,7 @@ xfs_setattr_size( /* * Short circuit the truncate case for zero length files. */ - if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { + if (newsize == 0 && oldsize == 0 && ip->i_df.if_nextents == 0) { if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME))) return 0; @@ -1243,13 +1238,12 @@ xfs_inode_supports_dax( { struct xfs_mount *mp = ip->i_mount; - /* Only supported on non-reflinked files. */ - if (!S_ISREG(VFS_I(ip)->i_mode) || xfs_is_reflink_inode(ip)) + /* Only supported on regular files. */ + if (!S_ISREG(VFS_I(ip)->i_mode)) return false; - /* DAX mount option or DAX iflag must be set. */ - if (!(mp->m_flags & XFS_MOUNT_DAX) && - !(ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) + /* Only supported on non-reflinked files. */ + if (xfs_is_reflink_inode(ip)) return false; /* Block size must match page size */ @@ -1260,26 +1254,51 @@ xfs_inode_supports_dax( return xfs_inode_buftarg(ip)->bt_daxdev != NULL; } -STATIC void +static bool +xfs_inode_should_enable_dax( + struct xfs_inode *ip) +{ + if (!IS_ENABLED(CONFIG_FS_DAX)) + return false; + if (ip->i_mount->m_flags & XFS_MOUNT_DAX_NEVER) + return false; + if (!xfs_inode_supports_dax(ip)) + return false; + if (ip->i_mount->m_flags & XFS_MOUNT_DAX_ALWAYS) + return true; + if (ip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + return true; + return false; +} + +void xfs_diflags_to_iflags( - struct inode *inode, - struct xfs_inode *ip) + struct xfs_inode *ip, + bool init) { - uint16_t flags = ip->i_d.di_flags; - - inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | - S_NOATIME | S_DAX); - - if (flags & XFS_DIFLAG_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; - if (flags & XFS_DIFLAG_APPEND) - inode->i_flags |= S_APPEND; - if (flags & XFS_DIFLAG_SYNC) - inode->i_flags |= S_SYNC; - if (flags & XFS_DIFLAG_NOATIME) - inode->i_flags |= S_NOATIME; - if (xfs_inode_supports_dax(ip)) - inode->i_flags |= S_DAX; + struct inode *inode = VFS_I(ip); + unsigned int xflags = xfs_ip2xflags(ip); + unsigned int flags = 0; + + ASSERT(!(IS_DAX(inode) && init)); + + if (xflags & FS_XFLAG_IMMUTABLE) + flags |= S_IMMUTABLE; + if (xflags & FS_XFLAG_APPEND) + flags |= S_APPEND; + if (xflags & FS_XFLAG_SYNC) + flags |= S_SYNC; + if (xflags & FS_XFLAG_NOATIME) + flags |= S_NOATIME; + if (init && xfs_inode_should_enable_dax(ip)) + flags |= S_DAX; + + /* + * S_DAX can only be set during inode initialization and is never set by + * the VFS, so we cannot mask off S_DAX in i_flags. + */ + inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME); + inode->i_flags |= flags; } /* @@ -1305,7 +1324,7 @@ xfs_setup_inode( inode_fake_hash(inode); i_size_write(inode, ip->i_d.di_size); - xfs_diflags_to_iflags(inode, ip); + xfs_diflags_to_iflags(ip, true); if (S_ISDIR(inode->i_mode)) { /* diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index ff2da28fed90..16ca97a7ff00 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -104,9 +104,9 @@ xfs_bulkstat_one_int( buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize_blks = dic->di_extsize; - buf->bs_extents = dic->di_nextents; + buf->bs_extents = xfs_ifork_nextents(&ip->i_df); xfs_bulkstat_health(ip, buf); - buf->bs_aextents = dic->di_anextents; + buf->bs_aextents = xfs_ifork_nextents(ip->i_afp); buf->bs_forkoff = XFS_IFORK_BOFF(ip); buf->bs_version = XFS_BULKSTAT_VERSION_V5; @@ -115,7 +115,7 @@ xfs_bulkstat_one_int( buf->bs_cowextsize_blks = dic->di_cowextsize; } - switch (dic->di_format) { + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_DEV: buf->bs_rdev = sysv_encode_dev(inode->i_rdev); buf->bs_blksize = BLKDEV_IOSIZE; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 11c3502b07b1..ec015df55b77 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -18,21 +18,13 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" -#include "xfs_inode_item.h" -#include "xfs_extfree_item.h" #include "xfs_trans_priv.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" -#include "xfs_quota.h" #include "xfs_trace.h" #include "xfs_icache.h" -#include "xfs_bmap_btree.h" #include "xfs_error.h" -#include "xfs_dir2.h" -#include "xfs_rmap_item.h" #include "xfs_buf_item.h" -#include "xfs_refcount_item.h" -#include "xfs_bmap_item.h" #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) @@ -56,17 +48,6 @@ xlog_do_recovery_pass( struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *); /* - * This structure is used during recovery to record the buf log items which - * have been canceled and should not be replayed. - */ -struct xfs_buf_cancel { - xfs_daddr_t bc_blkno; - uint bc_len; - int bc_refcount; - struct list_head bc_list; -}; - -/* * Sector aligned buffer routines for buffer create/read/write/access */ @@ -284,7 +265,7 @@ xlog_header_check_mount( return 0; } -STATIC void +void xlog_recover_iodone( struct xfs_buf *bp) { @@ -1779,12 +1760,72 @@ xlog_clear_stale_blocks( return 0; } +/* + * Release the recovered intent item in the AIL that matches the given intent + * type and intent id. + */ +void +xlog_recover_release_intent( + struct xlog *log, + unsigned short intent_type, + uint64_t intent_id) +{ + struct xfs_ail_cursor cur; + struct xfs_log_item *lip; + struct xfs_ail *ailp = log->l_ailp; + + spin_lock(&ailp->ail_lock); + for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; + lip = xfs_trans_ail_cursor_next(ailp, &cur)) { + if (lip->li_type != intent_type) + continue; + if (!lip->li_ops->iop_match(lip, intent_id)) + continue; + + spin_unlock(&ailp->ail_lock); + lip->li_ops->iop_release(lip); + spin_lock(&ailp->ail_lock); + break; + } + + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->ail_lock); +} + /****************************************************************************** * * Log recover routines * ****************************************************************************** */ +static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = { + &xlog_buf_item_ops, + &xlog_inode_item_ops, + &xlog_dquot_item_ops, + &xlog_quotaoff_item_ops, + &xlog_icreate_item_ops, + &xlog_efi_item_ops, + &xlog_efd_item_ops, + &xlog_rui_item_ops, + &xlog_rud_item_ops, + &xlog_cui_item_ops, + &xlog_cud_item_ops, + &xlog_bui_item_ops, + &xlog_bud_item_ops, +}; + +static const struct xlog_recover_item_ops * +xlog_find_item_ops( + struct xlog_recover_item *item) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(xlog_recover_item_ops); i++) + if (ITEM_TYPE(item) == xlog_recover_item_ops[i]->item_type) + return xlog_recover_item_ops[i]; + + return NULL; +} /* * Sort the log items in the transaction. @@ -1841,54 +1882,23 @@ xlog_recover_reorder_trans( struct xlog_recover *trans, int pass) { - xlog_recover_item_t *item, *n; + struct xlog_recover_item *item, *n; int error = 0; LIST_HEAD(sort_list); LIST_HEAD(cancel_list); LIST_HEAD(buffer_list); LIST_HEAD(inode_buffer_list); - LIST_HEAD(inode_list); + LIST_HEAD(item_list); list_splice_init(&trans->r_itemq, &sort_list); list_for_each_entry_safe(item, n, &sort_list, ri_list) { - xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + enum xlog_recover_reorder fate = XLOG_REORDER_ITEM_LIST; - switch (ITEM_TYPE(item)) { - case XFS_LI_ICREATE: - list_move_tail(&item->ri_list, &buffer_list); - break; - case XFS_LI_BUF: - if (buf_f->blf_flags & XFS_BLF_CANCEL) { - trace_xfs_log_recover_item_reorder_head(log, - trans, item, pass); - list_move(&item->ri_list, &cancel_list); - break; - } - if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { - list_move(&item->ri_list, &inode_buffer_list); - break; - } - list_move_tail(&item->ri_list, &buffer_list); - break; - case XFS_LI_INODE: - case XFS_LI_DQUOT: - case XFS_LI_QUOTAOFF: - case XFS_LI_EFD: - case XFS_LI_EFI: - case XFS_LI_RUI: - case XFS_LI_RUD: - case XFS_LI_CUI: - case XFS_LI_CUD: - case XFS_LI_BUI: - case XFS_LI_BUD: - trace_xfs_log_recover_item_reorder_tail(log, - trans, item, pass); - list_move_tail(&item->ri_list, &inode_list); - break; - default: + item->ri_ops = xlog_find_item_ops(item); + if (!item->ri_ops) { xfs_warn(log->l_mp, - "%s: unrecognized type of log operation", - __func__); + "%s: unrecognized type of log operation (%d)", + __func__, ITEM_TYPE(item)); ASSERT(0); /* * return the remaining items back to the transaction @@ -1896,16 +1906,38 @@ xlog_recover_reorder_trans( */ if (!list_empty(&sort_list)) list_splice_init(&sort_list, &trans->r_itemq); - error = -EIO; - goto out; + error = -EFSCORRUPTED; + break; + } + + if (item->ri_ops->reorder) + fate = item->ri_ops->reorder(item); + + switch (fate) { + case XLOG_REORDER_BUFFER_LIST: + list_move_tail(&item->ri_list, &buffer_list); + break; + case XLOG_REORDER_CANCEL_LIST: + trace_xfs_log_recover_item_reorder_head(log, + trans, item, pass); + list_move(&item->ri_list, &cancel_list); + break; + case XLOG_REORDER_INODE_BUFFER_LIST: + list_move(&item->ri_list, &inode_buffer_list); + break; + case XLOG_REORDER_ITEM_LIST: + trace_xfs_log_recover_item_reorder_tail(log, + trans, item, pass); + list_move_tail(&item->ri_list, &item_list); + break; } } -out: + ASSERT(list_empty(&sort_list)); if (!list_empty(&buffer_list)) list_splice(&buffer_list, &trans->r_itemq); - if (!list_empty(&inode_list)) - list_splice_tail(&inode_list, &trans->r_itemq); + if (!list_empty(&item_list)) + list_splice_tail(&item_list, &trans->r_itemq); if (!list_empty(&inode_buffer_list)) list_splice_tail(&inode_buffer_list, &trans->r_itemq); if (!list_empty(&cancel_list)) @@ -1913,2152 +1945,15 @@ out: return error; } -/* - * Build up the table of buf cancel records so that we don't replay - * cancelled data in the second pass. For buffer records that are - * not cancel records, there is nothing to do here so we just return. - * - * If we get a cancel record which is already in the table, this indicates - * that the buffer was cancelled multiple times. In order to ensure - * that during pass 2 we keep the record in the table until we reach its - * last occurrence in the log, we keep a reference count in the cancel - * record in the table to tell us how many times we expect to see this - * record during the second pass. - */ -STATIC int -xlog_recover_buffer_pass1( - struct xlog *log, - struct xlog_recover_item *item) -{ - xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; - struct list_head *bucket; - struct xfs_buf_cancel *bcp; - - if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { - xfs_err(log->l_mp, "bad buffer log item size (%d)", - item->ri_buf[0].i_len); - return -EFSCORRUPTED; - } - - /* - * If this isn't a cancel buffer item, then just return. - */ - if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { - trace_xfs_log_recover_buf_not_cancel(log, buf_f); - return 0; - } - - /* - * Insert an xfs_buf_cancel record into the hash table of them. - * If there is already an identical record, bump its reference count. - */ - bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno); - list_for_each_entry(bcp, bucket, bc_list) { - if (bcp->bc_blkno == buf_f->blf_blkno && - bcp->bc_len == buf_f->blf_len) { - bcp->bc_refcount++; - trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); - return 0; - } - } - - bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0); - bcp->bc_blkno = buf_f->blf_blkno; - bcp->bc_len = buf_f->blf_len; - bcp->bc_refcount = 1; - list_add_tail(&bcp->bc_list, bucket); - - trace_xfs_log_recover_buf_cancel_add(log, buf_f); - return 0; -} - -/* - * Check to see whether the buffer being recovered has a corresponding - * entry in the buffer cancel record table. If it is, return the cancel - * buffer structure to the caller. - */ -STATIC struct xfs_buf_cancel * -xlog_peek_buffer_cancelled( - struct xlog *log, - xfs_daddr_t blkno, - uint len, - unsigned short flags) -{ - struct list_head *bucket; - struct xfs_buf_cancel *bcp; - - if (!log->l_buf_cancel_table) { - /* empty table means no cancelled buffers in the log */ - ASSERT(!(flags & XFS_BLF_CANCEL)); - return NULL; - } - - bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); - list_for_each_entry(bcp, bucket, bc_list) { - if (bcp->bc_blkno == blkno && bcp->bc_len == len) - return bcp; - } - - /* - * We didn't find a corresponding entry in the table, so return 0 so - * that the buffer is NOT cancelled. - */ - ASSERT(!(flags & XFS_BLF_CANCEL)); - return NULL; -} - -/* - * If the buffer is being cancelled then return 1 so that it will be cancelled, - * otherwise return 0. If the buffer is actually a buffer cancel item - * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the - * table and remove it from the table if this is the last reference. - * - * We remove the cancel record from the table when we encounter its last - * occurrence in the log so that if the same buffer is re-used again after its - * last cancellation we actually replay the changes made at that point. - */ -STATIC int -xlog_check_buffer_cancelled( +void +xlog_buf_readahead( struct xlog *log, xfs_daddr_t blkno, uint len, - unsigned short flags) -{ - struct xfs_buf_cancel *bcp; - - bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags); - if (!bcp) - return 0; - - /* - * We've go a match, so return 1 so that the recovery of this buffer - * is cancelled. If this buffer is actually a buffer cancel log - * item, then decrement the refcount on the one in the table and - * remove it if this is the last reference. - */ - if (flags & XFS_BLF_CANCEL) { - if (--bcp->bc_refcount == 0) { - list_del(&bcp->bc_list); - kmem_free(bcp); - } - } - return 1; -} - -/* - * Perform recovery for a buffer full of inodes. In these buffers, the only - * data which should be recovered is that which corresponds to the - * di_next_unlinked pointers in the on disk inode structures. The rest of the - * data for the inodes is always logged through the inodes themselves rather - * than the inode buffer and is recovered in xlog_recover_inode_pass2(). - * - * The only time when buffers full of inodes are fully recovered is when the - * buffer is full of newly allocated inodes. In this case the buffer will - * not be marked as an inode buffer and so will be sent to - * xlog_recover_do_reg_buffer() below during recovery. - */ -STATIC int -xlog_recover_do_inode_buffer( - struct xfs_mount *mp, - xlog_recover_item_t *item, - struct xfs_buf *bp, - xfs_buf_log_format_t *buf_f) -{ - int i; - int item_index = 0; - int bit = 0; - int nbits = 0; - int reg_buf_offset = 0; - int reg_buf_bytes = 0; - int next_unlinked_offset; - int inodes_per_buf; - xfs_agino_t *logged_nextp; - xfs_agino_t *buffer_nextp; - - trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); - - /* - * Post recovery validation only works properly on CRC enabled - * filesystems. - */ - if (xfs_sb_version_hascrc(&mp->m_sb)) - bp->b_ops = &xfs_inode_buf_ops; - - inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog; - for (i = 0; i < inodes_per_buf; i++) { - next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + - offsetof(xfs_dinode_t, di_next_unlinked); - - while (next_unlinked_offset >= - (reg_buf_offset + reg_buf_bytes)) { - /* - * The next di_next_unlinked field is beyond - * the current logged region. Find the next - * logged region that contains or is beyond - * the current di_next_unlinked field. - */ - bit += nbits; - bit = xfs_next_bit(buf_f->blf_data_map, - buf_f->blf_map_size, bit); - - /* - * If there are no more logged regions in the - * buffer, then we're done. - */ - if (bit == -1) - return 0; - - nbits = xfs_contig_bits(buf_f->blf_data_map, - buf_f->blf_map_size, bit); - ASSERT(nbits > 0); - reg_buf_offset = bit << XFS_BLF_SHIFT; - reg_buf_bytes = nbits << XFS_BLF_SHIFT; - item_index++; - } - - /* - * If the current logged region starts after the current - * di_next_unlinked field, then move on to the next - * di_next_unlinked field. - */ - if (next_unlinked_offset < reg_buf_offset) - continue; - - ASSERT(item->ri_buf[item_index].i_addr != NULL); - ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); - ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length)); - - /* - * The current logged region contains a copy of the - * current di_next_unlinked field. Extract its value - * and copy it to the buffer copy. - */ - logged_nextp = item->ri_buf[item_index].i_addr + - next_unlinked_offset - reg_buf_offset; - if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) { - xfs_alert(mp, - "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). " - "Trying to replay bad (0) inode di_next_unlinked field.", - item, bp); - return -EFSCORRUPTED; - } - - buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset); - *buffer_nextp = *logged_nextp; - - /* - * If necessary, recalculate the CRC in the on-disk inode. We - * have to leave the inode in a consistent state for whoever - * reads it next.... - */ - xfs_dinode_calc_crc(mp, - xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); - - } - - return 0; -} - -/* - * V5 filesystems know the age of the buffer on disk being recovered. We can - * have newer objects on disk than we are replaying, and so for these cases we - * don't want to replay the current change as that will make the buffer contents - * temporarily invalid on disk. - * - * The magic number might not match the buffer type we are going to recover - * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence - * extract the LSN of the existing object in the buffer based on it's current - * magic number. If we don't recognise the magic number in the buffer, then - * return a LSN of -1 so that the caller knows it was an unrecognised block and - * so can recover the buffer. - * - * Note: we cannot rely solely on magic number matches to determine that the - * buffer has a valid LSN - we also need to verify that it belongs to this - * filesystem, so we need to extract the object's LSN and compare it to that - * which we read from the superblock. If the UUIDs don't match, then we've got a - * stale metadata block from an old filesystem instance that we need to recover - * over the top of. - */ -static xfs_lsn_t -xlog_recover_get_buf_lsn( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - uint32_t magic32; - uint16_t magic16; - uint16_t magicda; - void *blk = bp->b_addr; - uuid_t *uuid; - xfs_lsn_t lsn = -1; - - /* v4 filesystems always recover immediately */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) - goto recover_immediately; - - magic32 = be32_to_cpu(*(__be32 *)blk); - switch (magic32) { - case XFS_ABTB_CRC_MAGIC: - case XFS_ABTC_CRC_MAGIC: - case XFS_ABTB_MAGIC: - case XFS_ABTC_MAGIC: - case XFS_RMAP_CRC_MAGIC: - case XFS_REFC_CRC_MAGIC: - case XFS_IBT_CRC_MAGIC: - case XFS_IBT_MAGIC: { - struct xfs_btree_block *btb = blk; - - lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); - uuid = &btb->bb_u.s.bb_uuid; - break; - } - case XFS_BMAP_CRC_MAGIC: - case XFS_BMAP_MAGIC: { - struct xfs_btree_block *btb = blk; - - lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); - uuid = &btb->bb_u.l.bb_uuid; - break; - } - case XFS_AGF_MAGIC: - lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); - uuid = &((struct xfs_agf *)blk)->agf_uuid; - break; - case XFS_AGFL_MAGIC: - lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); - uuid = &((struct xfs_agfl *)blk)->agfl_uuid; - break; - case XFS_AGI_MAGIC: - lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); - uuid = &((struct xfs_agi *)blk)->agi_uuid; - break; - case XFS_SYMLINK_MAGIC: - lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); - uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; - break; - case XFS_DIR3_BLOCK_MAGIC: - case XFS_DIR3_DATA_MAGIC: - case XFS_DIR3_FREE_MAGIC: - lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); - uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; - break; - case XFS_ATTR3_RMT_MAGIC: - /* - * Remote attr blocks are written synchronously, rather than - * being logged. That means they do not contain a valid LSN - * (i.e. transactionally ordered) in them, and hence any time we - * see a buffer to replay over the top of a remote attribute - * block we should simply do so. - */ - goto recover_immediately; - case XFS_SB_MAGIC: - /* - * superblock uuids are magic. We may or may not have a - * sb_meta_uuid on disk, but it will be set in the in-core - * superblock. We set the uuid pointer for verification - * according to the superblock feature mask to ensure we check - * the relevant UUID in the superblock. - */ - lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); - if (xfs_sb_version_hasmetauuid(&mp->m_sb)) - uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid; - else - uuid = &((struct xfs_dsb *)blk)->sb_uuid; - break; - default: - break; - } - - if (lsn != (xfs_lsn_t)-1) { - if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) - goto recover_immediately; - return lsn; - } - - magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); - switch (magicda) { - case XFS_DIR3_LEAF1_MAGIC: - case XFS_DIR3_LEAFN_MAGIC: - case XFS_DA3_NODE_MAGIC: - lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); - uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; - break; - default: - break; - } - - if (lsn != (xfs_lsn_t)-1) { - if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) - goto recover_immediately; - return lsn; - } - - /* - * We do individual object checks on dquot and inode buffers as they - * have their own individual LSN records. Also, we could have a stale - * buffer here, so we have to at least recognise these buffer types. - * - * A notd complexity here is inode unlinked list processing - it logs - * the inode directly in the buffer, but we don't know which inodes have - * been modified, and there is no global buffer LSN. Hence we need to - * recover all inode buffer types immediately. This problem will be - * fixed by logical logging of the unlinked list modifications. - */ - magic16 = be16_to_cpu(*(__be16 *)blk); - switch (magic16) { - case XFS_DQUOT_MAGIC: - case XFS_DINODE_MAGIC: - goto recover_immediately; - default: - break; - } - - /* unknown buffer contents, recover immediately */ - -recover_immediately: - return (xfs_lsn_t)-1; - -} - -/* - * Validate the recovered buffer is of the correct type and attach the - * appropriate buffer operations to them for writeback. Magic numbers are in a - * few places: - * the first 16 bits of the buffer (inode buffer, dquot buffer), - * the first 32 bits of the buffer (most blocks), - * inside a struct xfs_da_blkinfo at the start of the buffer. - */ -static void -xlog_recover_validate_buf_type( - struct xfs_mount *mp, - struct xfs_buf *bp, - xfs_buf_log_format_t *buf_f, - xfs_lsn_t current_lsn) -{ - struct xfs_da_blkinfo *info = bp->b_addr; - uint32_t magic32; - uint16_t magic16; - uint16_t magicda; - char *warnmsg = NULL; - - /* - * We can only do post recovery validation on items on CRC enabled - * fielsystems as we need to know when the buffer was written to be able - * to determine if we should have replayed the item. If we replay old - * metadata over a newer buffer, then it will enter a temporarily - * inconsistent state resulting in verification failures. Hence for now - * just avoid the verification stage for non-crc filesystems - */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return; - - magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); - magic16 = be16_to_cpu(*(__be16*)bp->b_addr); - magicda = be16_to_cpu(info->magic); - switch (xfs_blft_from_flags(buf_f)) { - case XFS_BLFT_BTREE_BUF: - switch (magic32) { - case XFS_ABTB_CRC_MAGIC: - case XFS_ABTB_MAGIC: - bp->b_ops = &xfs_bnobt_buf_ops; - break; - case XFS_ABTC_CRC_MAGIC: - case XFS_ABTC_MAGIC: - bp->b_ops = &xfs_cntbt_buf_ops; - break; - case XFS_IBT_CRC_MAGIC: - case XFS_IBT_MAGIC: - bp->b_ops = &xfs_inobt_buf_ops; - break; - case XFS_FIBT_CRC_MAGIC: - case XFS_FIBT_MAGIC: - bp->b_ops = &xfs_finobt_buf_ops; - break; - case XFS_BMAP_CRC_MAGIC: - case XFS_BMAP_MAGIC: - bp->b_ops = &xfs_bmbt_buf_ops; - break; - case XFS_RMAP_CRC_MAGIC: - bp->b_ops = &xfs_rmapbt_buf_ops; - break; - case XFS_REFC_CRC_MAGIC: - bp->b_ops = &xfs_refcountbt_buf_ops; - break; - default: - warnmsg = "Bad btree block magic!"; - break; - } - break; - case XFS_BLFT_AGF_BUF: - if (magic32 != XFS_AGF_MAGIC) { - warnmsg = "Bad AGF block magic!"; - break; - } - bp->b_ops = &xfs_agf_buf_ops; - break; - case XFS_BLFT_AGFL_BUF: - if (magic32 != XFS_AGFL_MAGIC) { - warnmsg = "Bad AGFL block magic!"; - break; - } - bp->b_ops = &xfs_agfl_buf_ops; - break; - case XFS_BLFT_AGI_BUF: - if (magic32 != XFS_AGI_MAGIC) { - warnmsg = "Bad AGI block magic!"; - break; - } - bp->b_ops = &xfs_agi_buf_ops; - break; - case XFS_BLFT_UDQUOT_BUF: - case XFS_BLFT_PDQUOT_BUF: - case XFS_BLFT_GDQUOT_BUF: -#ifdef CONFIG_XFS_QUOTA - if (magic16 != XFS_DQUOT_MAGIC) { - warnmsg = "Bad DQUOT block magic!"; - break; - } - bp->b_ops = &xfs_dquot_buf_ops; -#else - xfs_alert(mp, - "Trying to recover dquots without QUOTA support built in!"); - ASSERT(0); -#endif - break; - case XFS_BLFT_DINO_BUF: - if (magic16 != XFS_DINODE_MAGIC) { - warnmsg = "Bad INODE block magic!"; - break; - } - bp->b_ops = &xfs_inode_buf_ops; - break; - case XFS_BLFT_SYMLINK_BUF: - if (magic32 != XFS_SYMLINK_MAGIC) { - warnmsg = "Bad symlink block magic!"; - break; - } - bp->b_ops = &xfs_symlink_buf_ops; - break; - case XFS_BLFT_DIR_BLOCK_BUF: - if (magic32 != XFS_DIR2_BLOCK_MAGIC && - magic32 != XFS_DIR3_BLOCK_MAGIC) { - warnmsg = "Bad dir block magic!"; - break; - } - bp->b_ops = &xfs_dir3_block_buf_ops; - break; - case XFS_BLFT_DIR_DATA_BUF: - if (magic32 != XFS_DIR2_DATA_MAGIC && - magic32 != XFS_DIR3_DATA_MAGIC) { - warnmsg = "Bad dir data magic!"; - break; - } - bp->b_ops = &xfs_dir3_data_buf_ops; - break; - case XFS_BLFT_DIR_FREE_BUF: - if (magic32 != XFS_DIR2_FREE_MAGIC && - magic32 != XFS_DIR3_FREE_MAGIC) { - warnmsg = "Bad dir3 free magic!"; - break; - } - bp->b_ops = &xfs_dir3_free_buf_ops; - break; - case XFS_BLFT_DIR_LEAF1_BUF: - if (magicda != XFS_DIR2_LEAF1_MAGIC && - magicda != XFS_DIR3_LEAF1_MAGIC) { - warnmsg = "Bad dir leaf1 magic!"; - break; - } - bp->b_ops = &xfs_dir3_leaf1_buf_ops; - break; - case XFS_BLFT_DIR_LEAFN_BUF: - if (magicda != XFS_DIR2_LEAFN_MAGIC && - magicda != XFS_DIR3_LEAFN_MAGIC) { - warnmsg = "Bad dir leafn magic!"; - break; - } - bp->b_ops = &xfs_dir3_leafn_buf_ops; - break; - case XFS_BLFT_DA_NODE_BUF: - if (magicda != XFS_DA_NODE_MAGIC && - magicda != XFS_DA3_NODE_MAGIC) { - warnmsg = "Bad da node magic!"; - break; - } - bp->b_ops = &xfs_da3_node_buf_ops; - break; - case XFS_BLFT_ATTR_LEAF_BUF: - if (magicda != XFS_ATTR_LEAF_MAGIC && - magicda != XFS_ATTR3_LEAF_MAGIC) { - warnmsg = "Bad attr leaf magic!"; - break; - } - bp->b_ops = &xfs_attr3_leaf_buf_ops; - break; - case XFS_BLFT_ATTR_RMT_BUF: - if (magic32 != XFS_ATTR3_RMT_MAGIC) { - warnmsg = "Bad attr remote magic!"; - break; - } - bp->b_ops = &xfs_attr3_rmt_buf_ops; - break; - case XFS_BLFT_SB_BUF: - if (magic32 != XFS_SB_MAGIC) { - warnmsg = "Bad SB block magic!"; - break; - } - bp->b_ops = &xfs_sb_buf_ops; - break; -#ifdef CONFIG_XFS_RT - case XFS_BLFT_RTBITMAP_BUF: - case XFS_BLFT_RTSUMMARY_BUF: - /* no magic numbers for verification of RT buffers */ - bp->b_ops = &xfs_rtbuf_ops; - break; -#endif /* CONFIG_XFS_RT */ - default: - xfs_warn(mp, "Unknown buffer type %d!", - xfs_blft_from_flags(buf_f)); - break; - } - - /* - * Nothing else to do in the case of a NULL current LSN as this means - * the buffer is more recent than the change in the log and will be - * skipped. - */ - if (current_lsn == NULLCOMMITLSN) - return; - - if (warnmsg) { - xfs_warn(mp, warnmsg); - ASSERT(0); - } - - /* - * We must update the metadata LSN of the buffer as it is written out to - * ensure that older transactions never replay over this one and corrupt - * the buffer. This can occur if log recovery is interrupted at some - * point after the current transaction completes, at which point a - * subsequent mount starts recovery from the beginning. - * - * Write verifiers update the metadata LSN from log items attached to - * the buffer. Therefore, initialize a bli purely to carry the LSN to - * the verifier. We'll clean it up in our ->iodone() callback. - */ - if (bp->b_ops) { - struct xfs_buf_log_item *bip; - - ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone); - bp->b_iodone = xlog_recover_iodone; - xfs_buf_item_init(bp, mp); - bip = bp->b_log_item; - bip->bli_item.li_lsn = current_lsn; - } -} - -/* - * Perform a 'normal' buffer recovery. Each logged region of the - * buffer should be copied over the corresponding region in the - * given buffer. The bitmap in the buf log format structure indicates - * where to place the logged data. - */ -STATIC void -xlog_recover_do_reg_buffer( - struct xfs_mount *mp, - xlog_recover_item_t *item, - struct xfs_buf *bp, - xfs_buf_log_format_t *buf_f, - xfs_lsn_t current_lsn) -{ - int i; - int bit; - int nbits; - xfs_failaddr_t fa; - const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot); - - trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); - - bit = 0; - i = 1; /* 0 is the buf format structure */ - while (1) { - bit = xfs_next_bit(buf_f->blf_data_map, - buf_f->blf_map_size, bit); - if (bit == -1) - break; - nbits = xfs_contig_bits(buf_f->blf_data_map, - buf_f->blf_map_size, bit); - ASSERT(nbits > 0); - ASSERT(item->ri_buf[i].i_addr != NULL); - ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); - ASSERT(BBTOB(bp->b_length) >= - ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); - - /* - * The dirty regions logged in the buffer, even though - * contiguous, may span multiple chunks. This is because the - * dirty region may span a physical page boundary in a buffer - * and hence be split into two separate vectors for writing into - * the log. Hence we need to trim nbits back to the length of - * the current region being copied out of the log. - */ - if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) - nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; - - /* - * Do a sanity check if this is a dquot buffer. Just checking - * the first dquot in the buffer should do. XXXThis is - * probably a good thing to do for other buf types also. - */ - fa = NULL; - if (buf_f->blf_flags & - (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { - if (item->ri_buf[i].i_addr == NULL) { - xfs_alert(mp, - "XFS: NULL dquot in %s.", __func__); - goto next; - } - if (item->ri_buf[i].i_len < size_disk_dquot) { - xfs_alert(mp, - "XFS: dquot too small (%d) in %s.", - item->ri_buf[i].i_len, __func__); - goto next; - } - fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, - -1, 0); - if (fa) { - xfs_alert(mp, - "dquot corrupt at %pS trying to replay into block 0x%llx", - fa, bp->b_bn); - goto next; - } - } - - memcpy(xfs_buf_offset(bp, - (uint)bit << XFS_BLF_SHIFT), /* dest */ - item->ri_buf[i].i_addr, /* source */ - nbits<<XFS_BLF_SHIFT); /* length */ - next: - i++; - bit += nbits; - } - - /* Shouldn't be any more regions */ - ASSERT(i == item->ri_total); - - xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn); -} - -/* - * Perform a dquot buffer recovery. - * Simple algorithm: if we have found a QUOTAOFF log item of the same type - * (ie. USR or GRP), then just toss this buffer away; don't recover it. - * Else, treat it as a regular buffer and do recovery. - * - * Return false if the buffer was tossed and true if we recovered the buffer to - * indicate to the caller if the buffer needs writing. - */ -STATIC bool -xlog_recover_do_dquot_buffer( - struct xfs_mount *mp, - struct xlog *log, - struct xlog_recover_item *item, - struct xfs_buf *bp, - struct xfs_buf_log_format *buf_f) -{ - uint type; - - trace_xfs_log_recover_buf_dquot_buf(log, buf_f); - - /* - * Filesystems are required to send in quota flags at mount time. - */ - if (!mp->m_qflags) - return false; - - type = 0; - if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) - type |= XFS_DQ_USER; - if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) - type |= XFS_DQ_PROJ; - if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) - type |= XFS_DQ_GROUP; - /* - * This type of quotas was turned off, so ignore this buffer - */ - if (log->l_quotaoffs_flag & type) - return false; - - xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN); - return true; -} - -/* - * This routine replays a modification made to a buffer at runtime. - * There are actually two types of buffer, regular and inode, which - * are handled differently. Inode buffers are handled differently - * in that we only recover a specific set of data from them, namely - * the inode di_next_unlinked fields. This is because all other inode - * data is actually logged via inode records and any data we replay - * here which overlaps that may be stale. - * - * When meta-data buffers are freed at run time we log a buffer item - * with the XFS_BLF_CANCEL bit set to indicate that previous copies - * of the buffer in the log should not be replayed at recovery time. - * This is so that if the blocks covered by the buffer are reused for - * file data before we crash we don't end up replaying old, freed - * meta-data into a user's file. - * - * To handle the cancellation of buffer log items, we make two passes - * over the log during recovery. During the first we build a table of - * those buffers which have been cancelled, and during the second we - * only replay those buffers which do not have corresponding cancel - * records in the table. See xlog_recover_buffer_pass[1,2] above - * for more details on the implementation of the table of cancel records. - */ -STATIC int -xlog_recover_buffer_pass2( - struct xlog *log, - struct list_head *buffer_list, - struct xlog_recover_item *item, - xfs_lsn_t current_lsn) -{ - xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; - xfs_mount_t *mp = log->l_mp; - xfs_buf_t *bp; - int error; - uint buf_flags; - xfs_lsn_t lsn; - - /* - * In this pass we only want to recover all the buffers which have - * not been cancelled and are not cancellation buffers themselves. - */ - if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno, - buf_f->blf_len, buf_f->blf_flags)) { - trace_xfs_log_recover_buf_cancel(log, buf_f); - return 0; - } - - trace_xfs_log_recover_buf_recover(log, buf_f); - - buf_flags = 0; - if (buf_f->blf_flags & XFS_BLF_INODE_BUF) - buf_flags |= XBF_UNMAPPED; - - error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, - buf_flags, &bp, NULL); - if (error) - return error; - - /* - * Recover the buffer only if we get an LSN from it and it's less than - * the lsn of the transaction we are replaying. - * - * Note that we have to be extremely careful of readahead here. - * Readahead does not attach verfiers to the buffers so if we don't - * actually do any replay after readahead because of the LSN we found - * in the buffer if more recent than that current transaction then we - * need to attach the verifier directly. Failure to do so can lead to - * future recovery actions (e.g. EFI and unlinked list recovery) can - * operate on the buffers and they won't get the verifier attached. This - * can lead to blocks on disk having the correct content but a stale - * CRC. - * - * It is safe to assume these clean buffers are currently up to date. - * If the buffer is dirtied by a later transaction being replayed, then - * the verifier will be reset to match whatever recover turns that - * buffer into. - */ - lsn = xlog_recover_get_buf_lsn(mp, bp); - if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { - trace_xfs_log_recover_buf_skip(log, buf_f); - xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); - goto out_release; - } - - if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { - error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); - if (error) - goto out_release; - } else if (buf_f->blf_flags & - (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { - bool dirty; - - dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); - if (!dirty) - goto out_release; - } else { - xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); - } - - /* - * Perform delayed write on the buffer. Asynchronous writes will be - * slower when taking into account all the buffers to be flushed. - * - * Also make sure that only inode buffers with good sizes stay in - * the buffer cache. The kernel moves inodes in buffers of 1 block - * or inode_cluster_size bytes, whichever is bigger. The inode - * buffers in the log can be a different size if the log was generated - * by an older kernel using unclustered inode buffers or a newer kernel - * running with a different inode cluster size. Regardless, if the - * the inode buffer size isn't max(blocksize, inode_cluster_size) - * for *our* value of inode_cluster_size, then we need to keep - * the buffer out of the buffer cache so that the buffer won't - * overlap with future reads of those inodes. - */ - if (XFS_DINODE_MAGIC == - be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && - (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) { - xfs_buf_stale(bp); - error = xfs_bwrite(bp); - } else { - ASSERT(bp->b_mount == mp); - bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp, buffer_list); - } - -out_release: - xfs_buf_relse(bp); - return error; -} - -/* - * Inode fork owner changes - * - * If we have been told that we have to reparent the inode fork, it's because an - * extent swap operation on a CRC enabled filesystem has been done and we are - * replaying it. We need to walk the BMBT of the appropriate fork and change the - * owners of it. - * - * The complexity here is that we don't have an inode context to work with, so - * after we've replayed the inode we need to instantiate one. This is where the - * fun begins. - * - * We are in the middle of log recovery, so we can't run transactions. That - * means we cannot use cache coherent inode instantiation via xfs_iget(), as - * that will result in the corresponding iput() running the inode through - * xfs_inactive(). If we've just replayed an inode core that changes the link - * count to zero (i.e. it's been unlinked), then xfs_inactive() will run - * transactions (bad!). - * - * So, to avoid this, we instantiate an inode directly from the inode core we've - * just recovered. We have the buffer still locked, and all we really need to - * instantiate is the inode core and the forks being modified. We can do this - * manually, then run the inode btree owner change, and then tear down the - * xfs_inode without having to run any transactions at all. - * - * Also, because we don't have a transaction context available here but need to - * gather all the buffers we modify for writeback so we pass the buffer_list - * instead for the operation to use. - */ - -STATIC int -xfs_recover_inode_owner_change( - struct xfs_mount *mp, - struct xfs_dinode *dip, - struct xfs_inode_log_format *in_f, - struct list_head *buffer_list) + const struct xfs_buf_ops *ops) { - struct xfs_inode *ip; - int error; - - ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); - - ip = xfs_inode_alloc(mp, in_f->ilf_ino); - if (!ip) - return -ENOMEM; - - /* instantiate the inode */ - ASSERT(dip->di_version >= 3); - xfs_inode_from_disk(ip, dip); - - error = xfs_iformat_fork(ip, dip); - if (error) - goto out_free_ip; - - if (!xfs_inode_verify_forks(ip)) { - error = -EFSCORRUPTED; - goto out_free_ip; - } - - if (in_f->ilf_fields & XFS_ILOG_DOWNER) { - ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); - error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, - ip->i_ino, buffer_list); - if (error) - goto out_free_ip; - } - - if (in_f->ilf_fields & XFS_ILOG_AOWNER) { - ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); - error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, - ip->i_ino, buffer_list); - if (error) - goto out_free_ip; - } - -out_free_ip: - xfs_inode_free(ip); - return error; -} - -STATIC int -xlog_recover_inode_pass2( - struct xlog *log, - struct list_head *buffer_list, - struct xlog_recover_item *item, - xfs_lsn_t current_lsn) -{ - struct xfs_inode_log_format *in_f; - xfs_mount_t *mp = log->l_mp; - xfs_buf_t *bp; - xfs_dinode_t *dip; - int len; - char *src; - char *dest; - int error; - int attr_index; - uint fields; - struct xfs_log_dinode *ldip; - uint isize; - int need_free = 0; - - if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { - in_f = item->ri_buf[0].i_addr; - } else { - in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0); - need_free = 1; - error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); - if (error) - goto error; - } - - /* - * Inode buffers can be freed, look out for it, - * and do not replay the inode. - */ - if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, - in_f->ilf_len, 0)) { - error = 0; - trace_xfs_log_recover_inode_cancel(log, in_f); - goto error; - } - trace_xfs_log_recover_inode_recover(log, in_f); - - error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, - 0, &bp, &xfs_inode_buf_ops); - if (error) - goto error; - ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); - dip = xfs_buf_offset(bp, in_f->ilf_boffset); - - /* - * Make sure the place we're flushing out to really looks - * like an inode! - */ - if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) { - xfs_alert(mp, - "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", - __func__, dip, bp, in_f->ilf_ino); - error = -EFSCORRUPTED; - goto out_release; - } - ldip = item->ri_buf[1].i_addr; - if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) { - xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld", - __func__, item, in_f->ilf_ino); - error = -EFSCORRUPTED; - goto out_release; - } - - /* - * If the inode has an LSN in it, recover the inode only if it's less - * than the lsn of the transaction we are replaying. Note: we still - * need to replay an owner change even though the inode is more recent - * than the transaction as there is no guarantee that all the btree - * blocks are more recent than this transaction, too. - */ - if (dip->di_version >= 3) { - xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); - - if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { - trace_xfs_log_recover_inode_skip(log, in_f); - error = 0; - goto out_owner_change; - } - } - - /* - * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes - * are transactional and if ordering is necessary we can determine that - * more accurately by the LSN field in the V3 inode core. Don't trust - * the inode versions we might be changing them here - use the - * superblock flag to determine whether we need to look at di_flushiter - * to skip replay when the on disk inode is newer than the log one - */ - if (!xfs_sb_version_has_v3inode(&mp->m_sb) && - ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { - /* - * Deal with the wrap case, DI_MAX_FLUSH is less - * than smaller numbers - */ - if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && - ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { - /* do nothing */ - } else { - trace_xfs_log_recover_inode_skip(log, in_f); - error = 0; - goto out_release; - } - } - - /* Take the opportunity to reset the flush iteration count */ - ldip->di_flushiter = 0; - - if (unlikely(S_ISREG(ldip->di_mode))) { - if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && - (ldip->di_format != XFS_DINODE_FMT_BTREE)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); - xfs_alert(mp, - "%s: Bad regular inode log record, rec ptr "PTR_FMT", " - "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", - __func__, item, dip, bp, in_f->ilf_ino); - error = -EFSCORRUPTED; - goto out_release; - } - } else if (unlikely(S_ISDIR(ldip->di_mode))) { - if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && - (ldip->di_format != XFS_DINODE_FMT_BTREE) && - (ldip->di_format != XFS_DINODE_FMT_LOCAL)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); - xfs_alert(mp, - "%s: Bad dir inode log record, rec ptr "PTR_FMT", " - "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", - __func__, item, dip, bp, in_f->ilf_ino); - error = -EFSCORRUPTED; - goto out_release; - } - } - if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){ - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); - xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " - "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld", - __func__, item, dip, bp, in_f->ilf_ino, - ldip->di_nextents + ldip->di_anextents, - ldip->di_nblocks); - error = -EFSCORRUPTED; - goto out_release; - } - if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); - xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " - "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__, - item, dip, bp, in_f->ilf_ino, ldip->di_forkoff); - error = -EFSCORRUPTED; - goto out_release; - } - isize = xfs_log_dinode_size(mp); - if (unlikely(item->ri_buf[1].i_len > isize)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); - xfs_alert(mp, - "%s: Bad inode log record length %d, rec ptr "PTR_FMT, - __func__, item->ri_buf[1].i_len, item); - error = -EFSCORRUPTED; - goto out_release; - } - - /* recover the log dinode inode into the on disk inode */ - xfs_log_dinode_to_disk(ldip, dip); - - fields = in_f->ilf_fields; - if (fields & XFS_ILOG_DEV) - xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); - - if (in_f->ilf_size == 2) - goto out_owner_change; - len = item->ri_buf[2].i_len; - src = item->ri_buf[2].i_addr; - ASSERT(in_f->ilf_size <= 4); - ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); - ASSERT(!(fields & XFS_ILOG_DFORK) || - (len == in_f->ilf_dsize)); - - switch (fields & XFS_ILOG_DFORK) { - case XFS_ILOG_DDATA: - case XFS_ILOG_DEXT: - memcpy(XFS_DFORK_DPTR(dip), src, len); - break; - - case XFS_ILOG_DBROOT: - xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, - (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip), - XFS_DFORK_DSIZE(dip, mp)); - break; - - default: - /* - * There are no data fork flags set. - */ - ASSERT((fields & XFS_ILOG_DFORK) == 0); - break; - } - - /* - * If we logged any attribute data, recover it. There may or - * may not have been any other non-core data logged in this - * transaction. - */ - if (in_f->ilf_fields & XFS_ILOG_AFORK) { - if (in_f->ilf_fields & XFS_ILOG_DFORK) { - attr_index = 3; - } else { - attr_index = 2; - } - len = item->ri_buf[attr_index].i_len; - src = item->ri_buf[attr_index].i_addr; - ASSERT(len == in_f->ilf_asize); - - switch (in_f->ilf_fields & XFS_ILOG_AFORK) { - case XFS_ILOG_ADATA: - case XFS_ILOG_AEXT: - dest = XFS_DFORK_APTR(dip); - ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); - memcpy(dest, src, len); - break; - - case XFS_ILOG_ABROOT: - dest = XFS_DFORK_APTR(dip); - xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, - len, (xfs_bmdr_block_t*)dest, - XFS_DFORK_ASIZE(dip, mp)); - break; - - default: - xfs_warn(log->l_mp, "%s: Invalid flag", __func__); - ASSERT(0); - error = -EFSCORRUPTED; - goto out_release; - } - } - -out_owner_change: - /* Recover the swapext owner change unless inode has been deleted */ - if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) && - (dip->di_mode != 0)) - error = xfs_recover_inode_owner_change(mp, dip, in_f, - buffer_list); - /* re-generate the checksum. */ - xfs_dinode_calc_crc(log->l_mp, dip); - - ASSERT(bp->b_mount == mp); - bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp, buffer_list); - -out_release: - xfs_buf_relse(bp); -error: - if (need_free) - kmem_free(in_f); - return error; -} - -/* - * Recover QUOTAOFF records. We simply make a note of it in the xlog - * structure, so that we know not to do any dquot item or dquot buffer recovery, - * of that type. - */ -STATIC int -xlog_recover_quotaoff_pass1( - struct xlog *log, - struct xlog_recover_item *item) -{ - xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; - ASSERT(qoff_f); - - /* - * The logitem format's flag tells us if this was user quotaoff, - * group/project quotaoff or both. - */ - if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) - log->l_quotaoffs_flag |= XFS_DQ_USER; - if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) - log->l_quotaoffs_flag |= XFS_DQ_PROJ; - if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) - log->l_quotaoffs_flag |= XFS_DQ_GROUP; - - return 0; -} - -/* - * Recover a dquot record - */ -STATIC int -xlog_recover_dquot_pass2( - struct xlog *log, - struct list_head *buffer_list, - struct xlog_recover_item *item, - xfs_lsn_t current_lsn) -{ - xfs_mount_t *mp = log->l_mp; - xfs_buf_t *bp; - struct xfs_disk_dquot *ddq, *recddq; - xfs_failaddr_t fa; - int error; - xfs_dq_logformat_t *dq_f; - uint type; - - - /* - * Filesystems are required to send in quota flags at mount time. - */ - if (mp->m_qflags == 0) - return 0; - - recddq = item->ri_buf[1].i_addr; - if (recddq == NULL) { - xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); - return -EFSCORRUPTED; - } - if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) { - xfs_alert(log->l_mp, "dquot too small (%d) in %s.", - item->ri_buf[1].i_len, __func__); - return -EFSCORRUPTED; - } - - /* - * This type of quotas was turned off, so ignore this record. - */ - type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); - ASSERT(type); - if (log->l_quotaoffs_flag & type) - return 0; - - /* - * At this point we know that quota was _not_ turned off. - * Since the mount flags are not indicating to us otherwise, this - * must mean that quota is on, and the dquot needs to be replayed. - * Remember that we may not have fully recovered the superblock yet, - * so we can't do the usual trick of looking at the SB quota bits. - * - * The other possibility, of course, is that the quota subsystem was - * removed since the last mount - ENOSYS. - */ - dq_f = item->ri_buf[0].i_addr; - ASSERT(dq_f); - fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0); - if (fa) { - xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS", - dq_f->qlf_id, fa); - return -EFSCORRUPTED; - } - ASSERT(dq_f->qlf_len == 1); - - /* - * At this point we are assuming that the dquots have been allocated - * and hence the buffer has valid dquots stamped in it. It should, - * therefore, pass verifier validation. If the dquot is bad, then the - * we'll return an error here, so we don't need to specifically check - * the dquot in the buffer after the verifier has run. - */ - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, - &xfs_dquot_buf_ops); - if (error) - return error; - - ASSERT(bp); - ddq = xfs_buf_offset(bp, dq_f->qlf_boffset); - - /* - * If the dquot has an LSN in it, recover the dquot only if it's less - * than the lsn of the transaction we are replaying. - */ - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; - xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); - - if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { - goto out_release; - } - } - - memcpy(ddq, recddq, item->ri_buf[1].i_len); - if (xfs_sb_version_hascrc(&mp->m_sb)) { - xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), - XFS_DQUOT_CRC_OFF); - } - - ASSERT(dq_f->qlf_size == 2); - ASSERT(bp->b_mount == mp); - bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp, buffer_list); - -out_release: - xfs_buf_relse(bp); - return 0; -} - -/* - * This routine is called to create an in-core extent free intent - * item from the efi format structure which was logged on disk. - * It allocates an in-core efi, copies the extents from the format - * structure into it, and adds the efi to the AIL with the given - * LSN. - */ -STATIC int -xlog_recover_efi_pass2( - struct xlog *log, - struct xlog_recover_item *item, - xfs_lsn_t lsn) -{ - int error; - struct xfs_mount *mp = log->l_mp; - struct xfs_efi_log_item *efip; - struct xfs_efi_log_format *efi_formatp; - - efi_formatp = item->ri_buf[0].i_addr; - - efip = xfs_efi_init(mp, efi_formatp->efi_nextents); - error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); - if (error) { - xfs_efi_item_free(efip); - return error; - } - atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); - - spin_lock(&log->l_ailp->ail_lock); - /* - * The EFI has two references. One for the EFD and one for EFI to ensure - * it makes it into the AIL. Insert the EFI into the AIL directly and - * drop the EFI reference. Note that xfs_trans_ail_update() drops the - * AIL lock. - */ - xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); - xfs_efi_release(efip); - return 0; -} - - -/* - * This routine is called when an EFD format structure is found in a committed - * transaction in the log. Its purpose is to cancel the corresponding EFI if it - * was still in the log. To do this it searches the AIL for the EFI with an id - * equal to that in the EFD format structure. If we find it we drop the EFD - * reference, which removes the EFI from the AIL and frees it. - */ -STATIC int -xlog_recover_efd_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - xfs_efd_log_format_t *efd_formatp; - xfs_efi_log_item_t *efip = NULL; - struct xfs_log_item *lip; - uint64_t efi_id; - struct xfs_ail_cursor cur; - struct xfs_ail *ailp = log->l_ailp; - - efd_formatp = item->ri_buf[0].i_addr; - ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + - ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || - (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + - ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); - efi_id = efd_formatp->efd_efi_id; - - /* - * Search for the EFI with the id in the EFD format structure in the - * AIL. - */ - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - while (lip != NULL) { - if (lip->li_type == XFS_LI_EFI) { - efip = (xfs_efi_log_item_t *)lip; - if (efip->efi_format.efi_id == efi_id) { - /* - * Drop the EFD reference to the EFI. This - * removes the EFI from the AIL and frees it. - */ - spin_unlock(&ailp->ail_lock); - xfs_efi_release(efip); - spin_lock(&ailp->ail_lock); - break; - } - } - lip = xfs_trans_ail_cursor_next(ailp, &cur); - } - - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); - - return 0; -} - -/* - * This routine is called to create an in-core extent rmap update - * item from the rui format structure which was logged on disk. - * It allocates an in-core rui, copies the extents from the format - * structure into it, and adds the rui to the AIL with the given - * LSN. - */ -STATIC int -xlog_recover_rui_pass2( - struct xlog *log, - struct xlog_recover_item *item, - xfs_lsn_t lsn) -{ - int error; - struct xfs_mount *mp = log->l_mp; - struct xfs_rui_log_item *ruip; - struct xfs_rui_log_format *rui_formatp; - - rui_formatp = item->ri_buf[0].i_addr; - - ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); - error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format); - if (error) { - xfs_rui_item_free(ruip); - return error; - } - atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); - - spin_lock(&log->l_ailp->ail_lock); - /* - * The RUI has two references. One for the RUD and one for RUI to ensure - * it makes it into the AIL. Insert the RUI into the AIL directly and - * drop the RUI reference. Note that xfs_trans_ail_update() drops the - * AIL lock. - */ - xfs_trans_ail_update(log->l_ailp, &ruip->rui_item, lsn); - xfs_rui_release(ruip); - return 0; -} - - -/* - * This routine is called when an RUD format structure is found in a committed - * transaction in the log. Its purpose is to cancel the corresponding RUI if it - * was still in the log. To do this it searches the AIL for the RUI with an id - * equal to that in the RUD format structure. If we find it we drop the RUD - * reference, which removes the RUI from the AIL and frees it. - */ -STATIC int -xlog_recover_rud_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - struct xfs_rud_log_format *rud_formatp; - struct xfs_rui_log_item *ruip = NULL; - struct xfs_log_item *lip; - uint64_t rui_id; - struct xfs_ail_cursor cur; - struct xfs_ail *ailp = log->l_ailp; - - rud_formatp = item->ri_buf[0].i_addr; - ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format)); - rui_id = rud_formatp->rud_rui_id; - - /* - * Search for the RUI with the id in the RUD format structure in the - * AIL. - */ - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - while (lip != NULL) { - if (lip->li_type == XFS_LI_RUI) { - ruip = (struct xfs_rui_log_item *)lip; - if (ruip->rui_format.rui_id == rui_id) { - /* - * Drop the RUD reference to the RUI. This - * removes the RUI from the AIL and frees it. - */ - spin_unlock(&ailp->ail_lock); - xfs_rui_release(ruip); - spin_lock(&ailp->ail_lock); - break; - } - } - lip = xfs_trans_ail_cursor_next(ailp, &cur); - } - - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); - - return 0; -} - -/* - * Copy an CUI format buffer from the given buf, and into the destination - * CUI format structure. The CUI/CUD items were designed not to need any - * special alignment handling. - */ -static int -xfs_cui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_cui_log_format *dst_cui_fmt) -{ - struct xfs_cui_log_format *src_cui_fmt; - uint len; - - src_cui_fmt = buf->i_addr; - len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents); - - if (buf->i_len == len) { - memcpy(dst_cui_fmt, src_cui_fmt, len); - return 0; - } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; -} - -/* - * This routine is called to create an in-core extent refcount update - * item from the cui format structure which was logged on disk. - * It allocates an in-core cui, copies the extents from the format - * structure into it, and adds the cui to the AIL with the given - * LSN. - */ -STATIC int -xlog_recover_cui_pass2( - struct xlog *log, - struct xlog_recover_item *item, - xfs_lsn_t lsn) -{ - int error; - struct xfs_mount *mp = log->l_mp; - struct xfs_cui_log_item *cuip; - struct xfs_cui_log_format *cui_formatp; - - cui_formatp = item->ri_buf[0].i_addr; - - cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); - error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format); - if (error) { - xfs_cui_item_free(cuip); - return error; - } - atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); - - spin_lock(&log->l_ailp->ail_lock); - /* - * The CUI has two references. One for the CUD and one for CUI to ensure - * it makes it into the AIL. Insert the CUI into the AIL directly and - * drop the CUI reference. Note that xfs_trans_ail_update() drops the - * AIL lock. - */ - xfs_trans_ail_update(log->l_ailp, &cuip->cui_item, lsn); - xfs_cui_release(cuip); - return 0; -} - - -/* - * This routine is called when an CUD format structure is found in a committed - * transaction in the log. Its purpose is to cancel the corresponding CUI if it - * was still in the log. To do this it searches the AIL for the CUI with an id - * equal to that in the CUD format structure. If we find it we drop the CUD - * reference, which removes the CUI from the AIL and frees it. - */ -STATIC int -xlog_recover_cud_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - struct xfs_cud_log_format *cud_formatp; - struct xfs_cui_log_item *cuip = NULL; - struct xfs_log_item *lip; - uint64_t cui_id; - struct xfs_ail_cursor cur; - struct xfs_ail *ailp = log->l_ailp; - - cud_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); - return -EFSCORRUPTED; - } - cui_id = cud_formatp->cud_cui_id; - - /* - * Search for the CUI with the id in the CUD format structure in the - * AIL. - */ - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - while (lip != NULL) { - if (lip->li_type == XFS_LI_CUI) { - cuip = (struct xfs_cui_log_item *)lip; - if (cuip->cui_format.cui_id == cui_id) { - /* - * Drop the CUD reference to the CUI. This - * removes the CUI from the AIL and frees it. - */ - spin_unlock(&ailp->ail_lock); - xfs_cui_release(cuip); - spin_lock(&ailp->ail_lock); - break; - } - } - lip = xfs_trans_ail_cursor_next(ailp, &cur); - } - - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); - - return 0; -} - -/* - * Copy an BUI format buffer from the given buf, and into the destination - * BUI format structure. The BUI/BUD items were designed not to need any - * special alignment handling. - */ -static int -xfs_bui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_bui_log_format *dst_bui_fmt) -{ - struct xfs_bui_log_format *src_bui_fmt; - uint len; - - src_bui_fmt = buf->i_addr; - len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents); - - if (buf->i_len == len) { - memcpy(dst_bui_fmt, src_bui_fmt, len); - return 0; - } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; -} - -/* - * This routine is called to create an in-core extent bmap update - * item from the bui format structure which was logged on disk. - * It allocates an in-core bui, copies the extents from the format - * structure into it, and adds the bui to the AIL with the given - * LSN. - */ -STATIC int -xlog_recover_bui_pass2( - struct xlog *log, - struct xlog_recover_item *item, - xfs_lsn_t lsn) -{ - int error; - struct xfs_mount *mp = log->l_mp; - struct xfs_bui_log_item *buip; - struct xfs_bui_log_format *bui_formatp; - - bui_formatp = item->ri_buf[0].i_addr; - - if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); - return -EFSCORRUPTED; - } - buip = xfs_bui_init(mp); - error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format); - if (error) { - xfs_bui_item_free(buip); - return error; - } - atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); - - spin_lock(&log->l_ailp->ail_lock); - /* - * The RUI has two references. One for the RUD and one for RUI to ensure - * it makes it into the AIL. Insert the RUI into the AIL directly and - * drop the RUI reference. Note that xfs_trans_ail_update() drops the - * AIL lock. - */ - xfs_trans_ail_update(log->l_ailp, &buip->bui_item, lsn); - xfs_bui_release(buip); - return 0; -} - - -/* - * This routine is called when an BUD format structure is found in a committed - * transaction in the log. Its purpose is to cancel the corresponding BUI if it - * was still in the log. To do this it searches the AIL for the BUI with an id - * equal to that in the BUD format structure. If we find it we drop the BUD - * reference, which removes the BUI from the AIL and frees it. - */ -STATIC int -xlog_recover_bud_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - struct xfs_bud_log_format *bud_formatp; - struct xfs_bui_log_item *buip = NULL; - struct xfs_log_item *lip; - uint64_t bui_id; - struct xfs_ail_cursor cur; - struct xfs_ail *ailp = log->l_ailp; - - bud_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); - return -EFSCORRUPTED; - } - bui_id = bud_formatp->bud_bui_id; - - /* - * Search for the BUI with the id in the BUD format structure in the - * AIL. - */ - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - while (lip != NULL) { - if (lip->li_type == XFS_LI_BUI) { - buip = (struct xfs_bui_log_item *)lip; - if (buip->bui_format.bui_id == bui_id) { - /* - * Drop the BUD reference to the BUI. This - * removes the BUI from the AIL and frees it. - */ - spin_unlock(&ailp->ail_lock); - xfs_bui_release(buip); - spin_lock(&ailp->ail_lock); - break; - } - } - lip = xfs_trans_ail_cursor_next(ailp, &cur); - } - - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); - - return 0; -} - -/* - * This routine is called when an inode create format structure is found in a - * committed transaction in the log. It's purpose is to initialise the inodes - * being allocated on disk. This requires us to get inode cluster buffers that - * match the range to be initialised, stamped with inode templates and written - * by delayed write so that subsequent modifications will hit the cached buffer - * and only need writing out at the end of recovery. - */ -STATIC int -xlog_recover_do_icreate_pass2( - struct xlog *log, - struct list_head *buffer_list, - xlog_recover_item_t *item) -{ - struct xfs_mount *mp = log->l_mp; - struct xfs_icreate_log *icl; - struct xfs_ino_geometry *igeo = M_IGEO(mp); - xfs_agnumber_t agno; - xfs_agblock_t agbno; - unsigned int count; - unsigned int isize; - xfs_agblock_t length; - int bb_per_cluster; - int cancel_count; - int nbufs; - int i; - - icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; - if (icl->icl_type != XFS_LI_ICREATE) { - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); - return -EINVAL; - } - - if (icl->icl_size != 1) { - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); - return -EINVAL; - } - - agno = be32_to_cpu(icl->icl_ag); - if (agno >= mp->m_sb.sb_agcount) { - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); - return -EINVAL; - } - agbno = be32_to_cpu(icl->icl_agbno); - if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); - return -EINVAL; - } - isize = be32_to_cpu(icl->icl_isize); - if (isize != mp->m_sb.sb_inodesize) { - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); - return -EINVAL; - } - count = be32_to_cpu(icl->icl_count); - if (!count) { - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); - return -EINVAL; - } - length = be32_to_cpu(icl->icl_length); - if (!length || length >= mp->m_sb.sb_agblocks) { - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); - return -EINVAL; - } - - /* - * The inode chunk is either full or sparse and we only support - * m_ino_geo.ialloc_min_blks sized sparse allocations at this time. - */ - if (length != igeo->ialloc_blks && - length != igeo->ialloc_min_blks) { - xfs_warn(log->l_mp, - "%s: unsupported chunk length", __FUNCTION__); - return -EINVAL; - } - - /* verify inode count is consistent with extent length */ - if ((count >> mp->m_sb.sb_inopblog) != length) { - xfs_warn(log->l_mp, - "%s: inconsistent inode count and chunk length", - __FUNCTION__); - return -EINVAL; - } - - /* - * The icreate transaction can cover multiple cluster buffers and these - * buffers could have been freed and reused. Check the individual - * buffers for cancellation so we don't overwrite anything written after - * a cancellation. - */ - bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster); - nbufs = length / igeo->blocks_per_cluster; - for (i = 0, cancel_count = 0; i < nbufs; i++) { - xfs_daddr_t daddr; - - daddr = XFS_AGB_TO_DADDR(mp, agno, - agbno + i * igeo->blocks_per_cluster); - if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0)) - cancel_count++; - } - - /* - * We currently only use icreate for a single allocation at a time. This - * means we should expect either all or none of the buffers to be - * cancelled. Be conservative and skip replay if at least one buffer is - * cancelled, but warn the user that something is awry if the buffers - * are not consistent. - * - * XXX: This must be refined to only skip cancelled clusters once we use - * icreate for multiple chunk allocations. - */ - ASSERT(!cancel_count || cancel_count == nbufs); - if (cancel_count) { - if (cancel_count != nbufs) - xfs_warn(mp, - "WARNING: partial inode chunk cancellation, skipped icreate."); - trace_xfs_log_recover_icreate_cancel(log, icl); - return 0; - } - - trace_xfs_log_recover_icreate_recover(log, icl); - return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, - length, be32_to_cpu(icl->icl_gen)); -} - -STATIC void -xlog_recover_buffer_ra_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; - struct xfs_mount *mp = log->l_mp; - - if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno, - buf_f->blf_len, buf_f->blf_flags)) { - return; - } - - xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno, - buf_f->blf_len, NULL); -} - -STATIC void -xlog_recover_inode_ra_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - struct xfs_inode_log_format ilf_buf; - struct xfs_inode_log_format *ilfp; - struct xfs_mount *mp = log->l_mp; - int error; - - if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { - ilfp = item->ri_buf[0].i_addr; - } else { - ilfp = &ilf_buf; - memset(ilfp, 0, sizeof(*ilfp)); - error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp); - if (error) - return; - } - - if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0)) - return; - - xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno, - ilfp->ilf_len, &xfs_inode_buf_ra_ops); -} - -STATIC void -xlog_recover_dquot_ra_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - struct xfs_mount *mp = log->l_mp; - struct xfs_disk_dquot *recddq; - struct xfs_dq_logformat *dq_f; - uint type; - int len; - - - if (mp->m_qflags == 0) - return; - - recddq = item->ri_buf[1].i_addr; - if (recddq == NULL) - return; - if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) - return; - - type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); - ASSERT(type); - if (log->l_quotaoffs_flag & type) - return; - - dq_f = item->ri_buf[0].i_addr; - ASSERT(dq_f); - ASSERT(dq_f->qlf_len == 1); - - len = XFS_FSB_TO_BB(mp, dq_f->qlf_len); - if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0)) - return; - - xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len, - &xfs_dquot_buf_ra_ops); -} - -STATIC void -xlog_recover_ra_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - switch (ITEM_TYPE(item)) { - case XFS_LI_BUF: - xlog_recover_buffer_ra_pass2(log, item); - break; - case XFS_LI_INODE: - xlog_recover_inode_ra_pass2(log, item); - break; - case XFS_LI_DQUOT: - xlog_recover_dquot_ra_pass2(log, item); - break; - case XFS_LI_EFI: - case XFS_LI_EFD: - case XFS_LI_QUOTAOFF: - case XFS_LI_RUI: - case XFS_LI_RUD: - case XFS_LI_CUI: - case XFS_LI_CUD: - case XFS_LI_BUI: - case XFS_LI_BUD: - default: - break; - } -} - -STATIC int -xlog_recover_commit_pass1( - struct xlog *log, - struct xlog_recover *trans, - struct xlog_recover_item *item) -{ - trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); - - switch (ITEM_TYPE(item)) { - case XFS_LI_BUF: - return xlog_recover_buffer_pass1(log, item); - case XFS_LI_QUOTAOFF: - return xlog_recover_quotaoff_pass1(log, item); - case XFS_LI_INODE: - case XFS_LI_EFI: - case XFS_LI_EFD: - case XFS_LI_DQUOT: - case XFS_LI_ICREATE: - case XFS_LI_RUI: - case XFS_LI_RUD: - case XFS_LI_CUI: - case XFS_LI_CUD: - case XFS_LI_BUI: - case XFS_LI_BUD: - /* nothing to do in pass 1 */ - return 0; - default: - xfs_warn(log->l_mp, "%s: invalid item type (%d)", - __func__, ITEM_TYPE(item)); - ASSERT(0); - return -EFSCORRUPTED; - } -} - -STATIC int -xlog_recover_commit_pass2( - struct xlog *log, - struct xlog_recover *trans, - struct list_head *buffer_list, - struct xlog_recover_item *item) -{ - trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); - - switch (ITEM_TYPE(item)) { - case XFS_LI_BUF: - return xlog_recover_buffer_pass2(log, buffer_list, item, - trans->r_lsn); - case XFS_LI_INODE: - return xlog_recover_inode_pass2(log, buffer_list, item, - trans->r_lsn); - case XFS_LI_EFI: - return xlog_recover_efi_pass2(log, item, trans->r_lsn); - case XFS_LI_EFD: - return xlog_recover_efd_pass2(log, item); - case XFS_LI_RUI: - return xlog_recover_rui_pass2(log, item, trans->r_lsn); - case XFS_LI_RUD: - return xlog_recover_rud_pass2(log, item); - case XFS_LI_CUI: - return xlog_recover_cui_pass2(log, item, trans->r_lsn); - case XFS_LI_CUD: - return xlog_recover_cud_pass2(log, item); - case XFS_LI_BUI: - return xlog_recover_bui_pass2(log, item, trans->r_lsn); - case XFS_LI_BUD: - return xlog_recover_bud_pass2(log, item); - case XFS_LI_DQUOT: - return xlog_recover_dquot_pass2(log, buffer_list, item, - trans->r_lsn); - case XFS_LI_ICREATE: - return xlog_recover_do_icreate_pass2(log, buffer_list, item); - case XFS_LI_QUOTAOFF: - /* nothing to do in pass2 */ - return 0; - default: - xfs_warn(log->l_mp, "%s: invalid item type (%d)", - __func__, ITEM_TYPE(item)); - ASSERT(0); - return -EFSCORRUPTED; - } + if (!xlog_is_buffer_cancelled(log, blkno, len)) + xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops); } STATIC int @@ -4072,8 +1967,12 @@ xlog_recover_items_pass2( int error = 0; list_for_each_entry(item, item_list, ri_list) { - error = xlog_recover_commit_pass2(log, trans, - buffer_list, item); + trace_xfs_log_recover_item_recover(log, trans, item, + XLOG_RECOVER_PASS2); + + if (item->ri_ops->commit_pass2) + error = item->ri_ops->commit_pass2(log, buffer_list, + item, trans->r_lsn); if (error) return error; } @@ -4110,12 +2009,16 @@ xlog_recover_commit_trans( return error; list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) { + trace_xfs_log_recover_item_recover(log, trans, item, pass); + switch (pass) { case XLOG_RECOVER_PASS1: - error = xlog_recover_commit_pass1(log, trans, item); + if (item->ri_ops->commit_pass1) + error = item->ri_ops->commit_pass1(log, item); break; case XLOG_RECOVER_PASS2: - xlog_recover_ra_pass2(log, item); + if (item->ri_ops->ra_pass2) + item->ri_ops->ra_pass2(log, item); list_move_tail(&item->ri_list, &ra_list); items_queued++; if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) { @@ -4152,9 +2055,9 @@ STATIC void xlog_recover_add_item( struct list_head *head) { - xlog_recover_item_t *item; + struct xlog_recover_item *item; - item = kmem_zalloc(sizeof(xlog_recover_item_t), 0); + item = kmem_zalloc(sizeof(struct xlog_recover_item), 0); INIT_LIST_HEAD(&item->ri_list); list_add_tail(&item->ri_list, head); } @@ -4166,7 +2069,7 @@ xlog_recover_add_to_cont_trans( char *dp, int len) { - xlog_recover_item_t *item; + struct xlog_recover_item *item; char *ptr, *old_ptr; int old_len; @@ -4189,7 +2092,8 @@ xlog_recover_add_to_cont_trans( } /* take the tail entry */ - item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + item = list_entry(trans->r_itemq.prev, struct xlog_recover_item, + ri_list); old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; @@ -4223,7 +2127,7 @@ xlog_recover_add_to_trans( int len) { struct xfs_inode_log_format *in_f; /* any will do */ - xlog_recover_item_t *item; + struct xlog_recover_item *item; char *ptr; if (!len) @@ -4259,13 +2163,14 @@ xlog_recover_add_to_trans( in_f = (struct xfs_inode_log_format *)ptr; /* take the tail entry */ - item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + item = list_entry(trans->r_itemq.prev, struct xlog_recover_item, + ri_list); if (item->ri_total != 0 && item->ri_total == item->ri_cnt) { /* tail item is in use, get a new one */ xlog_recover_add_item(&trans->r_itemq); item = list_entry(trans->r_itemq.prev, - xlog_recover_item_t, ri_list); + struct xlog_recover_item, ri_list); } if (item->ri_total == 0) { /* first region to be added */ @@ -4311,7 +2216,7 @@ STATIC void xlog_recover_free_trans( struct xlog_recover *trans) { - xlog_recover_item_t *item, *n; + struct xlog_recover_item *item, *n; int i; hlist_del_init(&trans->r_list); @@ -4563,180 +2468,6 @@ xlog_recover_process_data( return 0; } -/* Recover the EFI if necessary. */ -STATIC int -xlog_recover_process_efi( - struct xfs_mount *mp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_efi_log_item *efip; - int error; - - /* - * Skip EFIs that we've already processed. - */ - efip = container_of(lip, struct xfs_efi_log_item, efi_item); - if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) - return 0; - - spin_unlock(&ailp->ail_lock); - error = xfs_efi_recover(mp, efip); - spin_lock(&ailp->ail_lock); - - return error; -} - -/* Release the EFI since we're cancelling everything. */ -STATIC void -xlog_recover_cancel_efi( - struct xfs_mount *mp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_efi_log_item *efip; - - efip = container_of(lip, struct xfs_efi_log_item, efi_item); - - spin_unlock(&ailp->ail_lock); - xfs_efi_release(efip); - spin_lock(&ailp->ail_lock); -} - -/* Recover the RUI if necessary. */ -STATIC int -xlog_recover_process_rui( - struct xfs_mount *mp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_rui_log_item *ruip; - int error; - - /* - * Skip RUIs that we've already processed. - */ - ruip = container_of(lip, struct xfs_rui_log_item, rui_item); - if (test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags)) - return 0; - - spin_unlock(&ailp->ail_lock); - error = xfs_rui_recover(mp, ruip); - spin_lock(&ailp->ail_lock); - - return error; -} - -/* Release the RUI since we're cancelling everything. */ -STATIC void -xlog_recover_cancel_rui( - struct xfs_mount *mp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_rui_log_item *ruip; - - ruip = container_of(lip, struct xfs_rui_log_item, rui_item); - - spin_unlock(&ailp->ail_lock); - xfs_rui_release(ruip); - spin_lock(&ailp->ail_lock); -} - -/* Recover the CUI if necessary. */ -STATIC int -xlog_recover_process_cui( - struct xfs_trans *parent_tp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_cui_log_item *cuip; - int error; - - /* - * Skip CUIs that we've already processed. - */ - cuip = container_of(lip, struct xfs_cui_log_item, cui_item); - if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)) - return 0; - - spin_unlock(&ailp->ail_lock); - error = xfs_cui_recover(parent_tp, cuip); - spin_lock(&ailp->ail_lock); - - return error; -} - -/* Release the CUI since we're cancelling everything. */ -STATIC void -xlog_recover_cancel_cui( - struct xfs_mount *mp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_cui_log_item *cuip; - - cuip = container_of(lip, struct xfs_cui_log_item, cui_item); - - spin_unlock(&ailp->ail_lock); - xfs_cui_release(cuip); - spin_lock(&ailp->ail_lock); -} - -/* Recover the BUI if necessary. */ -STATIC int -xlog_recover_process_bui( - struct xfs_trans *parent_tp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_bui_log_item *buip; - int error; - - /* - * Skip BUIs that we've already processed. - */ - buip = container_of(lip, struct xfs_bui_log_item, bui_item); - if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)) - return 0; - - spin_unlock(&ailp->ail_lock); - error = xfs_bui_recover(parent_tp, buip); - spin_lock(&ailp->ail_lock); - - return error; -} - -/* Release the BUI since we're cancelling everything. */ -STATIC void -xlog_recover_cancel_bui( - struct xfs_mount *mp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_bui_log_item *buip; - - buip = container_of(lip, struct xfs_bui_log_item, bui_item); - - spin_unlock(&ailp->ail_lock); - xfs_bui_release(buip); - spin_lock(&ailp->ail_lock); -} - -/* Is this log item a deferred action intent? */ -static inline bool xlog_item_is_intent(struct xfs_log_item *lip) -{ - switch (lip->li_type) { - case XFS_LI_EFI: - case XFS_LI_RUI: - case XFS_LI_CUI: - case XFS_LI_BUI: - return true; - default: - return false; - } -} - /* Take all the collected deferred ops and finish them in order. */ static int xlog_finish_defer_ops( @@ -4771,6 +2502,13 @@ xlog_finish_defer_ops( return xfs_trans_commit(tp); } +/* Is this log item a deferred action intent? */ +static inline bool xlog_item_is_intent(struct xfs_log_item *lip) +{ + return lip->li_ops->iop_recover != NULL && + lip->li_ops->iop_match != NULL; +} + /* * When this is called, all of the log intent items which did not have * corresponding log done items should be in the AIL. What we do now @@ -4841,23 +2579,14 @@ xlog_recover_process_intents( /* * NOTE: If your intent processing routine can create more - * deferred ops, you /must/ attach them to the dfops in this - * routine or else those subsequent intents will get + * deferred ops, you /must/ attach them to the transaction in + * this routine or else those subsequent intents will get * replayed in the wrong order! */ - switch (lip->li_type) { - case XFS_LI_EFI: - error = xlog_recover_process_efi(log->l_mp, ailp, lip); - break; - case XFS_LI_RUI: - error = xlog_recover_process_rui(log->l_mp, ailp, lip); - break; - case XFS_LI_CUI: - error = xlog_recover_process_cui(parent_tp, ailp, lip); - break; - case XFS_LI_BUI: - error = xlog_recover_process_bui(parent_tp, ailp, lip); - break; + if (!test_and_set_bit(XFS_LI_RECOVERED, &lip->li_flags)) { + spin_unlock(&ailp->ail_lock); + error = lip->li_ops->iop_recover(lip, parent_tp); + spin_lock(&ailp->ail_lock); } if (error) goto out; @@ -4901,21 +2630,9 @@ xlog_recover_cancel_intents( break; } - switch (lip->li_type) { - case XFS_LI_EFI: - xlog_recover_cancel_efi(log->l_mp, ailp, lip); - break; - case XFS_LI_RUI: - xlog_recover_cancel_rui(log->l_mp, ailp, lip); - break; - case XFS_LI_CUI: - xlog_recover_cancel_cui(log->l_mp, ailp, lip); - break; - case XFS_LI_BUI: - xlog_recover_cancel_bui(log->l_mp, ailp, lip); - break; - } - + spin_unlock(&ailp->ail_lock); + lip->li_ops->iop_release(lip); + spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_next(ailp, &cur); } @@ -4987,7 +2704,7 @@ xlog_recover_process_one_iunlink( /* * Get the on disk inode to find the next inode in the bucket. */ - error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0); + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0); if (error) goto fail_iput; diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index e0f9d3b6abe9..bc66d95c8d4c 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -117,3 +117,25 @@ xfs_hex_dump(const void *p, int length) { print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1); } + +void +xfs_buf_alert_ratelimited( + struct xfs_buf *bp, + const char *rlmsg, + const char *fmt, + ...) +{ + struct xfs_mount *mp = bp->b_mount; + struct va_format vaf; + va_list args; + + /* use the more aggressive per-target rate limit for buffers */ + if (!___ratelimit(&bp->b_target->bt_ioerror_rl, rlmsg)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + __xfs_printk(KERN_ALERT, mp, &vaf); + va_end(args); +} diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 0b05e10995a0..4d9bd6bb63ca 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -31,15 +31,27 @@ void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) } #endif -#define xfs_printk_ratelimited(func, dev, fmt, ...) \ +#define xfs_printk_ratelimited(func, dev, fmt, ...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ if (__ratelimit(&_rs)) \ - func(dev, fmt, ##__VA_ARGS__); \ + func(dev, fmt, ##__VA_ARGS__); \ } while (0) +#define xfs_printk_once(func, dev, fmt, ...) \ +({ \ + static bool __section(.data.once) __print_once; \ + bool __ret_print_once = !__print_once; \ + \ + if (!__print_once) { \ + __print_once = true; \ + func(dev, fmt, ##__VA_ARGS__); \ + } \ + unlikely(__ret_print_once); \ +}) + #define xfs_emerg_ratelimited(dev, fmt, ...) \ xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__) #define xfs_alert_ratelimited(dev, fmt, ...) \ @@ -57,9 +69,17 @@ do { \ #define xfs_debug_ratelimited(dev, fmt, ...) \ xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) +#define xfs_warn_once(dev, fmt, ...) \ + xfs_printk_once(xfs_warn, dev, fmt, ##__VA_ARGS__) +#define xfs_notice_once(dev, fmt, ...) \ + xfs_printk_once(xfs_notice, dev, fmt, ##__VA_ARGS__) + void assfail(struct xfs_mount *mp, char *expr, char *f, int l); void asswarn(struct xfs_mount *mp, char *expr, char *f, int l); extern void xfs_hex_dump(const void *p, int length); +void xfs_buf_alert_ratelimited(struct xfs_buf *bp, const char *rlmsg, + const char *fmt, ...); + #endif /* __XFS_MESSAGE_H */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c5513e5a226a..d5dcf9869860 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1190,39 +1190,6 @@ xfs_log_sbcount(xfs_mount_t *mp) } /* - * Deltas for the inode count are +/-64, hence we use a large batch size - * of 128 so we don't need to take the counter lock on every update. - */ -#define XFS_ICOUNT_BATCH 128 -int -xfs_mod_icount( - struct xfs_mount *mp, - int64_t delta) -{ - percpu_counter_add_batch(&mp->m_icount, delta, XFS_ICOUNT_BATCH); - if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) { - ASSERT(0); - percpu_counter_add(&mp->m_icount, -delta); - return -EINVAL; - } - return 0; -} - -int -xfs_mod_ifree( - struct xfs_mount *mp, - int64_t delta) -{ - percpu_counter_add(&mp->m_ifree, delta); - if (percpu_counter_compare(&mp->m_ifree, 0) < 0) { - ASSERT(0); - percpu_counter_add(&mp->m_ifree, -delta); - return -EINVAL; - } - return 0; -} - -/* * Deltas for the block count can vary from 1 to very large, but lock contention * only occurs on frequent small block count updates such as in the delayed * allocation path for buffered writes (page a time updates). Hence we set @@ -1300,10 +1267,9 @@ xfs_mod_fdblocks( spin_unlock(&mp->m_sb_lock); return 0; } - printk_once(KERN_WARNING - "Filesystem \"%s\": reserve blocks depleted! " - "Consider increasing reserve pool size.", - mp->m_super->s_id); + xfs_warn_once(mp, +"Reserve blocks depleted! Consider increasing reserve pool size."); + fdblocks_enospc: spin_unlock(&mp->m_sb_lock); return -ENOSPC; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b2e4598fdf7d..3725d25ad97e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -55,61 +55,25 @@ struct xfs_error_cfg { long retry_timeout; /* in jiffies, -1 = infinite */ }; +/* + * The struct xfsmount layout is optimised to separate read-mostly variables + * from variables that are frequently modified. We put the read-mostly variables + * first, then place all the other variables at the end. + * + * Typically, read-mostly variables are those that are set at mount time and + * never changed again, or only change rarely as a result of things like sysfs + * knobs being tweaked. + */ typedef struct xfs_mount { + struct xfs_sb m_sb; /* copy of fs superblock */ struct super_block *m_super; - - /* - * Bitsets of per-fs metadata that have been checked and/or are sick. - * Callers must hold m_sb_lock to access these two fields. - */ - uint8_t m_fs_checked; - uint8_t m_fs_sick; - /* - * Bitsets of rt metadata that have been checked and/or are sick. - * Callers must hold m_sb_lock to access this field. - */ - uint8_t m_rt_checked; - uint8_t m_rt_sick; - struct xfs_ail *m_ail; /* fs active log item list */ - - struct xfs_sb m_sb; /* copy of fs superblock */ - spinlock_t m_sb_lock; /* sb counter lock */ - struct percpu_counter m_icount; /* allocated inodes counter */ - struct percpu_counter m_ifree; /* free inodes counter */ - struct percpu_counter m_fdblocks; /* free block counter */ - /* - * Count of data device blocks reserved for delayed allocations, - * including indlen blocks. Does not include allocated CoW staging - * extents or anything related to the rt device. - */ - struct percpu_counter m_delalloc_blks; - struct xfs_buf *m_sb_bp; /* buffer for superblock */ char *m_rtname; /* realtime device name */ char *m_logname; /* external log device name */ - int m_bsize; /* fs logical block size */ - xfs_agnumber_t m_agfrotor; /* last ag where space found */ - xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ - spinlock_t m_agirotor_lock;/* .. and lock protecting it */ - xfs_agnumber_t m_maxagi; /* highest inode alloc group */ - uint m_allocsize_log;/* min write size log bytes */ - uint m_allocsize_blocks; /* min write size blocks */ struct xfs_da_geometry *m_dir_geo; /* directory block geometry */ struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */ struct xlog *m_log; /* log specific stuff */ - struct xfs_ino_geometry m_ino_geo; /* inode geometry */ - int m_logbufs; /* number of log buffers */ - int m_logbsize; /* size of each log buffer */ - uint m_rsumlevels; /* rt summary levels */ - uint m_rsumsize; /* size of rt summary, bytes */ - /* - * Optional cache of rt summary level per bitmap block with the - * invariant that m_rsum_cache[bbno] <= the minimum i for which - * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip - * inode lock. - */ - uint8_t *m_rsum_cache; struct xfs_inode *m_rbmip; /* pointer to bitmap inode */ struct xfs_inode *m_rsumip; /* pointer to summary inode */ struct xfs_inode *m_rootip; /* pointer to root directory */ @@ -117,9 +81,26 @@ typedef struct xfs_mount { xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ + /* + * Optional cache of rt summary level per bitmap block with the + * invariant that m_rsum_cache[bbno] <= the minimum i for which + * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip + * inode lock. + */ + uint8_t *m_rsum_cache; + struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ + struct workqueue_struct *m_buf_workqueue; + struct workqueue_struct *m_unwritten_workqueue; + struct workqueue_struct *m_cil_workqueue; + struct workqueue_struct *m_reclaim_workqueue; + struct workqueue_struct *m_eofblocks_workqueue; + struct workqueue_struct *m_sync_workqueue; + + int m_bsize; /* fs logical block size */ uint8_t m_blkbit_log; /* blocklog + NBBY */ uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ uint8_t m_agno_log; /* log #ag's */ + uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ uint m_blockwmask; /* blockwsize-1 */ @@ -138,47 +119,82 @@ typedef struct xfs_mount { xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */ uint m_alloc_set_aside; /* space we can't use */ uint m_ag_max_usable; /* max space per AG */ - struct radix_tree_root m_perag_tree; /* per-ag accounting info */ - spinlock_t m_perag_lock; /* lock for m_perag_tree */ - struct mutex m_growlock; /* growfs mutex */ + int m_dalign; /* stripe unit */ + int m_swidth; /* stripe width */ + xfs_agnumber_t m_maxagi; /* highest inode alloc group */ + uint m_allocsize_log;/* min write size log bytes */ + uint m_allocsize_blocks; /* min write size blocks */ + int m_logbufs; /* number of log buffers */ + int m_logbsize; /* size of each log buffer */ + uint m_rsumlevels; /* rt summary levels */ + uint m_rsumsize; /* size of rt summary, bytes */ int m_fixedfsid[2]; /* unchanged for life of FS */ - uint64_t m_flags; /* global mount flags */ - bool m_finobt_nores; /* no per-AG finobt resv. */ uint m_qflags; /* quota status flags */ + uint64_t m_flags; /* global mount flags */ + int64_t m_low_space[XFS_LOWSP_MAX]; + struct xfs_ino_geometry m_ino_geo; /* inode geometry */ struct xfs_trans_resv m_resv; /* precomputed res values */ + /* low free space thresholds */ + bool m_always_cow; + bool m_fail_unmount; + bool m_finobt_nores; /* no per-AG finobt resv. */ + bool m_update_sb; /* sb needs update in mount */ + + /* + * Bitsets of per-fs metadata that have been checked and/or are sick. + * Callers must hold m_sb_lock to access these two fields. + */ + uint8_t m_fs_checked; + uint8_t m_fs_sick; + /* + * Bitsets of rt metadata that have been checked and/or are sick. + * Callers must hold m_sb_lock to access this field. + */ + uint8_t m_rt_checked; + uint8_t m_rt_sick; + + /* + * End of read-mostly variables. Frequently written variables and locks + * should be placed below this comment from now on. The first variable + * here is marked as cacheline aligned so they it is separated from + * the read-mostly variables. + */ + + spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */ + struct percpu_counter m_icount; /* allocated inodes counter */ + struct percpu_counter m_ifree; /* free inodes counter */ + struct percpu_counter m_fdblocks; /* free block counter */ + /* + * Count of data device blocks reserved for delayed allocations, + * including indlen blocks. Does not include allocated CoW staging + * extents or anything related to the rt device. + */ + struct percpu_counter m_delalloc_blks; + + struct radix_tree_root m_perag_tree; /* per-ag accounting info */ + spinlock_t m_perag_lock; /* lock for m_perag_tree */ uint64_t m_resblks; /* total reserved blocks */ uint64_t m_resblks_avail;/* available reserved blocks */ uint64_t m_resblks_save; /* reserved blks @ remount,ro */ - int m_dalign; /* stripe unit */ - int m_swidth; /* stripe width */ - uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ - atomic_t m_active_trans; /* number trans frozen */ - struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct delayed_work m_reclaim_work; /* background inode reclaim */ struct delayed_work m_eofblocks_work; /* background eof blocks trimming */ struct delayed_work m_cowblocks_work; /* background cow blocks trimming */ - bool m_update_sb; /* sb needs update in mount */ - int64_t m_low_space[XFS_LOWSP_MAX]; - /* low free space thresholds */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; struct xfs_kobj m_error_meta_kobj; struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; struct xstats m_stats; /* per-fs stats */ + xfs_agnumber_t m_agfrotor; /* last ag where space found */ + xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ + spinlock_t m_agirotor_lock;/* .. and lock protecting it */ /* * Workqueue item so that we can coalesce multiple inode flush attempts * into a single flush. */ struct work_struct m_flush_inodes_work; - struct workqueue_struct *m_buf_workqueue; - struct workqueue_struct *m_unwritten_workqueue; - struct workqueue_struct *m_cil_workqueue; - struct workqueue_struct *m_reclaim_workqueue; - struct workqueue_struct *m_eofblocks_workqueue; - struct workqueue_struct *m_sync_workqueue; /* * Generation of the filesysyem layout. This is incremented by each @@ -190,9 +206,8 @@ typedef struct xfs_mount { * to various other kinds of pain inflicted on the pNFS server. */ uint32_t m_generation; + struct mutex m_growlock; /* growfs mutex */ - bool m_always_cow; - bool m_fail_unmount; #ifdef DEBUG /* * Frequency with which errors are injected. Replaces xfs_etest; the @@ -237,8 +252,8 @@ typedef struct xfs_mount { #define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams allocator */ #define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ - -#define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */ +#define XFS_MOUNT_DAX_ALWAYS (1ULL << 26) +#define XFS_MOUNT_DAX_NEVER (1ULL << 27) /* * Max and min values for mount-option defined I/O @@ -259,8 +274,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, #define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ #define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */ #define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */ -#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */ -#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */ /* * Flags for xfs_mountfs @@ -394,8 +407,6 @@ extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, xfs_agnumber_t *maxagi); extern void xfs_unmountfs(xfs_mount_t *); -extern int xfs_mod_icount(struct xfs_mount *mp, int64_t delta); -extern int xfs_mod_ifree(struct xfs_mount *mp, int64_t delta); extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved); extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index bb3008d390aa..b101feb2aab4 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -58,9 +58,8 @@ xfs_fs_get_uuid( { struct xfs_mount *mp = XFS_M(sb); - printk_once(KERN_NOTICE -"XFS (%s): using experimental pNFS feature, use at your own risk!\n", - mp->m_super->s_id); + xfs_notice_once(mp, +"Using experimental pNFS feature, use at your own risk!"); if (*len < sizeof(uuid_t)) return -EINVAL; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index c225691fad15..d6cd83317344 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -558,7 +558,7 @@ xfs_qm_set_defquota( return; ddqp = &dqp->q_core; - defq = xfs_get_defquota(dqp, qinf); + defq = xfs_get_defquota(qinf, xfs_dquot_type(dqp)); /* * Timers and warnings have been already set, let's just set the @@ -577,19 +577,22 @@ xfs_qm_set_defquota( static void xfs_qm_init_timelimits( struct xfs_mount *mp, - struct xfs_quotainfo *qinf) + uint type) { + struct xfs_quotainfo *qinf = mp->m_quotainfo; + struct xfs_def_quota *defq; struct xfs_disk_dquot *ddqp; struct xfs_dquot *dqp; - uint type; int error; - qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; - qinf->qi_itimelimit = XFS_QM_ITIMELIMIT; - qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; - qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; - qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; - qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; + defq = xfs_get_defquota(qinf, type); + + defq->btimelimit = XFS_QM_BTIMELIMIT; + defq->itimelimit = XFS_QM_ITIMELIMIT; + defq->rtbtimelimit = XFS_QM_RTBTIMELIMIT; + defq->bwarnlimit = XFS_QM_BWARNLIMIT; + defq->iwarnlimit = XFS_QM_IWARNLIMIT; + defq->rtbwarnlimit = XFS_QM_RTBWARNLIMIT; /* * We try to get the limits from the superuser's limits fields. @@ -597,39 +600,30 @@ xfs_qm_init_timelimits( * * Since we may not have done a quotacheck by this point, just read * the dquot without attaching it to any hashtables or lists. - * - * Timers and warnings are globally set by the first timer found in - * user/group/proj quota types, otherwise a default value is used. - * This should be split into different fields per quota type. */ - if (XFS_IS_UQUOTA_RUNNING(mp)) - type = XFS_DQ_USER; - else if (XFS_IS_GQUOTA_RUNNING(mp)) - type = XFS_DQ_GROUP; - else - type = XFS_DQ_PROJ; error = xfs_qm_dqget_uncached(mp, 0, type, &dqp); if (error) return; ddqp = &dqp->q_core; + /* * The warnings and timers set the grace period given to * a user or group before he or she can not perform any * more writing. If it is zero, a default is used. */ if (ddqp->d_btimer) - qinf->qi_btimelimit = be32_to_cpu(ddqp->d_btimer); + defq->btimelimit = be32_to_cpu(ddqp->d_btimer); if (ddqp->d_itimer) - qinf->qi_itimelimit = be32_to_cpu(ddqp->d_itimer); + defq->itimelimit = be32_to_cpu(ddqp->d_itimer); if (ddqp->d_rtbtimer) - qinf->qi_rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer); + defq->rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer); if (ddqp->d_bwarns) - qinf->qi_bwarnlimit = be16_to_cpu(ddqp->d_bwarns); + defq->bwarnlimit = be16_to_cpu(ddqp->d_bwarns); if (ddqp->d_iwarns) - qinf->qi_iwarnlimit = be16_to_cpu(ddqp->d_iwarns); + defq->iwarnlimit = be16_to_cpu(ddqp->d_iwarns); if (ddqp->d_rtbwarns) - qinf->qi_rtbwarnlimit = be16_to_cpu(ddqp->d_rtbwarns); + defq->rtbwarnlimit = be16_to_cpu(ddqp->d_rtbwarns); xfs_qm_dqdestroy(dqp); } @@ -675,7 +669,9 @@ xfs_qm_init_quotainfo( mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); - xfs_qm_init_timelimits(mp, qinf); + xfs_qm_init_timelimits(mp, XFS_DQ_USER); + xfs_qm_init_timelimits(mp, XFS_DQ_GROUP); + xfs_qm_init_timelimits(mp, XFS_DQ_PROJ); if (XFS_IS_UQUOTA_RUNNING(mp)) xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf); @@ -780,7 +776,8 @@ xfs_qm_qino_alloc( } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create, - XFS_QM_QINOCREATE_SPACE_RES(mp), 0, 0, &tp); + need_alloc ? XFS_QM_QINOCREATE_SPACE_RES(mp) : 0, + 0, 0, &tp); if (error) return error; @@ -1116,7 +1113,7 @@ xfs_qm_quotacheck_dqadjust( */ if (dqp->q_core.d_id) { xfs_qm_adjust_dqlimits(mp, dqp); - xfs_qm_adjust_dqtimers(mp, &dqp->q_core); + xfs_qm_adjust_dqtimers(mp, dqp); } dqp->dq_flags |= XFS_DQ_DIRTY; @@ -1730,8 +1727,7 @@ xfs_qm_vop_dqalloc( pq = xfs_qm_dqhold(ip->i_pdquot); } } - if (uq) - trace_xfs_dquot_dqalloc(ip); + trace_xfs_dquot_dqalloc(ip); xfs_iunlock(ip, lockflags); if (O_udqpp) @@ -1808,7 +1804,7 @@ xfs_qm_vop_chown_reserve( { struct xfs_mount *mp = ip->i_mount; uint64_t delblks; - unsigned int blkflags, prjflags = 0; + unsigned int blkflags; struct xfs_dquot *udq_unres = NULL; struct xfs_dquot *gdq_unres = NULL; struct xfs_dquot *pdq_unres = NULL; @@ -1849,7 +1845,6 @@ xfs_qm_vop_chown_reserve( if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp && ip->i_d.di_projid != be32_to_cpu(pdqp->q_core.d_id)) { - prjflags = XFS_QMOPT_ENOSPC; pdq_delblks = pdqp; if (delblks) { ASSERT(ip->i_pdquot); @@ -1859,8 +1854,7 @@ xfs_qm_vop_chown_reserve( error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, udq_delblks, gdq_delblks, pdq_delblks, - ip->i_d.di_nblocks, 1, - flags | blkflags | prjflags); + ip->i_d.di_nblocks, 1, flags | blkflags); if (error) return error; @@ -1878,8 +1872,7 @@ xfs_qm_vop_chown_reserve( ASSERT(udq_unres || gdq_unres || pdq_unres); error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, udq_delblks, gdq_delblks, pdq_delblks, - (xfs_qcnt_t)delblks, 0, - flags | blkflags | prjflags); + (xfs_qcnt_t)delblks, 0, flags | blkflags); if (error) return error; xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, @@ -1932,7 +1925,6 @@ xfs_qm_vop_create_dqattach( return; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 4e57edca8bce..7b0e771fcbce 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -41,13 +41,20 @@ extern struct kmem_zone *xfs_qm_dqtrxzone; */ #define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 +/* Defaults for each quota type: time limits, warn limits, usage limits */ struct xfs_def_quota { - xfs_qcnt_t bhardlimit; /* default data blk hard limit */ - xfs_qcnt_t bsoftlimit; /* default data blk soft limit */ - xfs_qcnt_t ihardlimit; /* default inode count hard limit */ - xfs_qcnt_t isoftlimit; /* default inode count soft limit */ - xfs_qcnt_t rtbhardlimit; /* default realtime blk hard limit */ - xfs_qcnt_t rtbsoftlimit; /* default realtime blk soft limit */ + time64_t btimelimit; /* limit for blks timer */ + time64_t itimelimit; /* limit for inodes timer */ + time64_t rtbtimelimit; /* limit for rt blks timer */ + xfs_qwarncnt_t bwarnlimit; /* limit for blks warnings */ + xfs_qwarncnt_t iwarnlimit; /* limit for inodes warnings */ + xfs_qwarncnt_t rtbwarnlimit; /* limit for rt blks warnings */ + xfs_qcnt_t bhardlimit; /* default data blk hard limit */ + xfs_qcnt_t bsoftlimit; /* default data blk soft limit */ + xfs_qcnt_t ihardlimit; /* default inode count hard limit */ + xfs_qcnt_t isoftlimit; /* default inode count soft limit */ + xfs_qcnt_t rtbhardlimit; /* default realtime blk hard limit */ + xfs_qcnt_t rtbsoftlimit; /* default realtime blk soft limit */ }; /* @@ -55,28 +62,22 @@ struct xfs_def_quota { * The mount structure keeps a pointer to this. */ struct xfs_quotainfo { - struct radix_tree_root qi_uquota_tree; - struct radix_tree_root qi_gquota_tree; - struct radix_tree_root qi_pquota_tree; - struct mutex qi_tree_lock; + struct radix_tree_root qi_uquota_tree; + struct radix_tree_root qi_gquota_tree; + struct radix_tree_root qi_pquota_tree; + struct mutex qi_tree_lock; struct xfs_inode *qi_uquotaip; /* user quota inode */ struct xfs_inode *qi_gquotaip; /* group quota inode */ struct xfs_inode *qi_pquotaip; /* project quota inode */ - struct list_lru qi_lru; - int qi_dquots; - time64_t qi_btimelimit; /* limit for blks timer */ - time64_t qi_itimelimit; /* limit for inodes timer */ - time64_t qi_rtbtimelimit;/* limit for rt blks timer */ - xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */ - xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */ - xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */ - struct mutex qi_quotaofflock;/* to serialize quotaoff */ - xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ - uint qi_dqperchunk; /* # ondisk dqs in above chunk */ + struct list_lru qi_lru; + int qi_dquots; + struct mutex qi_quotaofflock;/* to serialize quotaoff */ + xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ + uint qi_dqperchunk; /* # ondisk dq in above chunk */ struct xfs_def_quota qi_usr_default; struct xfs_def_quota qi_grp_default; struct xfs_def_quota qi_prj_default; - struct shrinker qi_shrinker; + struct shrinker qi_shrinker; }; static inline struct radix_tree_root * @@ -113,6 +114,17 @@ xfs_quota_inode(xfs_mount_t *mp, uint dq_flags) return NULL; } +static inline int +xfs_dquot_type(struct xfs_dquot *dqp) +{ + if (XFS_QM_ISUDQ(dqp)) + return XFS_DQ_USER; + if (XFS_QM_ISGDQ(dqp)) + return XFS_DQ_GROUP; + ASSERT(XFS_QM_ISPDQ(dqp)); + return XFS_DQ_PROJ; +} + extern void xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp, uint field, int64_t delta); extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *); @@ -164,19 +176,19 @@ extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint); static inline struct xfs_def_quota * -xfs_get_defquota(struct xfs_dquot *dqp, struct xfs_quotainfo *qi) +xfs_get_defquota(struct xfs_quotainfo *qi, int type) { - struct xfs_def_quota *defq; - - if (XFS_QM_ISUDQ(dqp)) - defq = &qi->qi_usr_default; - else if (XFS_QM_ISGDQ(dqp)) - defq = &qi->qi_grp_default; - else { - ASSERT(XFS_QM_ISPDQ(dqp)); - defq = &qi->qi_prj_default; + switch (type) { + case XFS_DQ_USER: + return &qi->qi_usr_default; + case XFS_DQ_GROUP: + return &qi->qi_grp_default; + case XFS_DQ_PROJ: + return &qi->qi_prj_default; + default: + ASSERT(0); + return NULL; } - return defq; } #endif /* __XFS_QM_H__ */ diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 5d5ac65aa1cc..7effd7a28136 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -302,7 +302,7 @@ xfs_qm_scall_trunc_qfile( goto out_unlock; } - ASSERT(ip->i_d.di_nextents == 0); + ASSERT(ip->i_df.if_nextents == 0); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); error = xfs_trans_commit(tp); @@ -357,11 +357,11 @@ xfs_qm_scall_quotaon( int error; uint qf; - flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); /* - * Switching on quota accounting must be done at mount time. + * Switching on quota accounting must be done at mount time, + * only consider quota enforcement stuff here. */ - flags &= ~(XFS_ALL_QUOTA_ACCT); + flags &= XFS_ALL_QUOTA_ENFD; if (flags == 0) { xfs_debug(mp, "%s: zero flags, m_qflags=%x", @@ -479,7 +479,7 @@ xfs_qm_scall_setqlim( goto out_unlock; } - defq = xfs_get_defquota(dqp, q); + defq = xfs_get_defquota(q, xfs_dquot_type(dqp)); xfs_dqunlock(dqp); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp); @@ -555,32 +555,40 @@ xfs_qm_scall_setqlim( ddq->d_rtbwarns = cpu_to_be16(newlim->d_rt_spc_warns); if (id == 0) { - /* - * Timelimits for the super user set the relative time - * the other users can be over quota for this file system. - * If it is zero a default is used. Ditto for the default - * soft and hard limit values (already done, above), and - * for warnings. - */ - if (newlim->d_fieldmask & QC_SPC_TIMER) { - q->qi_btimelimit = newlim->d_spc_timer; - ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer); - } - if (newlim->d_fieldmask & QC_INO_TIMER) { - q->qi_itimelimit = newlim->d_ino_timer; - ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer); - } - if (newlim->d_fieldmask & QC_RT_SPC_TIMER) { - q->qi_rtbtimelimit = newlim->d_rt_spc_timer; - ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer); - } if (newlim->d_fieldmask & QC_SPC_WARNS) - q->qi_bwarnlimit = newlim->d_spc_warns; + defq->bwarnlimit = newlim->d_spc_warns; if (newlim->d_fieldmask & QC_INO_WARNS) - q->qi_iwarnlimit = newlim->d_ino_warns; + defq->iwarnlimit = newlim->d_ino_warns; if (newlim->d_fieldmask & QC_RT_SPC_WARNS) - q->qi_rtbwarnlimit = newlim->d_rt_spc_warns; - } else { + defq->rtbwarnlimit = newlim->d_rt_spc_warns; + } + + /* + * Timelimits for the super user set the relative time the other users + * can be over quota for this file system. If it is zero a default is + * used. Ditto for the default soft and hard limit values (already + * done, above), and for warnings. + * + * For other IDs, userspace can bump out the grace period if over + * the soft limit. + */ + if (newlim->d_fieldmask & QC_SPC_TIMER) + ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer); + if (newlim->d_fieldmask & QC_INO_TIMER) + ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer); + if (newlim->d_fieldmask & QC_RT_SPC_TIMER) + ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer); + + if (id == 0) { + if (newlim->d_fieldmask & QC_SPC_TIMER) + defq->btimelimit = newlim->d_spc_timer; + if (newlim->d_fieldmask & QC_INO_TIMER) + defq->itimelimit = newlim->d_ino_timer; + if (newlim->d_fieldmask & QC_RT_SPC_TIMER) + defq->rtbtimelimit = newlim->d_rt_spc_timer; + } + + if (id != 0) { /* * If the user is now over quota, start the timelimit. * The user will not be 'warned'. @@ -588,7 +596,7 @@ xfs_qm_scall_setqlim( * is on or off. We don't really want to bother with iterating * over all ondisk dquots and turning the timers on/off. */ - xfs_qm_adjust_dqtimers(mp, ddq); + xfs_qm_adjust_dqtimers(mp, dqp); } dqp->dq_flags |= XFS_DQ_DIRTY; xfs_trans_log_dquot(tp, dqp); @@ -729,9 +737,10 @@ xfs_qm_scall_getquota_next( STATIC int xfs_dqrele_inode( struct xfs_inode *ip, - int flags, void *args) { + uint *flags = args; + /* skip quota inodes */ if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || ip == ip->i_mount->m_quotainfo->qi_gquotaip || @@ -743,15 +752,15 @@ xfs_dqrele_inode( } xfs_ilock(ip, XFS_ILOCK_EXCL); - if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { + if ((*flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { xfs_qm_dqrele(ip->i_udquot); ip->i_udquot = NULL; } - if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) { + if ((*flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) { xfs_qm_dqrele(ip->i_gdquot); ip->i_gdquot = NULL; } - if ((flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) { + if ((*flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) { xfs_qm_dqrele(ip->i_pdquot); ip->i_pdquot = NULL; } @@ -768,10 +777,10 @@ xfs_dqrele_inode( */ void xfs_qm_dqrele_all_inodes( - struct xfs_mount *mp, - uint flags) + struct xfs_mount *mp, + uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator_flags(mp, xfs_dqrele_inode, flags, NULL, - XFS_AGITER_INEW_WAIT); + xfs_inode_walk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode, + &flags, XFS_ICI_NO_TAG); } diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 38669e827206..bf809b77a316 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -21,10 +21,10 @@ xfs_qm_fill_state( struct qc_type_state *tstate, struct xfs_mount *mp, struct xfs_inode *ip, - xfs_ino_t ino) + xfs_ino_t ino, + struct xfs_def_quota *defq) { - struct xfs_quotainfo *q = mp->m_quotainfo; - bool tempqip = false; + bool tempqip = false; tstate->ino = ino; if (!ip && ino == NULLFSINO) @@ -36,13 +36,13 @@ xfs_qm_fill_state( } tstate->flags |= QCI_SYSFILE; tstate->blocks = ip->i_d.di_nblocks; - tstate->nextents = ip->i_d.di_nextents; - tstate->spc_timelimit = (u32)q->qi_btimelimit; - tstate->ino_timelimit = (u32)q->qi_itimelimit; - tstate->rt_spc_timelimit = (u32)q->qi_rtbtimelimit; - tstate->spc_warnlimit = q->qi_bwarnlimit; - tstate->ino_warnlimit = q->qi_iwarnlimit; - tstate->rt_spc_warnlimit = q->qi_rtbwarnlimit; + tstate->nextents = ip->i_df.if_nextents; + tstate->spc_timelimit = (u32)defq->btimelimit; + tstate->ino_timelimit = (u32)defq->itimelimit; + tstate->rt_spc_timelimit = (u32)defq->rtbtimelimit; + tstate->spc_warnlimit = defq->bwarnlimit; + tstate->ino_warnlimit = defq->iwarnlimit; + tstate->rt_spc_warnlimit = defq->rtbwarnlimit; if (tempqip) xfs_irele(ip); } @@ -77,11 +77,11 @@ xfs_fs_get_quota_state( state->s_state[PRJQUOTA].flags |= QCI_LIMITS_ENFORCED; xfs_qm_fill_state(&state->s_state[USRQUOTA], mp, q->qi_uquotaip, - mp->m_sb.sb_uquotino); + mp->m_sb.sb_uquotino, &q->qi_usr_default); xfs_qm_fill_state(&state->s_state[GRPQUOTA], mp, q->qi_gquotaip, - mp->m_sb.sb_gquotino); + mp->m_sb.sb_gquotino, &q->qi_grp_default); xfs_qm_fill_state(&state->s_state[PRJQUOTA], mp, q->qi_pquotaip, - mp->m_sb.sb_pquotino); + mp->m_sb.sb_pquotino, &q->qi_prj_default); return 0; } @@ -109,8 +109,8 @@ xfs_fs_set_info( int type, struct qc_info *info) { - struct xfs_mount *mp = XFS_M(sb); - struct qc_dqblk newlim; + struct xfs_mount *mp = XFS_M(sb); + struct qc_dqblk newlim; if (sb_rdonly(sb)) return -EROFS; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 8eeed73928cd..c81639891e29 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -18,16 +18,20 @@ #include "xfs_log.h" #include "xfs_refcount.h" #include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" kmem_zone_t *xfs_cui_zone; kmem_zone_t *xfs_cud_zone; +static const struct xfs_item_ops xfs_cui_item_ops; + static inline struct xfs_cui_log_item *CUI_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_cui_log_item, cui_item); } -void +STATIC void xfs_cui_item_free( struct xfs_cui_log_item *cuip) { @@ -44,13 +48,13 @@ xfs_cui_item_free( * committed vs unpin operations in bulk insert operations. Hence the reference * count to ensure only the last caller frees the CUI. */ -void +STATIC void xfs_cui_release( struct xfs_cui_log_item *cuip) { ASSERT(atomic_read(&cuip->cui_refcount) > 0); if (atomic_dec_and_test(&cuip->cui_refcount)) { - xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); xfs_cui_item_free(cuip); } } @@ -123,17 +127,10 @@ xfs_cui_item_release( xfs_cui_release(CUI_ITEM(lip)); } -static const struct xfs_item_ops xfs_cui_item_ops = { - .iop_size = xfs_cui_item_size, - .iop_format = xfs_cui_item_format, - .iop_unpin = xfs_cui_item_unpin, - .iop_release = xfs_cui_item_release, -}; - /* * Allocate and initialize an cui item with the given number of extents. */ -struct xfs_cui_log_item * +STATIC struct xfs_cui_log_item * xfs_cui_init( struct xfs_mount *mp, uint nextents) @@ -284,27 +281,6 @@ xfs_refcount_update_diff_items( XFS_FSB_TO_AGNO(mp, rb->ri_startblock); } -/* Get an CUI. */ -STATIC void * -xfs_refcount_update_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_cui_log_item *cuip; - - ASSERT(tp != NULL); - ASSERT(count > 0); - - cuip = xfs_cui_init(tp->t_mountp, count); - ASSERT(cuip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &cuip->cui_item); - return cuip; -} - /* Set the phys extent flags for this reverse mapping. */ static void xfs_trans_set_refcount_flags( @@ -328,16 +304,12 @@ xfs_trans_set_refcount_flags( STATIC void xfs_refcount_update_log_item( struct xfs_trans *tp, - void *intent, - struct list_head *item) + struct xfs_cui_log_item *cuip, + struct xfs_refcount_intent *refc) { - struct xfs_cui_log_item *cuip = intent; - struct xfs_refcount_intent *refc; uint next_extent; struct xfs_phys_extent *ext; - refc = container_of(item, struct xfs_refcount_intent, ri_list); - tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); @@ -354,23 +326,44 @@ xfs_refcount_update_log_item( xfs_trans_set_refcount_flags(ext, refc->ri_type); } +static struct xfs_log_item * +xfs_refcount_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count); + struct xfs_refcount_intent *refc; + + ASSERT(count > 0); + + xfs_trans_add_item(tp, &cuip->cui_item); + if (sort) + list_sort(mp, items, xfs_refcount_update_diff_items); + list_for_each_entry(refc, items, ri_list) + xfs_refcount_update_log_item(tp, cuip, refc); + return &cuip->cui_item; +} + /* Get an CUD so we can process all the deferred refcount updates. */ -STATIC void * +static struct xfs_log_item * xfs_refcount_update_create_done( struct xfs_trans *tp, - void *intent, + struct xfs_log_item *intent, unsigned int count) { - return xfs_trans_get_cud(tp, intent); + return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item; } /* Process a deferred refcount update. */ STATIC int xfs_refcount_update_finish_item( struct xfs_trans *tp, + struct xfs_log_item *done, struct list_head *item, - void *done_item, - void **state) + struct xfs_btree_cur **state) { struct xfs_refcount_intent *refc; xfs_fsblock_t new_fsb; @@ -378,12 +371,10 @@ xfs_refcount_update_finish_item( int error; refc = container_of(item, struct xfs_refcount_intent, ri_list); - error = xfs_trans_log_finish_refcount_update(tp, done_item, - refc->ri_type, - refc->ri_startblock, - refc->ri_blockcount, - &new_fsb, &new_aglen, - (struct xfs_btree_cur **)state); + error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), + refc->ri_type, refc->ri_startblock, refc->ri_blockcount, + &new_fsb, &new_aglen, state); + /* Did we run out of reservation? Requeue what we didn't finish. */ if (!error && new_aglen > 0) { ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE || @@ -396,24 +387,12 @@ xfs_refcount_update_finish_item( return error; } -/* Clean up after processing deferred refcounts. */ -STATIC void -xfs_refcount_update_finish_cleanup( - struct xfs_trans *tp, - void *state, - int error) -{ - struct xfs_btree_cur *rcur = state; - - xfs_refcount_finish_one_cleanup(tp, rcur, error); -} - /* Abort all pending CUIs. */ STATIC void xfs_refcount_update_abort_intent( - void *intent) + struct xfs_log_item *intent) { - xfs_cui_release(intent); + xfs_cui_release(CUI_ITEM(intent)); } /* Cancel a deferred refcount update. */ @@ -429,13 +408,11 @@ xfs_refcount_update_cancel_item( const struct xfs_defer_op_type xfs_refcount_update_defer_type = { .max_items = XFS_CUI_MAX_FAST_EXTENTS, - .diff_items = xfs_refcount_update_diff_items, .create_intent = xfs_refcount_update_create_intent, .abort_intent = xfs_refcount_update_abort_intent, - .log_item = xfs_refcount_update_log_item, .create_done = xfs_refcount_update_create_done, .finish_item = xfs_refcount_update_finish_item, - .finish_cleanup = xfs_refcount_update_finish_cleanup, + .finish_cleanup = xfs_refcount_finish_one_cleanup, .cancel_item = xfs_refcount_update_cancel_item, }; @@ -443,28 +420,27 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = { * Process a refcount update intent item that was recovered from the log. * We need to update the refcountbt. */ -int -xfs_cui_recover( - struct xfs_trans *parent_tp, - struct xfs_cui_log_item *cuip) +STATIC int +xfs_cui_item_recover( + struct xfs_log_item *lip, + struct xfs_trans *parent_tp) { - int i; - int error = 0; - unsigned int refc_type; + struct xfs_bmbt_irec irec; + struct xfs_cui_log_item *cuip = CUI_ITEM(lip); struct xfs_phys_extent *refc; - xfs_fsblock_t startblock_fsb; - bool op_ok; struct xfs_cud_log_item *cudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - enum xfs_refcount_intent_type type; + struct xfs_mount *mp = parent_tp->t_mountp; + xfs_fsblock_t startblock_fsb; xfs_fsblock_t new_fsb; xfs_extlen_t new_len; - struct xfs_bmbt_irec irec; + unsigned int refc_type; + bool op_ok; bool requeue_only = false; - struct xfs_mount *mp = parent_tp->t_mountp; - - ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)); + enum xfs_refcount_intent_type type; + int i; + int error = 0; /* * First check the validity of the extents described by the @@ -495,7 +471,6 @@ xfs_cui_recover( * This will pull the CUI from the AIL and * free the memory associated with it. */ - set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); xfs_cui_release(cuip); return -EFSCORRUPTED; } @@ -579,7 +554,6 @@ xfs_cui_recover( } xfs_refcount_finish_one_cleanup(tp, rcur, error); - set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); xfs_defer_move(parent_tp, tp); error = xfs_trans_commit(tp); return error; @@ -590,3 +564,117 @@ abort_error: xfs_trans_cancel(tp); return error; } + +STATIC bool +xfs_cui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return CUI_ITEM(lip)->cui_format.cui_id == intent_id; +} + +static const struct xfs_item_ops xfs_cui_item_ops = { + .iop_size = xfs_cui_item_size, + .iop_format = xfs_cui_item_format, + .iop_unpin = xfs_cui_item_unpin, + .iop_release = xfs_cui_item_release, + .iop_recover = xfs_cui_item_recover, + .iop_match = xfs_cui_item_match, +}; + +/* + * Copy an CUI format buffer from the given buf, and into the destination + * CUI format structure. The CUI/CUD items were designed not to need any + * special alignment handling. + */ +static int +xfs_cui_copy_format( + struct xfs_log_iovec *buf, + struct xfs_cui_log_format *dst_cui_fmt) +{ + struct xfs_cui_log_format *src_cui_fmt; + uint len; + + src_cui_fmt = buf->i_addr; + len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents); + + if (buf->i_len == len) { + memcpy(dst_cui_fmt, src_cui_fmt, len); + return 0; + } + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + return -EFSCORRUPTED; +} + +/* + * This routine is called to create an in-core extent refcount update + * item from the cui format structure which was logged on disk. + * It allocates an in-core cui, copies the extents from the format + * structure into it, and adds the cui to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_cui_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_cui_log_item *cuip; + struct xfs_cui_log_format *cui_formatp; + + cui_formatp = item->ri_buf[0].i_addr; + + cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); + error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format); + if (error) { + xfs_cui_item_free(cuip); + return error; + } + atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); + /* + * Insert the intent into the AIL directly and drop one reference so + * that finishing or canceling the work will drop the other. + */ + xfs_trans_ail_insert(log->l_ailp, &cuip->cui_item, lsn); + xfs_cui_release(cuip); + return 0; +} + +const struct xlog_recover_item_ops xlog_cui_item_ops = { + .item_type = XFS_LI_CUI, + .commit_pass2 = xlog_recover_cui_commit_pass2, +}; + +/* + * This routine is called when an CUD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding CUI if it + * was still in the log. To do this it searches the AIL for the CUI with an id + * equal to that in the CUD format structure. If we find it we drop the CUD + * reference, which removes the CUI from the AIL and frees it. + */ +STATIC int +xlog_recover_cud_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_cud_log_format *cud_formatp; + + cud_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + + xlog_recover_release_intent(log, XFS_LI_CUI, cud_formatp->cud_cui_id); + return 0; +} + +const struct xlog_recover_item_ops xlog_cud_item_ops = { + .item_type = XFS_LI_CUD, + .commit_pass2 = xlog_recover_cud_commit_pass2, +}; diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h index e47530f30489..f4f2e836540b 100644 --- a/fs/xfs/xfs_refcount_item.h +++ b/fs/xfs/xfs_refcount_item.h @@ -33,11 +33,6 @@ struct kmem_zone; #define XFS_CUI_MAX_FAST_EXTENTS 16 /* - * Define CUI flag bits. Manipulated by set/clear/test_bit operators. - */ -#define XFS_CUI_RECOVERED 1 - -/* * This is the "refcount update intent" log item. It is used to log * the fact that some reverse mappings need to change. It is used in * conjunction with the "refcount update done" log item described @@ -51,7 +46,6 @@ struct xfs_cui_log_item { struct xfs_log_item cui_item; atomic_t cui_refcount; atomic_t cui_next_extent; - unsigned long cui_flags; /* misc flags */ struct xfs_cui_log_format cui_format; }; @@ -77,9 +71,4 @@ struct xfs_cud_log_item { extern struct kmem_zone *xfs_cui_zone; extern struct kmem_zone *xfs_cud_zone; -struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint); -void xfs_cui_item_free(struct xfs_cui_log_item *); -void xfs_cui_release(struct xfs_cui_log_item *); -int xfs_cui_recover(struct xfs_trans *parent_tp, struct xfs_cui_log_item *cuip); - #endif /* __XFS_REFCOUNT_ITEM_H__ */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 4911b68f95dd..a86599db20a6 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -18,16 +18,20 @@ #include "xfs_log.h" #include "xfs_rmap.h" #include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" kmem_zone_t *xfs_rui_zone; kmem_zone_t *xfs_rud_zone; +static const struct xfs_item_ops xfs_rui_item_ops; + static inline struct xfs_rui_log_item *RUI_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_rui_log_item, rui_item); } -void +STATIC void xfs_rui_item_free( struct xfs_rui_log_item *ruip) { @@ -44,13 +48,13 @@ xfs_rui_item_free( * committed vs unpin operations in bulk insert operations. Hence the reference * count to ensure only the last caller frees the RUI. */ -void +STATIC void xfs_rui_release( struct xfs_rui_log_item *ruip) { ASSERT(atomic_read(&ruip->rui_refcount) > 0); if (atomic_dec_and_test(&ruip->rui_refcount)) { - xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); xfs_rui_item_free(ruip); } } @@ -122,17 +126,10 @@ xfs_rui_item_release( xfs_rui_release(RUI_ITEM(lip)); } -static const struct xfs_item_ops xfs_rui_item_ops = { - .iop_size = xfs_rui_item_size, - .iop_format = xfs_rui_item_format, - .iop_unpin = xfs_rui_item_unpin, - .iop_release = xfs_rui_item_release, -}; - /* * Allocate and initialize an rui item with the given number of extents. */ -struct xfs_rui_log_item * +STATIC struct xfs_rui_log_item * xfs_rui_init( struct xfs_mount *mp, uint nextents) @@ -160,7 +157,7 @@ xfs_rui_init( * RUI format structure. The RUI/RUD items were designed not to need any * special alignment handling. */ -int +STATIC int xfs_rui_copy_format( struct xfs_log_iovec *buf, struct xfs_rui_log_format *dst_rui_fmt) @@ -352,41 +349,16 @@ xfs_rmap_update_diff_items( XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock); } -/* Get an RUI. */ -STATIC void * -xfs_rmap_update_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_rui_log_item *ruip; - - ASSERT(tp != NULL); - ASSERT(count > 0); - - ruip = xfs_rui_init(tp->t_mountp, count); - ASSERT(ruip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &ruip->rui_item); - return ruip; -} - /* Log rmap updates in the intent item. */ STATIC void xfs_rmap_update_log_item( struct xfs_trans *tp, - void *intent, - struct list_head *item) + struct xfs_rui_log_item *ruip, + struct xfs_rmap_intent *rmap) { - struct xfs_rui_log_item *ruip = intent; - struct xfs_rmap_intent *rmap; uint next_extent; struct xfs_map_extent *map; - rmap = container_of(item, struct xfs_rmap_intent, ri_list); - tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); @@ -406,58 +378,64 @@ xfs_rmap_update_log_item( rmap->ri_bmap.br_state); } +static struct xfs_log_item * +xfs_rmap_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count); + struct xfs_rmap_intent *rmap; + + ASSERT(count > 0); + + xfs_trans_add_item(tp, &ruip->rui_item); + if (sort) + list_sort(mp, items, xfs_rmap_update_diff_items); + list_for_each_entry(rmap, items, ri_list) + xfs_rmap_update_log_item(tp, ruip, rmap); + return &ruip->rui_item; +} + /* Get an RUD so we can process all the deferred rmap updates. */ -STATIC void * +static struct xfs_log_item * xfs_rmap_update_create_done( struct xfs_trans *tp, - void *intent, + struct xfs_log_item *intent, unsigned int count) { - return xfs_trans_get_rud(tp, intent); + return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item; } /* Process a deferred rmap update. */ STATIC int xfs_rmap_update_finish_item( struct xfs_trans *tp, + struct xfs_log_item *done, struct list_head *item, - void *done_item, - void **state) + struct xfs_btree_cur **state) { struct xfs_rmap_intent *rmap; int error; rmap = container_of(item, struct xfs_rmap_intent, ri_list); - error = xfs_trans_log_finish_rmap_update(tp, done_item, - rmap->ri_type, - rmap->ri_owner, rmap->ri_whichfork, - rmap->ri_bmap.br_startoff, - rmap->ri_bmap.br_startblock, - rmap->ri_bmap.br_blockcount, - rmap->ri_bmap.br_state, - (struct xfs_btree_cur **)state); + error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), + rmap->ri_type, rmap->ri_owner, rmap->ri_whichfork, + rmap->ri_bmap.br_startoff, rmap->ri_bmap.br_startblock, + rmap->ri_bmap.br_blockcount, rmap->ri_bmap.br_state, + state); kmem_free(rmap); return error; } -/* Clean up after processing deferred rmaps. */ -STATIC void -xfs_rmap_update_finish_cleanup( - struct xfs_trans *tp, - void *state, - int error) -{ - struct xfs_btree_cur *rcur = state; - - xfs_rmap_finish_one_cleanup(tp, rcur, error); -} - /* Abort all pending RUIs. */ STATIC void xfs_rmap_update_abort_intent( - void *intent) + struct xfs_log_item *intent) { - xfs_rui_release(intent); + xfs_rui_release(RUI_ITEM(intent)); } /* Cancel a deferred rmap update. */ @@ -473,13 +451,11 @@ xfs_rmap_update_cancel_item( const struct xfs_defer_op_type xfs_rmap_update_defer_type = { .max_items = XFS_RUI_MAX_FAST_EXTENTS, - .diff_items = xfs_rmap_update_diff_items, .create_intent = xfs_rmap_update_create_intent, .abort_intent = xfs_rmap_update_abort_intent, - .log_item = xfs_rmap_update_log_item, .create_done = xfs_rmap_update_create_done, .finish_item = xfs_rmap_update_finish_item, - .finish_cleanup = xfs_rmap_update_finish_cleanup, + .finish_cleanup = xfs_rmap_finish_one_cleanup, .cancel_item = xfs_rmap_update_cancel_item, }; @@ -487,24 +463,24 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = { * Process an rmap update intent item that was recovered from the log. * We need to update the rmapbt. */ -int -xfs_rui_recover( - struct xfs_mount *mp, - struct xfs_rui_log_item *ruip) +STATIC int +xfs_rui_item_recover( + struct xfs_log_item *lip, + struct xfs_trans *parent_tp) { - int i; - int error = 0; + struct xfs_rui_log_item *ruip = RUI_ITEM(lip); struct xfs_map_extent *rmap; - xfs_fsblock_t startblock_fsb; - bool op_ok; struct xfs_rud_log_item *rudp; - enum xfs_rmap_intent_type type; - int whichfork; - xfs_exntst_t state; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - - ASSERT(!test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags)); + struct xfs_mount *mp = parent_tp->t_mountp; + xfs_fsblock_t startblock_fsb; + enum xfs_rmap_intent_type type; + xfs_exntst_t state; + bool op_ok; + int i; + int whichfork; + int error = 0; /* * First check the validity of the extents described by the @@ -539,7 +515,6 @@ xfs_rui_recover( * This will pull the RUI from the AIL and * free the memory associated with it. */ - set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags); xfs_rui_release(ruip); return -EFSCORRUPTED; } @@ -597,7 +572,6 @@ xfs_rui_recover( } xfs_rmap_finish_one_cleanup(tp, rcur, error); - set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags); error = xfs_trans_commit(tp); return error; @@ -606,3 +580,90 @@ abort_error: xfs_trans_cancel(tp); return error; } + +STATIC bool +xfs_rui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return RUI_ITEM(lip)->rui_format.rui_id == intent_id; +} + +static const struct xfs_item_ops xfs_rui_item_ops = { + .iop_size = xfs_rui_item_size, + .iop_format = xfs_rui_item_format, + .iop_unpin = xfs_rui_item_unpin, + .iop_release = xfs_rui_item_release, + .iop_recover = xfs_rui_item_recover, + .iop_match = xfs_rui_item_match, +}; + +/* + * This routine is called to create an in-core extent rmap update + * item from the rui format structure which was logged on disk. + * It allocates an in-core rui, copies the extents from the format + * structure into it, and adds the rui to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_rui_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_rui_log_item *ruip; + struct xfs_rui_log_format *rui_formatp; + + rui_formatp = item->ri_buf[0].i_addr; + + ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); + error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format); + if (error) { + xfs_rui_item_free(ruip); + return error; + } + atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); + /* + * Insert the intent into the AIL directly and drop one reference so + * that finishing or canceling the work will drop the other. + */ + xfs_trans_ail_insert(log->l_ailp, &ruip->rui_item, lsn); + xfs_rui_release(ruip); + return 0; +} + +const struct xlog_recover_item_ops xlog_rui_item_ops = { + .item_type = XFS_LI_RUI, + .commit_pass2 = xlog_recover_rui_commit_pass2, +}; + +/* + * This routine is called when an RUD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding RUI if it + * was still in the log. To do this it searches the AIL for the RUI with an id + * equal to that in the RUD format structure. If we find it we drop the RUD + * reference, which removes the RUI from the AIL and frees it. + */ +STATIC int +xlog_recover_rud_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_rud_log_format *rud_formatp; + + rud_formatp = item->ri_buf[0].i_addr; + ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format)); + + xlog_recover_release_intent(log, XFS_LI_RUI, rud_formatp->rud_rui_id); + return 0; +} + +const struct xlog_recover_item_ops xlog_rud_item_ops = { + .item_type = XFS_LI_RUD, + .commit_pass2 = xlog_recover_rud_commit_pass2, +}; diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h index 8708e4a5aa5c..31e6cdfff71f 100644 --- a/fs/xfs/xfs_rmap_item.h +++ b/fs/xfs/xfs_rmap_item.h @@ -36,11 +36,6 @@ struct kmem_zone; #define XFS_RUI_MAX_FAST_EXTENTS 16 /* - * Define RUI flag bits. Manipulated by set/clear/test_bit operators. - */ -#define XFS_RUI_RECOVERED 1 - -/* * This is the "rmap update intent" log item. It is used to log the fact that * some reverse mappings need to change. It is used in conjunction with the * "rmap update done" log item described below. @@ -52,7 +47,6 @@ struct xfs_rui_log_item { struct xfs_log_item rui_item; atomic_t rui_refcount; atomic_t rui_next_extent; - unsigned long rui_flags; /* misc flags */ struct xfs_rui_log_format rui_format; }; @@ -77,11 +71,4 @@ struct xfs_rud_log_item { extern struct kmem_zone *xfs_rui_zone; extern struct kmem_zone *xfs_rud_zone; -struct xfs_rui_log_item *xfs_rui_init(struct xfs_mount *, uint); -int xfs_rui_copy_format(struct xfs_log_iovec *buf, - struct xfs_rui_log_format *dst_rui_fmt); -void xfs_rui_item_free(struct xfs_rui_log_item *); -void xfs_rui_release(struct xfs_rui_log_item *); -int xfs_rui_recover(struct xfs_mount *mp, struct xfs_rui_log_item *ruip); - #endif /* __XFS_RMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index a123cd8267d9..23a517a68c4b 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -47,6 +47,39 @@ static struct kset *xfs_kset; /* top-level xfs sysfs dir */ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #endif +enum xfs_dax_mode { + XFS_DAX_INODE = 0, + XFS_DAX_ALWAYS = 1, + XFS_DAX_NEVER = 2, +}; + +static void +xfs_mount_set_dax_mode( + struct xfs_mount *mp, + enum xfs_dax_mode mode) +{ + switch (mode) { + case XFS_DAX_INODE: + mp->m_flags &= ~(XFS_MOUNT_DAX_ALWAYS | XFS_MOUNT_DAX_NEVER); + break; + case XFS_DAX_ALWAYS: + mp->m_flags |= XFS_MOUNT_DAX_ALWAYS; + mp->m_flags &= ~XFS_MOUNT_DAX_NEVER; + break; + case XFS_DAX_NEVER: + mp->m_flags |= XFS_MOUNT_DAX_NEVER; + mp->m_flags &= ~XFS_MOUNT_DAX_ALWAYS; + break; + } +} + +static const struct constant_table dax_param_enums[] = { + {"inode", XFS_DAX_INODE }, + {"always", XFS_DAX_ALWAYS }, + {"never", XFS_DAX_NEVER }, + {} +}; + /* * Table driven mount option parser. */ @@ -59,7 +92,7 @@ enum { Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, - Opt_discard, Opt_nodiscard, Opt_dax, + Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, }; static const struct fs_parameter_spec xfs_fs_parameters[] = { @@ -103,6 +136,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_flag("discard", Opt_discard), fsparam_flag("nodiscard", Opt_nodiscard), fsparam_flag("dax", Opt_dax), + fsparam_enum("dax", Opt_dax_enum, dax_param_enums), {} }; @@ -129,7 +163,8 @@ xfs_fs_show_options( { XFS_MOUNT_GRPID, ",grpid" }, { XFS_MOUNT_DISCARD, ",discard" }, { XFS_MOUNT_LARGEIO, ",largeio" }, - { XFS_MOUNT_DAX, ",dax" }, + { XFS_MOUNT_DAX_ALWAYS, ",dax=always" }, + { XFS_MOUNT_DAX_NEVER, ",dax=never" }, { 0, NULL } }; struct xfs_mount *mp = XFS_M(root->d_sb); @@ -772,7 +807,8 @@ xfs_fs_statfs( statp->f_blocks = sbp->sb_dblocks - lsize; spin_unlock(&mp->m_sb_lock); - statp->f_bfree = fdblocks - mp->m_alloc_set_aside; + /* make sure statp->f_bfree does not underflow */ + statp->f_bfree = max_t(int64_t, fdblocks - mp->m_alloc_set_aside, 0); statp->f_bavail = statp->f_bfree; fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree); @@ -838,8 +874,10 @@ xfs_restore_resvblks(struct xfs_mount *mp) * there is no log replay required to write the inodes to disk - this is the * primary difference between a sync and a quiesce. * - * Note: xfs_log_quiesce() stops background log work - the callers must ensure - * it is started again when appropriate. + * We cancel log work early here to ensure all transactions the log worker may + * run have finished before we clean up and log the superblock and write an + * unmount record. The unfreeze process is responsible for restarting the log + * worker correctly. */ void xfs_quiesce_attr( @@ -847,9 +885,7 @@ xfs_quiesce_attr( { int error = 0; - /* wait for all modifications to complete */ - while (atomic_read(&mp->m_active_trans) > 0) - delay(100); + cancel_delayed_work_sync(&mp->m_log->l_work); /* force the log to unpin objects from the now complete transactions */ xfs_log_force(mp, XFS_LOG_SYNC); @@ -863,12 +899,6 @@ xfs_quiesce_attr( if (error) xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " "Frozen image may not be consistent."); - /* - * Just warn here till VFS can correctly support - * read-only remount without racing. - */ - WARN_ON(atomic_read(&mp->m_active_trans) != 0); - xfs_log_quiesce(mp); } @@ -1261,7 +1291,10 @@ xfs_fc_parse_param( return 0; #ifdef CONFIG_FS_DAX case Opt_dax: - mp->m_flags |= XFS_MOUNT_DAX; + xfs_mount_set_dax_mode(mp, XFS_DAX_ALWAYS); + return 0; + case Opt_dax_enum: + xfs_mount_set_dax_mode(mp, result.uint_32); return 0; #endif default: @@ -1454,7 +1487,7 @@ xfs_fc_fill_super( if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) sb->s_flags |= SB_I_VERSION; - if (mp->m_flags & XFS_MOUNT_DAX) { + if (mp->m_flags & XFS_MOUNT_DAX_ALWAYS) { bool rtdev_is_dax = false, datadev_is_dax; xfs_warn(mp, @@ -1468,7 +1501,7 @@ xfs_fc_fill_super( if (!rtdev_is_dax && !datadev_is_dax) { xfs_alert(mp, "DAX unsupported by block device. Turning off DAX."); - mp->m_flags &= ~XFS_MOUNT_DAX; + xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER); } if (xfs_sb_version_hasreflink(&mp->m_sb)) { xfs_alert(mp, @@ -1754,7 +1787,6 @@ static int xfs_init_fs_context( INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); spin_lock_init(&mp->m_perag_lock); mutex_init(&mp->m_growlock); - atomic_set(&mp->m_active_trans, 0); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 13fb4b919648..8e88a7ca387e 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -243,8 +243,7 @@ xfs_symlink( */ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); - if (resblks) - resblks -= XFS_IALLOC_SPACE_RES(mp); + resblks -= XFS_IALLOC_SPACE_RES(mp); /* * If the symlink will fit into the inode, write it inline. */ @@ -252,7 +251,7 @@ xfs_symlink( xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen); ip->i_d.di_size = pathlen; - ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; + ip->i_df.if_format = XFS_DINODE_FMT_LOCAL; xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); } else { int offset; @@ -265,8 +264,7 @@ xfs_symlink( if (error) goto out_trans_cancel; - if (resblks) - resblks -= fs_blocks; + resblks -= fs_blocks; ip->i_d.di_size = pathlen; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -386,7 +384,7 @@ xfs_inactive_symlink_rmt( * either 1 or 2 extents and that we can * free them all in one bunmapi call. */ - ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); + ASSERT(ip->i_df.if_nextents > 0 && ip->i_df.if_nextents <= 2); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a4323a63438d..460136628a79 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1897,8 +1897,8 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->which = which; __entry->ino = ip->i_ino; - __entry->format = ip->i_d.di_format; - __entry->nex = ip->i_d.di_nextents; + __entry->format = ip->i_df.if_format; + __entry->nex = ip->i_df.if_nextents; __entry->broot_size = ip->i_df.if_broot_bytes; __entry->fork_off = XFS_IFORK_BOFF(ip); ), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 28b983ff8b11..3c94e5ff4316 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -68,7 +68,6 @@ xfs_trans_free( xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); trace_xfs_trans_free(tp, _RET_IP_); - atomic_dec(&tp->t_mountp->m_active_trans); if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) sb_end_intwrite(tp->t_mountp->m_super); xfs_trans_free_dqinfo(tp); @@ -125,8 +124,6 @@ xfs_trans_dup( xfs_defer_move(ntp, tp); xfs_trans_dup_dqinfo(tp, ntp); - - atomic_inc(&tp->t_mountp->m_active_trans); return ntp; } @@ -275,7 +272,6 @@ xfs_trans_alloc( */ WARN_ON(resp->tr_logres > 0 && mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); - atomic_inc(&mp->m_active_trans); tp->t_magic = XFS_TRANS_HEADER_MAGIC; tp->t_flags = flags; @@ -299,20 +295,19 @@ xfs_trans_alloc( /* * Create an empty transaction with no reservation. This is a defensive - * mechanism for routines that query metadata without actually modifying - * them -- if the metadata being queried is somehow cross-linked (think a - * btree block pointer that points higher in the tree), we risk deadlock. - * However, blocks grabbed as part of a transaction can be re-grabbed. - * The verifiers will notice the corrupt block and the operation will fail - * back to userspace without deadlocking. + * mechanism for routines that query metadata without actually modifying them -- + * if the metadata being queried is somehow cross-linked (think a btree block + * pointer that points higher in the tree), we risk deadlock. However, blocks + * grabbed as part of a transaction can be re-grabbed. The verifiers will + * notice the corrupt block and the operation will fail back to userspace + * without deadlocking. * - * Note the zero-length reservation; this transaction MUST be cancelled - * without any dirty data. + * Note the zero-length reservation; this transaction MUST be cancelled without + * any dirty data. * - * Callers should obtain freeze protection to avoid two conflicts with fs - * freezing: (1) having active transactions trip the m_active_trans ASSERTs; - * and (2) grabbing buffers at the same time that freeze is trying to drain - * the buffer LRU list. + * Callers should obtain freeze protection to avoid a conflict with fs freezing + * where we can be grabbing buffers at the same time that freeze is trying to + * drain the buffer LRU list. */ int xfs_trans_alloc_empty( @@ -534,57 +529,9 @@ xfs_trans_apply_sb_deltas( sizeof(sbp->sb_frextents) - 1); } -STATIC int -xfs_sb_mod8( - uint8_t *field, - int8_t delta) -{ - int8_t counter = *field; - - counter += delta; - if (counter < 0) { - ASSERT(0); - return -EINVAL; - } - *field = counter; - return 0; -} - -STATIC int -xfs_sb_mod32( - uint32_t *field, - int32_t delta) -{ - int32_t counter = *field; - - counter += delta; - if (counter < 0) { - ASSERT(0); - return -EINVAL; - } - *field = counter; - return 0; -} - -STATIC int -xfs_sb_mod64( - uint64_t *field, - int64_t delta) -{ - int64_t counter = *field; - - counter += delta; - if (counter < 0) { - ASSERT(0); - return -EINVAL; - } - *field = counter; - return 0; -} - /* - * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations - * and apply superblock counter changes to the in-core superblock. The + * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations and + * apply superblock counter changes to the in-core superblock. The * t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT * applied to the in-core superblock. The idea is that that has already been * done. @@ -593,7 +540,12 @@ xfs_sb_mod64( * used block counts are not updated in the on disk superblock. In this case, * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we * still need to update the incore superblock with the changes. + * + * Deltas for the inode count are +/-64, hence we use a large batch size of 128 + * so we don't need to take the counter lock on every update. */ +#define XFS_ICOUNT_BATCH 128 + void xfs_trans_unreserve_and_mod_sb( struct xfs_trans *tp) @@ -629,20 +581,21 @@ xfs_trans_unreserve_and_mod_sb( /* apply the per-cpu counters */ if (blkdelta) { error = xfs_mod_fdblocks(mp, blkdelta, rsvd); - if (error) - goto out; + ASSERT(!error); } if (idelta) { - error = xfs_mod_icount(mp, idelta); - if (error) - goto out_undo_fdblocks; + percpu_counter_add_batch(&mp->m_icount, idelta, + XFS_ICOUNT_BATCH); + if (idelta < 0) + ASSERT(__percpu_counter_compare(&mp->m_icount, 0, + XFS_ICOUNT_BATCH) >= 0); } if (ifreedelta) { - error = xfs_mod_ifree(mp, ifreedelta); - if (error) - goto out_undo_icount; + percpu_counter_add(&mp->m_ifree, ifreedelta); + if (ifreedelta < 0) + ASSERT(percpu_counter_compare(&mp->m_ifree, 0) >= 0); } if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY)) @@ -650,95 +603,23 @@ xfs_trans_unreserve_and_mod_sb( /* apply remaining deltas */ spin_lock(&mp->m_sb_lock); - if (rtxdelta) { - error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta); - if (error) - goto out_undo_ifree; - } - - if (tp->t_dblocks_delta != 0) { - error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta); - if (error) - goto out_undo_frextents; - } - if (tp->t_agcount_delta != 0) { - error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta); - if (error) - goto out_undo_dblocks; - } - if (tp->t_imaxpct_delta != 0) { - error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta); - if (error) - goto out_undo_agcount; - } - if (tp->t_rextsize_delta != 0) { - error = xfs_sb_mod32(&mp->m_sb.sb_rextsize, - tp->t_rextsize_delta); - if (error) - goto out_undo_imaxpct; - } - if (tp->t_rbmblocks_delta != 0) { - error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, - tp->t_rbmblocks_delta); - if (error) - goto out_undo_rextsize; - } - if (tp->t_rblocks_delta != 0) { - error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta); - if (error) - goto out_undo_rbmblocks; - } - if (tp->t_rextents_delta != 0) { - error = xfs_sb_mod64(&mp->m_sb.sb_rextents, - tp->t_rextents_delta); - if (error) - goto out_undo_rblocks; - } - if (tp->t_rextslog_delta != 0) { - error = xfs_sb_mod8(&mp->m_sb.sb_rextslog, - tp->t_rextslog_delta); - if (error) - goto out_undo_rextents; - } + mp->m_sb.sb_frextents += rtxdelta; + mp->m_sb.sb_dblocks += tp->t_dblocks_delta; + mp->m_sb.sb_agcount += tp->t_agcount_delta; + mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta; + mp->m_sb.sb_rextsize += tp->t_rextsize_delta; + mp->m_sb.sb_rbmblocks += tp->t_rbmblocks_delta; + mp->m_sb.sb_rblocks += tp->t_rblocks_delta; + mp->m_sb.sb_rextents += tp->t_rextents_delta; + mp->m_sb.sb_rextslog += tp->t_rextslog_delta; spin_unlock(&mp->m_sb_lock); - return; -out_undo_rextents: - if (tp->t_rextents_delta) - xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta); -out_undo_rblocks: - if (tp->t_rblocks_delta) - xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta); -out_undo_rbmblocks: - if (tp->t_rbmblocks_delta) - xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta); -out_undo_rextsize: - if (tp->t_rextsize_delta) - xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta); -out_undo_imaxpct: - if (tp->t_rextsize_delta) - xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta); -out_undo_agcount: - if (tp->t_agcount_delta) - xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta); -out_undo_dblocks: - if (tp->t_dblocks_delta) - xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta); -out_undo_frextents: - if (rtxdelta) - xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta); -out_undo_ifree: - spin_unlock(&mp->m_sb_lock); - if (ifreedelta) - xfs_mod_ifree(mp, -ifreedelta); -out_undo_icount: - if (idelta) - xfs_mod_icount(mp, -idelta); -out_undo_fdblocks: - if (blkdelta) - xfs_mod_fdblocks(mp, -blkdelta, rsvd); -out: - ASSERT(error == 0); + /* + * Debug checks outside of the spinlock so they don't lock up the + * machine if they fail. + */ + ASSERT(mp->m_sb.sb_imax_pct >= 0); + ASSERT(mp->m_sb.sb_rextslog >= 0); return; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 752c7fef9de7..8308bf6d7e40 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -59,12 +59,14 @@ struct xfs_log_item { #define XFS_LI_ABORTED 1 #define XFS_LI_FAILED 2 #define XFS_LI_DIRTY 3 /* log item dirty in transaction */ +#define XFS_LI_RECOVERED 4 /* log intent item has been recovered */ #define XFS_LI_FLAGS \ { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \ { (1 << XFS_LI_ABORTED), "ABORTED" }, \ { (1 << XFS_LI_FAILED), "FAILED" }, \ - { (1 << XFS_LI_DIRTY), "DIRTY" } + { (1 << XFS_LI_DIRTY), "DIRTY" }, \ + { (1 << XFS_LI_RECOVERED), "RECOVERED" } struct xfs_item_ops { unsigned flags; @@ -77,6 +79,8 @@ struct xfs_item_ops { void (*iop_release)(struct xfs_log_item *); xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); void (*iop_error)(struct xfs_log_item *, xfs_buf_t *); + int (*iop_recover)(struct xfs_log_item *lip, struct xfs_trans *tp); + bool (*iop_match)(struct xfs_log_item *item, uint64_t id); }; /* diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 564253550b75..ac5019361a13 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -345,6 +345,45 @@ xfs_ail_delete( xfs_trans_ail_cursor_clear(ailp, lip); } +/* + * Requeue a failed buffer for writeback. + * + * We clear the log item failed state here as well, but we have to be careful + * about reference counts because the only active reference counts on the buffer + * may be the failed log items. Hence if we clear the log item failed state + * before queuing the buffer for IO we can release all active references to + * the buffer and free it, leading to use after free problems in + * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which + * order we process them in - the buffer is locked, and we own the buffer list + * so nothing on them is going to change while we are performing this action. + * + * Hence we can safely queue the buffer for IO before we clear the failed log + * item state, therefore always having an active reference to the buffer and + * avoiding the transient zero-reference state that leads to use-after-free. + */ +static inline int +xfsaild_resubmit_item( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + struct xfs_buf *bp = lip->li_buf; + + if (!xfs_buf_trylock(bp)) + return XFS_ITEM_LOCKED; + + if (!xfs_buf_delwri_queue(bp, buffer_list)) { + xfs_buf_unlock(bp); + return XFS_ITEM_FLUSHING; + } + + /* protected by ail_lock */ + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) + xfs_clear_li_failed(lip); + + xfs_buf_unlock(bp); + return XFS_ITEM_SUCCESS; +} + static inline uint xfsaild_push_item( struct xfs_ail *ailp, @@ -365,6 +404,8 @@ xfsaild_push_item( */ if (!lip->li_ops->iop_push) return XFS_ITEM_PINNED; + if (test_bit(XFS_LI_FAILED, &lip->li_flags)) + return xfsaild_resubmit_item(lip, &ailp->ail_buf_list); return lip->li_ops->iop_push(lip, &ailp->ail_buf_list); } @@ -774,6 +815,17 @@ xfs_trans_ail_update_bulk( xfs_ail_update_finish(ailp, tail_lsn); } +/* Insert a log item into the AIL. */ +void +xfs_trans_ail_insert( + struct xfs_ail *ailp, + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + spin_lock(&ailp->ail_lock); + xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); +} + /* * Delete one log item from the AIL. * @@ -800,39 +852,19 @@ xfs_ail_delete_one( return 0; } -/** - * Remove a log items from the AIL - * - * @xfs_trans_ail_delete_bulk takes an array of log items that all need to - * removed from the AIL. The caller is already holding the AIL lock, and done - * all the checks necessary to ensure the items passed in via @log_items are - * ready for deletion. This includes checking that the items are in the AIL. - * - * For each log item to be removed, unlink it from the AIL, clear the IN_AIL - * flag from the item and reset the item's lsn to 0. If we remove the first - * item in the AIL, update the log tail to match the new minimum LSN in the - * AIL. - * - * This function will not drop the AIL lock until all items are removed from - * the AIL to minimise the amount of lock traffic on the AIL. This does not - * greatly increase the AIL hold time, but does significantly reduce the amount - * of traffic on the lock, especially during IO completion. - * - * This function must be called with the AIL lock held. The lock is dropped - * before returning. - */ void xfs_trans_ail_delete( - struct xfs_ail *ailp, struct xfs_log_item *lip, int shutdown_type) { + struct xfs_ail *ailp = lip->li_ailp; struct xfs_mount *mp = ailp->ail_mount; xfs_lsn_t tail_lsn; + spin_lock(&ailp->ail_lock); if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); - if (!XFS_FORCED_SHUTDOWN(mp)) { + if (shutdown_type && !XFS_FORCED_SHUTDOWN(mp)) { xfs_alert_tag(mp, XFS_PTAG_AILDELETE, "%s: attempting to delete a log item that is not in the AIL", __func__); @@ -841,6 +873,7 @@ xfs_trans_ail_delete( return; } + /* xfs_ail_update_finish() drops the AIL lock */ tail_lsn = xfs_ail_delete_one(ailp, lip); xfs_ail_update_finish(ailp, tail_lsn); } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index d1b9869bc5fa..c0f73b82c055 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -388,7 +388,7 @@ xfs_trans_apply_dquot_deltas( */ if (d->d_id) { xfs_qm_adjust_dqlimits(tp->t_mountp, dqp); - xfs_qm_adjust_dqtimers(tp->t_mountp, d); + xfs_qm_adjust_dqtimers(tp->t_mountp, dqp); } dqp->dq_flags |= XFS_DQ_DIRTY; @@ -591,7 +591,7 @@ xfs_trans_dqresv( xfs_dqlock(dqp); - defq = xfs_get_defquota(dqp, q); + defq = xfs_get_defquota(q, xfs_dquot_type(dqp)); if (flags & XFS_TRANS_DQ_RES_BLKS) { hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit); @@ -602,7 +602,7 @@ xfs_trans_dqresv( softlimit = defq->bsoftlimit; timer = be32_to_cpu(dqp->q_core.d_btimer); warns = be16_to_cpu(dqp->q_core.d_bwarns); - warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit; + warnlimit = defq->bwarnlimit; resbcountp = &dqp->q_res_bcount; } else { ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); @@ -614,7 +614,7 @@ xfs_trans_dqresv( softlimit = defq->rtbsoftlimit; timer = be32_to_cpu(dqp->q_core.d_rtbtimer); warns = be16_to_cpu(dqp->q_core.d_rtbwarns); - warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit; + warnlimit = defq->rtbwarnlimit; resbcountp = &dqp->q_res_rtbcount; } @@ -650,7 +650,7 @@ xfs_trans_dqresv( total_count = be64_to_cpu(dqp->q_core.d_icount) + ninos; timer = be32_to_cpu(dqp->q_core.d_itimer); warns = be16_to_cpu(dqp->q_core.d_iwarns); - warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit; + warnlimit = defq->iwarnlimit; hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); if (!hardlimit) hardlimit = defq->ihardlimit; @@ -711,7 +711,7 @@ xfs_trans_dqresv( error_return: xfs_dqunlock(dqp); - if (flags & XFS_QMOPT_ENOSPC) + if (XFS_QM_ISPDQ(dqp)) return -ENOSPC; return -EDQUOT; } @@ -751,8 +751,7 @@ xfs_trans_reserve_quota_bydquots( ASSERT(flags & XFS_QMOPT_RESBLK_MASK); if (udqp) { - error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, - (flags & ~XFS_QMOPT_ENOSPC)); + error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, flags); if (error) return error; } @@ -803,16 +802,12 @@ xfs_trans_reserve_quota_nblks( if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0; - if (XFS_IS_PQUOTA_ON(mp)) - flags |= XFS_QMOPT_ENOSPC; ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == - XFS_TRANS_DQ_RES_RTBLKS || - (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == - XFS_TRANS_DQ_RES_BLKS); + ASSERT((flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_RTBLKS || + (flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_BLKS); /* * Reserve nblks against these dquots, with trans as the mediator. diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 35655eac01a6..3004aeac9110 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -91,26 +91,13 @@ xfs_trans_ail_update( xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } +void xfs_trans_ail_insert(struct xfs_ail *ailp, struct xfs_log_item *lip, + xfs_lsn_t lsn); + xfs_lsn_t xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn) __releases(ailp->ail_lock); -void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, - int shutdown_type); - -static inline void -xfs_trans_ail_remove( - struct xfs_log_item *lip, - int shutdown_type) -{ - struct xfs_ail *ailp = lip->li_ailp; - - spin_lock(&ailp->ail_lock); - /* xfs_trans_ail_delete() drops the AIL lock */ - if (test_bit(XFS_LI_IN_AIL, &lip->li_flags)) - xfs_trans_ail_delete(ailp, lip, shutdown_type); - else - spin_unlock(&ailp->ail_lock); -} +void xfs_trans_ail_delete(struct xfs_log_item *lip, int shutdown_type); void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); void xfs_ail_push_all(struct xfs_ail *); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index fc5d7276026e..bca48b308c02 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -12,7 +12,6 @@ #include "xfs_inode.h" #include "xfs_attr.h" #include "xfs_acl.h" -#include "xfs_da_format.h" #include "xfs_da_btree.h" #include <linux/posix_acl_xattr.h> |