Diffstat (limited to 'fs')
184 files changed, 24668 insertions, 2805 deletions
diff --git a/fs/read_write.c b/fs/read_write.c
index 2115d1f40bd5..0f862c0ee1d4 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1667,6 +1667,7 @@ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
 	return 0;
 }
+EXPORT_SYMBOL_GPL(generic_write_check_limits);
 
 /* Like generic_write_checks(), but takes size of write instead of iter. */
 int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
diff --git a/fs/remap_range.c b/fs/remap_range.c
index de07f978ce3e..28246dfc8485 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -99,8 +99,7 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in,
 	return 0;
 }
 
-static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
-			     bool write)
+int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write)
 {
 	int mask = write ? MAY_WRITE : MAY_READ;
 	loff_t tmp;
@@ -118,6 +117,7 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
 
 	return fsnotify_file_area_perm(file, mask, &pos, len);
 }
+EXPORT_SYMBOL_GPL(remap_verify_area);
 
 /*
  * Ensure that we don't remap a partial EOF block in the middle of something
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index c5a35e32adf0..c50447548d65 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -34,6 +34,7 @@ xfs-y += $(addprefix libxfs/, \
 	xfs_dir2_node.o \
 	xfs_dir2_sf.o \
 	xfs_dquot_buf.o \
+	xfs_exchmaps.o \
 	xfs_ialloc.o \
 	xfs_ialloc_btree.o \
 	xfs_iext_tree.o \
@@ -41,6 +42,7 @@ xfs-y += $(addprefix libxfs/, \
 	xfs_inode_buf.o \
 	xfs_log_rlimit.o \
 	xfs_ag_resv.o \
+	xfs_parent.o \
 	xfs_rmap.o \
 	xfs_rmap_btree.o \
 	xfs_refcount.o \
@@ -49,6 +51,7 @@ xfs-y += $(addprefix libxfs/, \
 	xfs_symlink_remote.o \
 	xfs_trans_inode.o \
 	xfs_trans_resv.o \
+	xfs_trans_space.o \
 	xfs_types.o \
 	)
 # xfs_rtbitmap is shared with libxfs
@@ -67,6 +70,7 @@ xfs-y += xfs_aops.o \
 	xfs_dir2_readdir.o \
 	xfs_discard.o \
 	xfs_error.o \
+	xfs_exchrange.o \
 	xfs_export.o \
 	xfs_extent_busy.o \
 	xfs_file.o \
@@ -74,6 +78,7 @@ xfs-y += xfs_aops.o \
 	xfs_fsmap.o \
 	xfs_fsops.o \
 	xfs_globals.o \
+	xfs_handle.o \
 	xfs_health.o \
 	xfs_icache.o \
 	xfs_ioctl.o \
@@ -101,6 +106,7 @@ xfs-y += xfs_log.o \
 	xfs_buf_item.o \
 	xfs_buf_item_recover.o \
 	xfs_dquot_item_recover.o \
+	xfs_exchmaps_item.o \
 	xfs_extfree_item.o \
 	xfs_attr_item.o \
 	xfs_icreate_item.o \
@@ -157,11 +163,13 @@ xfs-y += $(addprefix scrub/, \
 	common.o \
 	dabtree.o \
 	dir.o \
+	dirtree.o \
 	fscounters.o \
 	health.o \
 	ialloc.o \
 	inode.o \
 	iscan.o \
+	listxattr.o \
 	nlinks.o \
 	parent.o \
 	readdir.o \
@@ -170,6 +178,7 @@ xfs-y += $(addprefix scrub/, \
 	scrub.o \
 	symlink.o \
 	xfarray.o \
+	xfblob.o \
 	xfile.o \
 	)
 
@@ -191,23 +200,32 @@ ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
 xfs-y += $(addprefix scrub/, \
 	agheader_repair.o \
 	alloc_repair.o \
+	attr_repair.o \
 	bmap_repair.o \
 	cow_repair.o \
+	dir_repair.o \
+	dirtree_repair.o \
+	findparent.o \
 	fscounters_repair.o \
 	ialloc_repair.o \
 	inode_repair.o \
 	newbt.o \
 	nlinks_repair.o \
+	orphanage.o \
+	parent_repair.o \
 	rcbag_btree.o \
 	rcbag.o \
 	reap.o \
 	refcount_repair.o \
 	repair.o \
 	rmap_repair.o \
+	symlink_repair.o \
+	tempfile.o \
 	)
 
 xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
 	rtbitmap_repair.o \
+	rtsummary_repair.o \
 	)
 
 xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index dc1873f76bff..240e079cb3fb 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -194,7 +194,7 @@ xfs_initialize_perag_data(
 		pag = xfs_perag_get(mp, index);
 		error = xfs_alloc_read_agf(pag, NULL, 0, NULL);
 		if (!error)
-			error = xfs_ialloc_read_agi(pag, NULL, NULL);
+			error = xfs_ialloc_read_agi(pag, NULL, 0, NULL);
 		if (error) {
 			xfs_perag_put(pag);
 			return error;
@@ -931,7 +931,7 @@ xfs_ag_shrink_space(
 	int			error, err2;
 
 	ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1);
-	error = xfs_ialloc_read_agi(pag, *tpp, &agibp);
+	error = xfs_ialloc_read_agi(pag, *tpp, 0, &agibp);
 	if (error)
 		return error;
 
@@ -963,9 +963,7 @@ xfs_ag_shrink_space(
 	 * Disable perag reservations so it doesn't cause the allocation request
 	 * to fail. We'll reestablish reservation before we return.
 	 */
-	error = xfs_ag_resv_free(pag);
-	if (error)
-		return error;
+	xfs_ag_resv_free(pag);
 
 	/* internal log shouldn't also show up in the free space btrees */
 	error = xfs_alloc_vextent_exact_bno(&args,
@@ -1062,7 +1060,7 @@ xfs_ag_extend_space(
 
 	ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1);
 
-	error = xfs_ialloc_read_agi(pag, tp, &bp);
+	error = xfs_ialloc_read_agi(pag, tp, 0, &bp);
 	if (error)
 		return error;
 
@@ -1119,7 +1117,7 @@ xfs_ag_get_geometry(
 	int			error;
 
 	/* Lock the AG headers. */
-	error = xfs_ialloc_read_agi(pag, NULL, &agi_bp);
+	error = xfs_ialloc_read_agi(pag, NULL, 0, &agi_bp);
 	if (error)
 		return error;
 	error = xfs_alloc_read_agf(pag, NULL, 0, &agf_bp);
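Every xfs_ialloc_read_agi() call above gains a third argument. A minimal sketch of the assumed updated prototype, inferred only from the call sites in this hunk (the meaning of nonzero flags is an assumption, not shown in the diff):

```c
/*
 * Assumed shape after this series: a flags word inserted before the
 * output buffer pointer.  Every caller in this file passes 0, i.e. the
 * pre-existing read behavior.
 */
int xfs_ialloc_read_agi(struct xfs_perag *pag, struct xfs_trans *tp,
		int flags, struct xfs_buf **agibpp);
```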
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index da1057bd0e60..216423df939e 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -126,14 +126,13 @@ xfs_ag_resv_needed(
 }
 
 /* Clean out a reservation */
-static int
+static void
 __xfs_ag_resv_free(
 	struct xfs_perag	*pag,
 	enum xfs_ag_resv_type	type)
 {
 	struct xfs_ag_resv	*resv;
 	xfs_extlen_t		oldresv;
-	int			error;
 
 	trace_xfs_ag_resv_free(pag, type, 0);
 
@@ -149,30 +148,19 @@ __xfs_ag_resv_free(
 		oldresv = resv->ar_orig_reserved;
 	else
 		oldresv = resv->ar_reserved;
-	error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
+	xfs_add_fdblocks(pag->pag_mount, oldresv);
 	resv->ar_reserved = 0;
 	resv->ar_asked = 0;
 	resv->ar_orig_reserved = 0;
-
-	if (error)
-		trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
-				error, _RET_IP_);
-	return error;
 }
 
 /* Free a per-AG reservation. */
-int
+void
 xfs_ag_resv_free(
 	struct xfs_perag	*pag)
 {
-	int			error;
-	int			err2;
-
-	error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
-	err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
-	if (err2 && !error)
-		error = err2;
-	return error;
+	__xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
+	__xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
 }
 
 static int
@@ -216,7 +204,7 @@ __xfs_ag_resv_init(
 	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL))
 		error = -ENOSPC;
 	else
-		error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
+		error = xfs_dec_fdblocks(mp, hidden_space, true);
 	if (error) {
 		trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
 				error, _RET_IP_);
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h
index b74b210008ea..ff20ed93de77 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.h
+++ b/fs/xfs/libxfs/xfs_ag_resv.h
@@ -6,7 +6,7 @@
 #ifndef __XFS_AG_RESV_H__
 #define __XFS_AG_RESV_H__
 
-int xfs_ag_resv_free(struct xfs_perag *pag);
+void xfs_ag_resv_free(struct xfs_perag *pag);
 int xfs_ag_resv_init(struct xfs_perag *pag, struct xfs_trans *tp);
 
 bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 9da52e92172a..6cb8b2ddc541 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -79,7 +79,7 @@ xfs_prealloc_blocks(
 }
 
 /*
- * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
+ * The number of blocks per AG that we withhold from xfs_dec_fdblocks to
  * guarantee that we can refill the AGFL prior to allocating space in a nearly
  * full AG.  Although the space described by the free space btrees, the
  * blocks used by the freesp btrees themselves, and the blocks owned by the
@@ -89,7 +89,7 @@ xfs_prealloc_blocks(
  * until the fs goes down, we subtract this many AG blocks from the incore
  * fdblocks to ensure user allocation does not overcommit the space the
  * filesystem needs for the AGFLs.  The rmap btree uses a per-AG reservation to
- * withhold space from xfs_mod_fdblocks, so we do not account for that here.
+ * withhold space from xfs_dec_fdblocks, so we do not account for that here.
  */
 #define XFS_ALLOCBT_AGFL_RESERVE	4
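The xfs_mod_fdblocks() calls above are replaced by two directional helpers. A hedged sketch of their assumed prototypes, inferred from how the converted call sites use them rather than copied from this diff:

```c
/* Return blocks to the free pool; undoing a reservation cannot fail. */
void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta);

/*
 * Take blocks from the free pool; may fail with -ENOSPC.  @rsvd selects
 * whether the caller is allowed to dip into the reserved block pool.
 */
int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta, bool rsvd);
```

Splitting the old signed-delta API this way is what lets __xfs_ag_resv_free() and xfs_ag_resv_free() above drop their error handling entirely.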
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 673a4b6d2e8d..430cd3244c14 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -26,6 +26,7 @@
 #include "xfs_trace.h"
 #include "xfs_attr_item.h"
 #include "xfs_xattr.h"
+#include "xfs_parent.h"
 
 struct kmem_cache	*xfs_attr_intent_cache;
 
@@ -87,6 +88,8 @@ xfs_attr_is_leaf(
 	struct xfs_iext_cursor	icur;
 	struct xfs_bmbt_irec	imap;
 
+	ASSERT(!xfs_need_iread_extents(ifp));
+
 	if (ifp->if_nextents != 1 || ifp->if_format != XFS_DINODE_FMT_EXTENTS)
 		return false;
 
@@ -224,11 +227,21 @@ int
 xfs_attr_get_ilocked(
 	struct xfs_da_args	*args)
 {
+	int			error;
+
 	xfs_assert_ilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
 
 	if (!xfs_inode_hasattr(args->dp))
 		return -ENOATTR;
 
+	/*
+	 * The incore attr fork iext tree must be loaded for xfs_attr_is_leaf
+	 * to work correctly.
+	 */
+	error = xfs_iread_extents(args->trans, args->dp, XFS_ATTR_FORK);
+	if (error)
+		return error;
+
 	if (args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
 		return xfs_attr_shortform_getvalue(args);
 	if (xfs_attr_is_leaf(args->dp))
@@ -264,9 +277,11 @@ xfs_attr_get(
 	if (xfs_is_shutdown(args->dp->i_mount))
 		return -EIO;
 
+	if (!args->owner)
+		args->owner = args->dp->i_ino;
 	args->geo = args->dp->i_mount->m_attr_geo;
 	args->whichfork = XFS_ATTR_FORK;
-	args->hashval = xfs_da_hashname(args->name, args->namelen);
+	xfs_attr_sethash(args);
 
 	/* Entirely possible to look up a name which doesn't exist */
 	args->op_flags = XFS_DA_OP_OKNOENT;
@@ -363,7 +378,7 @@ xfs_attr_try_sf_addname(
 	 * Commit the shortform mods, and we're done.
 	 * NOTE: this is also the error path (EEXIST, etc).
	 */
-	if (!error && !(args->op_flags & XFS_DA_OP_NOTIME))
+	if (!error)
 		xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
 
 	if (xfs_has_wsync(dp->i_mount))
@@ -401,6 +416,50 @@ out:
 	return error;
 }
 
+/* Compute the hash value for a user/root/secure extended attribute */
+xfs_dahash_t
+xfs_attr_hashname(
+	const uint8_t		*name,
+	int			namelen)
+{
+	return xfs_da_hashname(name, namelen);
+}
+
+/* Compute the hash value for any extended attribute from any namespace. */
+xfs_dahash_t
+xfs_attr_hashval(
+	struct xfs_mount	*mp,
+	unsigned int		attr_flags,
+	const uint8_t		*name,
+	int			namelen,
+	const void		*value,
+	int			valuelen)
+{
+	ASSERT(xfs_attr_check_namespace(attr_flags));
+
+	if (attr_flags & XFS_ATTR_PARENT)
+		return xfs_parent_hashattr(mp, name, namelen, value, valuelen);
+
+	return xfs_attr_hashname(name, namelen);
+}
+
+/*
+ * PPTR_REPLACE operations require the caller to set the old and new names and
+ * values explicitly. Update the canonical fields to the new name and value
+ * here now that the removal phase has finished.
+ */
+static void
+xfs_attr_update_pptr_replace_args(
	struct xfs_da_args	*args)
+{
+	ASSERT(args->new_namelen > 0);
+	args->name = args->new_name;
+	args->namelen = args->new_namelen;
+	args->value = args->new_value;
+	args->valuelen = args->new_valuelen;
+	xfs_attr_sethash(args);
+}
+
 /*
  * Handle the state change on completion of a multi-state attr operation.
  *
@@ -418,14 +477,15 @@ xfs_attr_complete_op(
 	enum xfs_delattr_state	replace_state)
 {
 	struct xfs_da_args	*args = attr->xattri_da_args;
-	bool			do_replace = args->op_flags & XFS_DA_OP_REPLACE;
+
+	if (!(args->op_flags & XFS_DA_OP_REPLACE))
+		replace_state = XFS_DAS_DONE;
+	else if (xfs_attr_intent_op(attr) == XFS_ATTRI_OP_FLAGS_PPTR_REPLACE)
+		xfs_attr_update_pptr_replace_args(args);
 
 	args->op_flags &= ~XFS_DA_OP_REPLACE;
 	args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
-	if (do_replace)
-		return replace_state;
-
-	return XFS_DAS_DONE;
+	return replace_state;
 }
 
 static int
@@ -647,8 +707,8 @@ xfs_attr_leaf_remove_attr(
 	int			forkoff;
 	int			error;
 
-	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
-				    &bp);
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner,
+			args->blkno, &bp);
 	if (error)
 		return error;
 
@@ -679,7 +739,7 @@ xfs_attr_leaf_shrink(
 	if (!xfs_attr_is_leaf(dp))
 		return 0;
 
-	error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, &bp);
 	if (error)
 		return error;
 
@@ -868,6 +928,11 @@ xfs_attr_lookup(
 		return -ENOATTR;
 	}
 
+	/* Prerequisite for xfs_attr_is_leaf */
+	error = xfs_iread_extents(args->trans, args->dp, XFS_ATTR_FORK);
+	if (error)
+		return error;
+
 	if (xfs_attr_is_leaf(dp)) {
 		error = xfs_attr_leaf_hasname(args, &bp);
 
@@ -883,74 +948,72 @@ xfs_attr_lookup(
 	return error;
 }
 
-static void
-xfs_attr_defer_add(
-	struct xfs_da_args	*args,
-	unsigned int		op_flags)
+int
+xfs_attr_add_fork(
+	struct xfs_inode	*ip,		/* incore inode pointer */
+	int			size,		/* space new attribute needs */
+	int			rsvd)		/* xact may use reserved blks */
 {
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;		/* transaction pointer */
+	unsigned int		blks;		/* space reservation */
+	int			error;		/* error return value */
 
-	struct xfs_attr_intent	*new;
+	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
 
-	new = kmem_cache_zalloc(xfs_attr_intent_cache,
-			GFP_KERNEL | __GFP_NOFAIL);
-	new->xattri_op_flags = op_flags;
-	new->xattri_da_args = args;
+	blks = XFS_ADDAFORK_SPACE_RES(mp);
 
-	switch (op_flags) {
-	case XFS_ATTRI_OP_FLAGS_SET:
-		new->xattri_dela_state = xfs_attr_init_add_state(args);
-		break;
-	case XFS_ATTRI_OP_FLAGS_REPLACE:
-		new->xattri_dela_state = xfs_attr_init_replace_state(args);
-		break;
-	case XFS_ATTRI_OP_FLAGS_REMOVE:
-		new->xattri_dela_state = xfs_attr_init_remove_state(args);
-		break;
-	default:
-		ASSERT(0);
-	}
+	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0,
+			rsvd, &tp);
+	if (error)
+		return error;
+
+	if (xfs_inode_has_attr_fork(ip))
+		goto trans_cancel;
+
+	error = xfs_bmap_add_attrfork(tp, ip, size, rsvd);
+	if (error)
+		goto trans_cancel;
+
+	error = xfs_trans_commit(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
 
-	xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type);
-	trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp);
+trans_cancel:
+	xfs_trans_cancel(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
 }
 
 /*
- * Note: If args->value is NULL the attribute will be removed, just like the
- * Linux ->setattr API.
+ * Make a change to the xattr structure.
+ *
+ * The caller must have initialized @args, attached dquots, and must not hold
+ * any ILOCKs.  Reserved data blocks may be used if @rsvd is set.
+ *
+ * Returns -EEXIST for XFS_ATTRUPDATE_CREATE if the name already exists.
+ * Returns -ENOATTR for XFS_ATTRUPDATE_REMOVE if the name does not exist.
+ * Returns 0 on success, or a negative errno if something else went wrong.
 */
 int
 xfs_attr_set(
-	struct xfs_da_args	*args)
+	struct xfs_da_args	*args,
+	enum xfs_attr_update	op,
+	bool			rsvd)
 {
 	struct xfs_inode	*dp = args->dp;
 	struct xfs_mount	*mp = dp->i_mount;
 	struct xfs_trans_res	tres;
-	bool			rsvd = (args->attr_filter & XFS_ATTR_ROOT);
 	int			error, local;
 	int			rmt_blks = 0;
 	unsigned int		total;
 
-	if (xfs_is_shutdown(dp->i_mount))
-		return -EIO;
-
-	error = xfs_qm_dqattach(dp);
-	if (error)
-		return error;
-
-	args->geo = mp->m_attr_geo;
-	args->whichfork = XFS_ATTR_FORK;
-	args->hashval = xfs_da_hashname(args->name, args->namelen);
+	ASSERT(!args->trans);
 
-	/*
-	 * We have no control over the attribute names that userspace passes us
-	 * to remove, so we have to allow the name lookup prior to attribute
-	 * removal to fail as well.  Preserve the logged flag, since we need
-	 * to pass that through to the logging code.
-	 */
-	args->op_flags = XFS_DA_OP_OKNOENT |
-					(args->op_flags & XFS_DA_OP_LOGGED);
-
-	if (args->value) {
+	switch (op) {
+	case XFS_ATTRUPDATE_UPSERT:
+	case XFS_ATTRUPDATE_CREATE:
+	case XFS_ATTRUPDATE_REPLACE:
 		XFS_STATS_INC(mp, xs_attr_set);
 		args->total = xfs_attr_calc_size(args, &local);
 
@@ -963,16 +1026,18 @@ xfs_attr_set(
 				xfs_attr_sf_entsize_byname(args->namelen,
 						args->valuelen);
 
-			error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
+			error = xfs_attr_add_fork(dp, sf_size, rsvd);
 			if (error)
 				return error;
 		}
 
 		if (!local)
 			rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen);
-	} else {
+		break;
+	case XFS_ATTRUPDATE_REMOVE:
 		XFS_STATS_INC(mp, xs_attr_remove);
-		rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);
+		rmt_blks = xfs_attr3_max_rmt_blocks(mp);
+		break;
 	}
 
 	/*
@@ -984,12 +1049,9 @@ xfs_attr_set(
 	if (error)
 		return error;
 
-	if (args->value || xfs_inode_hasattr(dp)) {
-		error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK,
+	if (op != XFS_ATTRUPDATE_REMOVE || xfs_inode_hasattr(dp)) {
+		error = xfs_iext_count_extend(args->trans, dp, XFS_ATTR_FORK,
 				XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
-		if (error == -EFBIG)
-			error = xfs_iext_count_upgrade(args->trans, dp,
-					XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
 		if (error)
 			goto out_trans_cancel;
 	}
 
 	error = xfs_attr_lookup(args);
 	switch (error) {
 	case -EEXIST:
-		if (!args->value) {
+		if (op == XFS_ATTRUPDATE_REMOVE) {
 			/* if no value, we are performing a remove operation */
-			xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REMOVE);
+			xfs_attr_defer_add(args, XFS_ATTR_DEFER_REMOVE);
 			break;
 		}
 
 		/* Pure create fails if the attr already exists */
-		if (args->attr_flags & XATTR_CREATE)
+		if (op == XFS_ATTRUPDATE_CREATE)
 			goto out_trans_cancel;
-		xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REPLACE);
+		xfs_attr_defer_add(args, XFS_ATTR_DEFER_REPLACE);
 		break;
 	case -ENOATTR:
 		/* Can't remove what isn't there. */
-		if (!args->value)
+		if (op == XFS_ATTRUPDATE_REMOVE)
 			goto out_trans_cancel;
 
 		/* Pure replace fails if no existing attr to replace. */
-		if (args->attr_flags & XATTR_REPLACE)
+		if (op == XFS_ATTRUPDATE_REPLACE)
 			goto out_trans_cancel;
-		xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_SET);
+		xfs_attr_defer_add(args, XFS_ATTR_DEFER_SET);
 		break;
 	default:
 		goto out_trans_cancel;
@@ -1029,8 +1091,7 @@ xfs_attr_set(
 	if (xfs_has_wsync(mp))
 		xfs_trans_set_sync(args->trans);
 
-	if (!(args->op_flags & XFS_DA_OP_NOTIME))
-		xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
 
 	/*
 	 * Commit the last in the sequence of transactions.
@@ -1039,6 +1100,7 @@
 	error = xfs_trans_commit(args->trans);
 out_unlock:
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	args->trans = NULL;
 	return error;
 
 out_trans_cancel:
@@ -1051,7 +1113,7 @@ out_trans_cancel:
 * External routines when attribute list is inside the inode
 *========================================================================*/
 
-static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
+int xfs_attr_sf_totsize(struct xfs_inode *dp)
 {
 	struct xfs_attr_sf_hdr *sf = dp->i_af.if_data;
 
@@ -1154,7 +1216,7 @@ xfs_attr_leaf_try_add(
 	struct xfs_buf		*bp;
 	int			error;
 
-	error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, &bp);
 	if (error)
 		return error;
 
@@ -1202,7 +1264,7 @@ xfs_attr_leaf_hasname(
 {
 	int			error = 0;
 
-	error = xfs_attr3_leaf_read(args->trans, args->dp, 0, bp);
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, bp);
 	if (error)
 		return error;
 
@@ -1511,12 +1573,23 @@ out_release:
 	return error;
 }
 
+/* Enforce that there is at most one namespace bit per attr. */
+inline bool xfs_attr_check_namespace(unsigned int attr_flags)
+{
+	return hweight32(attr_flags & XFS_ATTR_NSP_ONDISK_MASK) < 2;
+}
+
 /* Returns true if the attribute entry name is valid. */
 bool
 xfs_attr_namecheck(
+	unsigned int	attr_flags,
 	const void	*name,
 	size_t		length)
 {
+	/* Only one namespace bit allowed. */
+	if (!xfs_attr_check_namespace(attr_flags))
+		return false;
+
 	/*
	 * MAXNAMELEN includes the trailing null, but (name/length) leave it
	 * out, so use >= for the length check.
	 */
@@ -1524,6 +1597,10 @@ xfs_attr_namecheck(
 	if (length >= MAXNAMELEN)
 		return false;
 
+	/* Parent pointers have their own validation. */
+	if (attr_flags & XFS_ATTR_PARENT)
+		return xfs_parent_namecheck(attr_flags, name, length);
+
 	/* There shouldn't be any nulls here */
 	return !memchr(name, 0, length);
 }
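With the new signature, callers of xfs_attr_set() name the operation explicitly instead of encoding it in args->value and the XATTR_CREATE/XATTR_REPLACE flags. A hypothetical call under the new convention (the locals are invented for illustration; only the enum values and the rsvd parameter come from this diff):

```c
struct xfs_da_args	args = {
	.dp		= ip,		/* hypothetical locals */
	.name		= name,
	.namelen	= namelen,
	.value		= value,
	.valuelen	= valuelen,
	.attr_filter	= XFS_ATTR_ROOT,
};

/* Create-or-replace; may dip into reserved blocks. */
error = xfs_attr_set(&args, XFS_ATTRUPDATE_UPSERT, true);

/* Removal is now an explicit op instead of value == NULL. */
error = xfs_attr_set(&args, XFS_ATTRUPDATE_REMOVE, false);
```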
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index 81be9b3e4004..088cb7b30168 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -47,8 +47,9 @@ struct xfs_attrlist_cursor_kern {
 
 /* void; state communicated via *context */
-typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int,
-			      unsigned char *, int, int);
+typedef void (*put_listent_func_t)(struct xfs_attr_list_context *context,
+		int flags, unsigned char *name, int namelen, void *value,
+		int valuelen);
 
 struct xfs_attr_list_context {
 	struct xfs_trans	*tp;
@@ -510,8 +511,8 @@ struct xfs_attr_intent {
 	struct xfs_da_args		*xattri_da_args;
 
 	/*
-	 * Shared buffer containing the attr name and value so that the logging
-	 * code can share large memory buffers between log items.
+	 * Shared buffer containing the attr name, new name, and value so that
+	 * the logging code can share large memory buffers between log items.
	 */
 	struct xfs_attri_log_nameval	*xattri_nameval;
 
@@ -529,6 +530,11 @@ struct xfs_attr_intent {
 	struct xfs_bmbt_irec		xattri_map;
 };
 
+static inline unsigned int
+xfs_attr_intent_op(const struct xfs_attr_intent *attr)
+{
+	return attr->xattri_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+}
 
 /*========================================================================
 * Function prototypes for the kernel.
@@ -544,10 +550,20 @@ int xfs_inode_hasattr(struct xfs_inode *ip);
 bool xfs_attr_is_leaf(struct xfs_inode *ip);
 int xfs_attr_get_ilocked(struct xfs_da_args *args);
 int xfs_attr_get(struct xfs_da_args *args);
-int xfs_attr_set(struct xfs_da_args *args);
+
+enum xfs_attr_update {
+	XFS_ATTRUPDATE_REMOVE,	/* remove attr */
+	XFS_ATTRUPDATE_UPSERT,	/* set value, replace any existing attr */
+	XFS_ATTRUPDATE_CREATE,	/* set value, fail if attr already exists */
+	XFS_ATTRUPDATE_REPLACE,	/* set value, fail if attr does not exist */
+};
+
+int xfs_attr_set(struct xfs_da_args *args, enum xfs_attr_update op, bool rsvd);
 int xfs_attr_set_iter(struct xfs_attr_intent *attr);
 int xfs_attr_remove_iter(struct xfs_attr_intent *attr);
-bool xfs_attr_namecheck(const void *name, size_t length);
+bool xfs_attr_check_namespace(unsigned int attr_flags);
+bool xfs_attr_namecheck(unsigned int attr_flags, const void *name,
+		size_t length);
 int xfs_attr_calc_size(struct xfs_da_args *args, int *local);
 void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres,
 		unsigned int *total);
@@ -590,7 +606,6 @@ xfs_attr_init_add_state(struct xfs_da_args *args)
 static inline enum xfs_delattr_state
 xfs_attr_init_remove_state(struct xfs_da_args *args)
 {
-	args->op_flags |= XFS_DA_OP_REMOVE;
 	if (xfs_attr_is_shortform(args->dp))
 		return XFS_DAS_SF_REMOVE;
 	if (xfs_attr_is_leaf(args->dp))
@@ -614,8 +629,25 @@ xfs_attr_init_replace_state(struct xfs_da_args *args)
 	return xfs_attr_init_add_state(args);
 }
 
+xfs_dahash_t xfs_attr_hashname(const uint8_t *name, int namelen);
+
+xfs_dahash_t xfs_attr_hashval(struct xfs_mount *mp, unsigned int attr_flags,
+		const uint8_t *name, int namelen, const void *value,
+		int valuelen);
+
+/* Set the hash value for any extended attribute from any namespace. */
+static inline void xfs_attr_sethash(struct xfs_da_args *args)
+{
+	args->hashval = xfs_attr_hashval(args->dp->i_mount, args->attr_filter,
+					 args->name, args->namelen,
+					 args->value, args->valuelen);
+}
+
 extern struct kmem_cache *xfs_attr_intent_cache;
 int __init xfs_attr_intent_init_cache(void);
 void xfs_attr_intent_destroy_cache(void);
 
+int xfs_attr_sf_totsize(struct xfs_inode *dp);
+int xfs_attr_add_fork(struct xfs_inode *ip, int size, int rsvd);
+
 #endif	/* __XFS_ATTR_H__ */
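put_listent_func_t now receives the attr value and its length in addition to the name. A hypothetical callback under the new typedef (name and body are illustrative only, not part of the patch):

```c
static void
example_put_listent(
	struct xfs_attr_list_context	*context,
	int				flags,
	unsigned char			*name,
	int				namelen,
	void				*value,
	int				valuelen)
{
	/*
	 * Name-only listers can ignore the two new arguments; callers that
	 * care about locally stored values (e.g. parent pointers) now get
	 * them without a second lookup.  value may be NULL for values that
	 * are stored remotely.
	 */
}
```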
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index ac904cc1a97b..b9e98950eb3d 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -388,6 +388,27 @@ xfs_attr3_leaf_verify(
 	return NULL;
 }
 
+xfs_failaddr_t
+xfs_attr3_leaf_header_check(
+	struct xfs_buf		*bp,
+	xfs_ino_t		owner)
+{
+	struct xfs_mount	*mp = bp->b_mount;
+
+	if (xfs_has_crc(mp)) {
+		struct xfs_attr3_leafblock *hdr3 = bp->b_addr;
+
+		if (hdr3->hdr.info.hdr.magic !=
+				cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+			return __this_address;
+
+		if (be64_to_cpu(hdr3->hdr.info.owner) != owner)
+			return __this_address;
+	}
+
+	return NULL;
+}
+
 static void
 xfs_attr3_leaf_write_verify(
 	struct xfs_buf	*bp)
@@ -448,16 +469,30 @@ int
 xfs_attr3_leaf_read(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*dp,
+	xfs_ino_t		owner,
 	xfs_dablk_t		bno,
 	struct xfs_buf		**bpp)
 {
+	xfs_failaddr_t		fa;
 	int			err;
 
 	err = xfs_da_read_buf(tp, dp, bno, 0, bpp, XFS_ATTR_FORK,
			&xfs_attr3_leaf_buf_ops);
-	if (!err && tp && *bpp)
+	if (err || !(*bpp))
+		return err;
+
+	fa = xfs_attr3_leaf_header_check(*bpp, owner);
+	if (fa) {
+		__xfs_buf_mark_corrupt(*bpp, fa);
+		xfs_trans_brelse(tp, *bpp);
+		*bpp = NULL;
+		xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
+		return -EFSCORRUPTED;
+	}
+
+	if (tp)
 		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
-	return err;
+	return 0;
 }
 
 /*========================================================================
@@ -472,28 +507,57 @@ xfs_attr3_leaf_read(
 * INCOMPLETE flag will not be set in attr->attr_filter, but rather
 * XFS_DA_OP_RECOVERY will be set in args->op_flags.
 */
+static inline unsigned int xfs_attr_match_mask(const struct xfs_da_args *args)
+{
+	if (args->op_flags & XFS_DA_OP_RECOVERY)
+		return XFS_ATTR_NSP_ONDISK_MASK;
+	return XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE;
+}
+
+static inline bool
+xfs_attr_parent_match(
+	const struct xfs_da_args	*args,
+	const void			*value,
+	unsigned int			valuelen)
+{
+	ASSERT(args->value != NULL);
+
+	/* Parent pointers do not use remote values */
+	if (!value)
+		return false;
+
+	/*
+	 * The only value we support is a parent rec.  However, we'll accept
+	 * any valuelen so that offline repair can delete ATTR_PARENT values
+	 * that are not parent pointers.
+	 */
+	if (valuelen != args->valuelen)
+		return false;
+
+	return memcmp(args->value, value, valuelen) == 0;
+}
+
 static bool
 xfs_attr_match(
 	struct xfs_da_args	*args,
-	uint8_t			namelen,
-	unsigned char		*name,
-	int			flags)
+	unsigned int		attr_flags,
+	const unsigned char	*name,
+	unsigned int		namelen,
+	const void		*value,
+	unsigned int		valuelen)
 {
+	unsigned int		mask = xfs_attr_match_mask(args);
 
 	if (args->namelen != namelen)
 		return false;
+	if ((args->attr_filter & mask) != (attr_flags & mask))
+		return false;
 	if (memcmp(args->name, name, namelen) != 0)
 		return false;
 
-	/* Recovery ignores the INCOMPLETE flag. */
-	if ((args->op_flags & XFS_DA_OP_RECOVERY) &&
-	    args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK))
-		return true;
+	if (attr_flags & XFS_ATTR_PARENT)
+		return xfs_attr_parent_match(args, value, valuelen);
 
-	/* All remaining matches need to be filtered by INCOMPLETE state. */
-	if (args->attr_filter !=
-	    (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE)))
-		return false;
 	return true;
 }
 
@@ -504,6 +568,13 @@ xfs_attr_copy_value(
 	int			valuelen)
 {
 	/*
+	 * Parent pointer lookups require the caller to specify the name and
+	 * value, so don't copy anything.
+	 */
+	if (args->attr_filter & XFS_ATTR_PARENT)
+		return 0;
+
+	/*
	 * No copy if all we have to do is get the length
	 */
 	if (!args->valuelen) {
@@ -711,8 +782,9 @@ xfs_attr_sf_findname(
 	for (sfe = xfs_attr_sf_firstentry(sf);
 	     sfe < xfs_attr_sf_endptr(sf);
 	     sfe = xfs_attr_sf_nextentry(sfe)) {
-		if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
-				sfe->flags))
+		if (xfs_attr_match(args, sfe->flags, sfe->nameval,
+				sfe->namelen, &sfe->nameval[sfe->namelen],
+				sfe->valuelen))
 			return sfe;
 	}
 
@@ -819,7 +891,8 @@ xfs_attr_sf_removename(
	 */
 	if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) &&
 	    (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
-	    !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) {
+	    !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE)) &&
+	    !xfs_has_parent(mp)) {
 		xfs_attr_fork_remove(dp, args->trans);
 	} else {
 		xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
@@ -828,7 +901,8 @@ xfs_attr_sf_removename(
 		ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) ||
 				(args->op_flags & XFS_DA_OP_ADDNAME) ||
 				!xfs_has_attr2(mp) ||
-				dp->i_df.if_format == XFS_DINODE_FMT_BTREE);
+				dp->i_df.if_format == XFS_DINODE_FMT_BTREE ||
+				xfs_has_parent(mp));
 		xfs_trans_log_inode(args->trans, dp,
 					XFS_ILOG_CORE | XFS_ILOG_ADATA);
 	}
@@ -904,6 +978,7 @@ xfs_attr_shortform_to_leaf(
 	nargs.whichfork = XFS_ATTR_FORK;
 	nargs.trans = args->trans;
 	nargs.op_flags = XFS_DA_OP_OKNOENT;
+	nargs.owner = args->owner;
 
 	sfe = xfs_attr_sf_firstentry(sf);
 	for (i = 0; i < sf->count; i++) {
@@ -911,9 +986,13 @@ xfs_attr_shortform_to_leaf(
 		nargs.namelen = sfe->namelen;
 		nargs.value = &sfe->nameval[nargs.namelen];
 		nargs.valuelen = sfe->valuelen;
-		nargs.hashval = xfs_da_hashname(sfe->nameval,
-						sfe->namelen);
 		nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK;
+		if (!xfs_attr_check_namespace(sfe->flags)) {
+			xfs_da_mark_sick(args);
+			error = -EFSCORRUPTED;
+			goto out;
+		}
+		xfs_attr_sethash(&nargs);
 		error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
 		ASSERT(error == -ENOATTR);
 		error = xfs_attr3_leaf_add(bp, &nargs);
@@ -1027,7 +1106,7 @@ xfs_attr_shortform_verify(
		 * one namespace flag per xattr, so we can just count the
		 * bits (i.e. hweight) here.
		 */
-		if (hweight8(sfep->flags & XFS_ATTR_NSP_ONDISK_MASK) > 1)
+		if (!xfs_attr_check_namespace(sfep->flags))
 			return __this_address;
 
 		sfep = next_sfep;
@@ -1106,6 +1185,7 @@ xfs_attr3_leaf_to_shortform(
 	nargs.whichfork = XFS_ATTR_FORK;
 	nargs.trans = args->trans;
 	nargs.op_flags = XFS_DA_OP_OKNOENT;
+	nargs.owner = args->owner;
 
 	for (i = 0; i < ichdr.count; entry++, i++) {
 		if (entry->flags & XFS_ATTR_INCOMPLETE)
@@ -1158,7 +1238,7 @@ xfs_attr3_leaf_to_node(
 	error = xfs_da_grow_inode(args, &blkno);
 	if (error)
 		goto out;
-	error = xfs_attr3_leaf_read(args->trans, dp, 0, &bp1);
+	error = xfs_attr3_leaf_read(args->trans, dp, args->owner, 0, &bp1);
 	if (error)
 		goto out;
 
@@ -1237,7 +1317,7 @@ xfs_attr3_leaf_create(
 		ichdr.magic = XFS_ATTR3_LEAF_MAGIC;
 
 		hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
-		hdr3->owner = cpu_to_be64(dp->i_ino);
+		hdr3->owner = cpu_to_be64(args->owner);
 		uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
 
 		ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
@@ -1993,7 +2073,7 @@ xfs_attr3_leaf_toosmall(
 		if (blkno == 0)
 			continue;
 		error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
-					blkno, &bp);
+				state->args->owner, blkno, &bp);
 		if (error)
 			return error;
 
@@ -2401,18 +2481,23 @@ xfs_attr3_leaf_lookup_int(
		 */
 		if (entry->flags & XFS_ATTR_LOCAL) {
 			name_loc = xfs_attr3_leaf_name_local(leaf, probe);
-			if (!xfs_attr_match(args, name_loc->namelen,
-					name_loc->nameval, entry->flags))
+			if (!xfs_attr_match(args, entry->flags,
+					name_loc->nameval, name_loc->namelen,
+					&name_loc->nameval[name_loc->namelen],
+					be16_to_cpu(name_loc->valuelen)))
 				continue;
 			args->index = probe;
 			return -EEXIST;
 		} else {
+			unsigned int	valuelen;
+
 			name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
-			if (!xfs_attr_match(args, name_rmt->namelen,
-					name_rmt->name, entry->flags))
+			valuelen = be32_to_cpu(name_rmt->valuelen);
+			if (!xfs_attr_match(args, entry->flags, name_rmt->name,
+					name_rmt->namelen, NULL, valuelen))
 				continue;
 			args->index = probe;
-			args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+			args->rmtvaluelen = valuelen;
 			args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
 			args->rmtblkcnt = xfs_attr3_rmt_blocks(
 							args->dp->i_mount,
@@ -2715,7 +2800,8 @@ xfs_attr3_leaf_clearflag(
	/*
	 * Set up the operation.
	 */
-	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner,
+			args->blkno, &bp);
 	if (error)
 		return error;
 
@@ -2779,7 +2865,8 @@ xfs_attr3_leaf_setflag(
	/*
	 * Set up the operation.
	 */
-	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner,
+			args->blkno, &bp);
 	if (error)
 		return error;
 
@@ -2838,7 +2925,8 @@ xfs_attr3_leaf_flipflags(
	/*
	 * Read the block containing the "old" attr
	 */
-	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp1);
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner,
+			args->blkno, &bp1);
 	if (error)
 		return error;
 
@@ -2846,8 +2934,8 @@ xfs_attr3_leaf_flipflags(
	 * Read the block containing the "new" attr, if it is different
	 */
 	if (args->blkno2 != args->blkno) {
-		error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2,
-					    &bp2);
+		error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner,
+				args->blkno2, &bp2);
 		if (error)
 			return error;
 	} else {
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 9b9948639c0f..bac219589896 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -98,12 +98,14 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
 			struct xfs_buf *leaf2_bp);
 int	xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
 int	xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
-			xfs_dablk_t bno, struct xfs_buf **bpp);
+			xfs_ino_t owner, xfs_dablk_t bno, struct xfs_buf **bpp);
 void	xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo,
 				     struct xfs_attr3_icleaf_hdr *to,
 				     struct xfs_attr_leafblock *from);
 void	xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo,
 				   struct xfs_attr_leafblock *to,
 				   struct xfs_attr3_icleaf_hdr *from);
+xfs_failaddr_t xfs_attr3_leaf_header_check(struct xfs_buf *bp,
+		xfs_ino_t owner);
 
 #endif	/* __XFS_ATTR_LEAF_H__ */
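The owner argument threaded through xfs_attr3_leaf_read() exists so that a caller operating on one inode's attr fork on behalf of another inode (as the new online-repair temporary files do) can still validate the on-disk owner field. A hedged sketch of the expected call-site setup; ordinary callers just use the inode's own number, which xfs_attr_get() now primes into args->owner:

```c
/* args->owner defaults to the inode itself unless the caller overrides it. */
if (!args->owner)
	args->owner = args->dp->i_ino;

error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, &bp);
/* -EFSCORRUPTED here now covers an owner mismatch, not just bad magic. */
```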
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index ff0412828772..4c44ce1c8a64 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -43,19 +43,32 @@
 * the logging system and therefore never have a log item.
 */
 
-/*
- * Each contiguous block has a header, so it is not just a simple attribute
- * length to FSB conversion.
- */
-int
+/* How many bytes can be stored in a remote value buffer? */
+inline unsigned int
+xfs_attr3_rmt_buf_space(
+	struct xfs_mount	*mp)
+{
+	unsigned int		blocksize = mp->m_attr_geo->blksize;
+
+	if (xfs_has_crc(mp))
+		return blocksize - sizeof(struct xfs_attr3_rmt_hdr);
+
+	return blocksize;
+}
+
+/* Compute number of fsblocks needed to store a remote attr value */
+unsigned int
 xfs_attr3_rmt_blocks(
-	struct xfs_mount *mp,
-	int attrlen)
+	struct xfs_mount	*mp,
+	unsigned int		attrlen)
 {
-	if (xfs_has_crc(mp)) {
-		int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
-		return (attrlen + buflen - 1) / buflen;
-	}
+	/*
+	 * Each contiguous block has a header, so it is not just a simple
+	 * attribute length to FSB conversion.
+	 */
+	if (xfs_has_crc(mp))
+		return howmany(attrlen, xfs_attr3_rmt_buf_space(mp));
+
 	return XFS_B_TO_FSB(mp, attrlen);
 }
 
@@ -92,7 +105,6 @@ xfs_attr3_rmt_verify(
 	struct xfs_mount	*mp,
 	struct xfs_buf		*bp,
 	void			*ptr,
-	int			fsbsize,
 	xfs_daddr_t		bno)
 {
 	struct xfs_attr3_rmt_hdr *rmt = ptr;
@@ -103,7 +115,7 @@ xfs_attr3_rmt_verify(
 		return __this_address;
 	if (be64_to_cpu(rmt->rm_blkno) != bno)
 		return __this_address;
-	if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
+	if (be32_to_cpu(rmt->rm_bytes) > mp->m_attr_geo->blksize - sizeof(*rmt))
 		return __this_address;
 	if (be32_to_cpu(rmt->rm_offset) +
 	    be32_to_cpu(rmt->rm_bytes) > XFS_XATTR_SIZE_MAX)
 		return __this_address;
@@ -122,9 +134,9 @@ __xfs_attr3_rmt_read_verify(
 {
 	struct xfs_mount *mp = bp->b_mount;
 	char		*ptr;
-	int		len;
+	unsigned int	len;
 	xfs_daddr_t	bno;
-	int		blksize = mp->m_attr_geo->blksize;
+	unsigned int	blksize = mp->m_attr_geo->blksize;
 
 	/* no verification of non-crc buffers */
 	if (!xfs_has_crc(mp))
@@ -141,7 +153,7 @@ __xfs_attr3_rmt_read_verify(
 			*failaddr = __this_address;
 			return -EFSBADCRC;
 		}
-		*failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
+		*failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, bno);
 		if (*failaddr)
 			return -EFSCORRUPTED;
 		len -= blksize;
@@ -186,7 +198,7 @@ xfs_attr3_rmt_write_verify(
 {
 	struct xfs_mount *mp = bp->b_mount;
 	xfs_failaddr_t	fa;
-	int		blksize = mp->m_attr_geo->blksize;
+	unsigned int	blksize = mp->m_attr_geo->blksize;
 	char		*ptr;
 	int		len;
 	xfs_daddr_t	bno;
@@ -203,7 +215,7 @@ xfs_attr3_rmt_write_verify(
 	while (len > 0) {
 		struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr;
 
-		fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
+		fa = xfs_attr3_rmt_verify(mp, bp, ptr, bno);
 		if (fa) {
 			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 			return;
@@ -280,30 +292,30 @@ xfs_attr_rmtval_copyout(
 	struct xfs_mount *mp,
 	struct xfs_buf	*bp,
 	struct xfs_inode *dp,
-	int		*offset,
-	int		*valuelen,
+	xfs_ino_t	owner,
+	unsigned int	*offset,
+	unsigned int	*valuelen,
 	uint8_t		**dst)
 {
 	char		*src = bp->b_addr;
-	xfs_ino_t	ino = dp->i_ino;
 	xfs_daddr_t	bno = xfs_buf_daddr(bp);
-	int		len = BBTOB(bp->b_length);
-	int		blksize = mp->m_attr_geo->blksize;
+	unsigned int	len = BBTOB(bp->b_length);
+	unsigned int	blksize = mp->m_attr_geo->blksize;
 
 	ASSERT(len >= blksize);
 
 	while (len > 0 && *valuelen > 0) {
-		int hdr_size = 0;
-		int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+		unsigned int hdr_size = 0;
+		unsigned int byte_cnt = xfs_attr3_rmt_buf_space(mp);
 
 		byte_cnt = min(*valuelen, byte_cnt);
 
 		if (xfs_has_crc(mp)) {
-			if (xfs_attr3_rmt_hdr_ok(src, ino, *offset,
+			if (xfs_attr3_rmt_hdr_ok(src, owner, *offset,
						  byte_cnt, bno)) {
 				xfs_alert(mp,
"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
-					bno, *offset, byte_cnt, ino);
+					bno, *offset, byte_cnt, owner);
 				xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
 				return -EFSCORRUPTED;
 			}
@@ -330,20 +342,20 @@ xfs_attr_rmtval_copyin(
 	struct xfs_mount *mp,
 	struct xfs_buf	*bp,
 	xfs_ino_t	ino,
-	int		*offset,
-	int		*valuelen,
+	unsigned int	*offset,
+	unsigned int	*valuelen,
 	uint8_t		**src)
 {
 	char		*dst = bp->b_addr;
 	xfs_daddr_t	bno = xfs_buf_daddr(bp);
-	int		len = BBTOB(bp->b_length);
-	int		blksize = mp->m_attr_geo->blksize;
+	unsigned int	len = BBTOB(bp->b_length);
+	unsigned int	blksize = mp->m_attr_geo->blksize;
 
 	ASSERT(len >= blksize);
 
 	while (len > 0 && *valuelen > 0) {
-		int hdr_size;
-		int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+		unsigned int hdr_size;
+		unsigned int byte_cnt = xfs_attr3_rmt_buf_space(mp);
 
 		byte_cnt = min(*valuelen, byte_cnt);
 		hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
@@ -389,12 +401,12 @@ xfs_attr_rmtval_get(
 	struct xfs_buf		*bp;
 	xfs_dablk_t		lblkno = args->rmtblkno;
 	uint8_t			*dst = args->value;
-	int			valuelen;
+	unsigned int		valuelen;
 	int			nmap;
 	int			error;
-	int			blkcnt = args->rmtblkcnt;
+	unsigned int		blkcnt = args->rmtblkcnt;
 	int			i;
-	int			offset = 0;
+	unsigned int		offset = 0;
 
 	trace_xfs_attr_rmtval_get(args);
 
@@ -427,8 +439,7 @@ xfs_attr_rmtval_get(
 				return error;
 
 			error = xfs_attr_rmtval_copyout(mp, bp, args->dp,
-							&offset, &valuelen,
-							&dst);
+					args->owner, &offset, &valuelen, &dst);
 			xfs_buf_relse(bp);
 			if (error)
 				return error;
@@ -453,7 +464,7 @@ xfs_attr_rmt_find_hole(
 	struct xfs_inode	*dp = args->dp;
 	struct xfs_mount	*mp = dp->i_mount;
 	int			error;
-	int			blkcnt;
+	unsigned int		blkcnt;
 	xfs_fileoff_t		lfileoff = 0;
 
 	/*
@@ -482,11 +493,11 @@ xfs_attr_rmtval_set_value(
 	struct xfs_bmbt_irec	map;
 	xfs_dablk_t		lblkno;
 	uint8_t			*src = args->value;
-	int			blkcnt;
-	int			valuelen;
+	unsigned int		blkcnt;
+	unsigned int		valuelen;
 	int			nmap;
 	int			error;
-	int			offset = 0;
+	unsigned int		offset = 0;
 
 	/*
	 * Roll through the "value", copying the attribute value to the
@@ -522,8 +533,8 @@ xfs_attr_rmtval_set_value(
 			return error;
 		bp->b_ops = &xfs_attr3_rmt_buf_ops;
 
-		xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
-				       &valuelen, &src);
+		xfs_attr_rmtval_copyin(mp, bp, args->owner, &offset, &valuelen,
+				&src);
 
 		error = xfs_bwrite(bp);	/* GROT: NOTE: synchronous write */
 		xfs_buf_relse(bp);
@@ -626,7 +637,6 @@ xfs_attr_rmtval_set_blk(
 	if (error)
 		return error;
 
-	ASSERT(nmap == 1);
 	ASSERT((map->br_startblock != DELAYSTARTBLOCK) &&
 	       (map->br_startblock != HOLESTARTBLOCK));
 
@@ -646,7 +656,7 @@ xfs_attr_rmtval_invalidate(
 	struct xfs_da_args	*args)
 {
 	xfs_dablk_t	lblkno;
-	int		blkcnt;
+	unsigned int	blkcnt;
 	int		error;
 
 	/*
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
index d097ec6c4dc3..e3c6c7d774bf 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.h
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
@@ -6,7 +6,13 @@
 #ifndef __XFS_ATTR_REMOTE_H__
 #define __XFS_ATTR_REMOTE_H__
 
-int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
+unsigned int xfs_attr3_rmt_blocks(struct xfs_mount *mp, unsigned int attrlen);
+
+/* Number of rmt blocks needed to store the maximally sized attr value */
+static inline unsigned int xfs_attr3_max_rmt_blocks(struct xfs_mount *mp)
+{
+	return xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);
+}
 
 int xfs_attr_rmtval_get(struct xfs_da_args *args);
 int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index bc4422223024..73bdc0e55682 100644
--- a/fs/xfs/libxfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -16,6 +16,7 @@ typedef struct xfs_attr_sf_sort {
 	uint8_t		flags;	/* flags bits (see xfs_attr_leaf.h) */
 	xfs_dahash_t	hash;	/* this entry's hash value */
 	unsigned char	*name;	/* name value, pointer into buffer */
+	void		*value;
 } xfs_attr_sf_sort_t;
 
 #define XFS_ATTR_SF_ENTSIZE_MAX			/* max space for name&value */ \
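The rewritten xfs_attr3_rmt_blocks() is a ceiling division over the per-block payload. A standalone worked example of the arithmetic; the 56-byte size of struct xfs_attr3_rmt_hdr is an assumption for illustration, not something stated in this diff:

```c
#include <stdio.h>

/* howmany() as the kernel defines it: ceiling division */
#define howmany(x, y)	(((x) + ((y) - 1)) / (y))

int main(void)
{
	unsigned int blksize = 4096;			/* attr geometry block size */
	unsigned int hdrsize = 56;			/* assumed rmt header size */
	unsigned int buf_space = blksize - hdrsize;	/* 4040 usable bytes */
	unsigned int attrlen = 65536;			/* XFS_XATTR_SIZE_MAX */

	/* 65536 / 4040 rounds up to 17 blocks, not the naive 16. */
	printf("%u blocks\n", howmany(attrlen, buf_space));
	return 0;
}
```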
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 656c95a22f2e..3b3206d312d6 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -779,7 +779,7 @@ xfs_bmap_local_to_extents_empty(
 }
 
 
-STATIC int				/* error */
+int					/* error */
 xfs_bmap_local_to_extents(
 	xfs_trans_t	*tp,		/* transaction pointer */
 	xfs_inode_t	*ip,		/* incore inode pointer */
@@ -789,7 +789,8 @@ xfs_bmap_local_to_extents(
 	void		(*init_fn)(struct xfs_trans *tp,
 				   struct xfs_buf *bp,
 				   struct xfs_inode *ip,
-				   struct xfs_ifork *ifp))
+				   struct xfs_ifork *ifp, void *priv),
+	void		*priv)
 {
 	int		error = 0;
 	int		flags;		/* logging flags returned */
@@ -850,7 +851,7 @@ xfs_bmap_local_to_extents(
	 * log here. Note that init_fn must also set the buffer log item type
	 * correctly.
	 */
-	init_fn(tp, bp, ip, ifp);
+	init_fn(tp, bp, ip, ifp, priv);
 
 	/* account for the change in fork size */
 	xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
@@ -976,13 +977,14 @@ xfs_bmap_add_attrfork_local(
 		dargs.total = dargs.geo->fsbcount;
 		dargs.whichfork = XFS_DATA_FORK;
 		dargs.trans = tp;
+		dargs.owner = ip->i_ino;
 		return xfs_dir2_sf_to_block(&dargs);
 	}
 
 	if (S_ISLNK(VFS_I(ip)->i_mode))
 		return xfs_bmap_local_to_extents(tp, ip, 1, flags,
-						 XFS_DATA_FORK,
-						 xfs_symlink_local_to_remote);
+				XFS_DATA_FORK, xfs_symlink_local_to_remote,
+				NULL);
 
 	/* should only be called for types that support local format data */
 	ASSERT(0);
@@ -1023,40 +1025,29 @@ xfs_bmap_set_attrforkoff(
 }
 
 /*
- * Convert inode from non-attributed to attributed.
- * Must not be in a transaction, ip must not be locked.
+ * Convert inode from non-attributed to attributed.  Caller must hold the
+ * ILOCK_EXCL and the file cannot have an attr fork.
 */
 int						/* error code */
 xfs_bmap_add_attrfork(
-	xfs_inode_t		*ip,		/* incore inode pointer */
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,		/* incore inode pointer */
 	int			size,		/* space new attribute needs */
 	int			rsvd)		/* xact may use reserved blks */
 {
-	xfs_mount_t		*mp;		/* mount structure */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	int			blks;		/* space reservation */
+	struct xfs_mount	*mp = tp->t_mountp;
 	int			version = 1;	/* superblock attr version */
 	int			logflags;	/* logging flags */
 	int			error;		/* error return value */
 
-	ASSERT(xfs_inode_has_attr_fork(ip) == 0);
-
-	mp = ip->i_mount;
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
-
-	blks = XFS_ADDAFORK_SPACE_RES(mp);
-
-	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0,
-			rsvd, &tp);
-	if (error)
-		return error;
-	if (xfs_inode_has_attr_fork(ip))
-		goto trans_cancel;
+	ASSERT(!xfs_inode_has_attr_fork(ip));
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	error = xfs_bmap_set_attrforkoff(ip, size, &version);
 	if (error)
-		goto trans_cancel;
+		return error;
 
 	xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
 	logflags = 0;
@@ -1077,7 +1068,7 @@ xfs_bmap_add_attrfork(
 	if (logflags)
 		xfs_trans_log_inode(tp, ip, logflags);
 	if (error)
-		goto trans_cancel;
+		return error;
 	if (!xfs_has_attr(mp) ||
 	   (!xfs_has_attr2(mp) && version == 2)) {
 		bool log_sb = false;
@@ -1096,14 +1087,7 @@ xfs_bmap_add_attrfork(
 			xfs_log_sb(tp);
 	}
 
-	error = xfs_trans_commit(tp);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return error;
-
-trans_cancel:
-	xfs_trans_cancel(tp);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return error;
+	return 0;
 }
 
 /*
@@ -1586,6 +1570,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 		}
+		ASSERT(da_new <= da_old);
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -1616,6 +1601,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 		}
+		ASSERT(da_new <= da_old);
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1650,6 +1636,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 		}
+		ASSERT(da_new <= da_old);
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -1684,6 +1671,7 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 			}
 		}
+		ASSERT(da_new <= da_old);
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -1722,6 +1710,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 		}
+		ASSERT(da_new <= da_old);
 		break;
 
 	case BMAP_LEFT_FILLING:
@@ -1812,6 +1801,7 @@ xfs_bmap_add_extent_delay_real(
 		xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
 		xfs_iext_next(ifp, &bma->icur);
 		xfs_iext_update_extent(bma->ip, state, &bma->icur, &RIGHT);
+		ASSERT(da_new <= da_old);
 		break;
 
 	case BMAP_RIGHT_FILLING:
@@ -1861,6 +1851,7 @@ xfs_bmap_add_extent_delay_real(
 		PREV.br_blockcount = temp;
 		xfs_iext_insert(bma->ip, &bma->icur, &PREV, state);
 		xfs_iext_next(ifp, &bma->icur);
+		ASSERT(da_new <= da_old);
 		break;
 
 	case 0:
@@ -1975,7 +1966,7 @@ xfs_bmap_add_extent_delay_real(
 	}
 
 	if (da_new != da_old)
-		xfs_mod_delalloc(mp, (int64_t)da_new - da_old);
+		xfs_mod_delalloc(bma->ip, 0, (int64_t)da_new - da_old);
 
 	if (bma->cur) {
 		da_new += bma->cur->bc_bmap.allocated;
@@ -1983,11 +1974,10 @@ xfs_bmap_add_extent_delay_real(
 	}
 
 	/* adjust for changes in reserved delayed indirect blocks */
-	if (da_new != da_old) {
-		ASSERT(state == 0 || da_new < da_old);
-		error = xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new),
-				false);
-	}
+	if (da_new < da_old)
+		xfs_add_fdblocks(mp, da_old - da_new);
+	else if (da_new > da_old)
+		error = xfs_dec_fdblocks(mp, da_new - da_old, true);
 
 	xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
 done:
@@ -2688,12 +2678,12 @@ xfs_bmap_add_extent_hole_delay(
 	}
 	if (oldlen != newlen) {
 		ASSERT(oldlen > newlen);
-		xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen),
-				 false);
+		xfs_add_fdblocks(ip->i_mount, oldlen - newlen);
+
		/*
		 * Nothing to do for disk quota accounting here.
		 */
-		xfs_mod_delalloc(ip->i_mount, (int64_t)newlen - oldlen);
+		xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen);
 	}
 }
 
@@ -3370,7 +3360,7 @@ xfs_bmap_alloc_account(
	 * yet.
	 */
 	if (ap->wasdel) {
-		xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length);
+		xfs_mod_delalloc(ap->ip, -(int64_t)ap->length, 0);
 		return;
 	}
 
@@ -3394,7 +3384,7 @@ xfs_bmap_alloc_account(
 	xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
 	if (ap->wasdel) {
 		ap->ip->i_delayed_blks -= ap->length;
-		xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length);
+		xfs_mod_delalloc(ap->ip, -(int64_t)ap->length, 0);
 		fld = isrt ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_DELBCOUNT;
 	} else {
 		fld = isrt ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
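xfs_mod_delalloc() now takes the inode plus two deltas instead of a mount and one combined delta. A hedged sketch of the assumed new prototype, inferred from the converted call sites above:

```c
/*
 * Assumed shape: delalloc accounting is split into data blocks and
 * worst-case indirect (bmbt) blocks, because for realtime inodes the
 * data blocks come from the RT device while the indirect blocks still
 * come from the main free space pool.
 */
void xfs_mod_delalloc(struct xfs_inode *ip, int64_t data_delta,
		int64_t ind_delta);
```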
@@ -4066,6 +4056,7 @@ xfs_bmapi_reserve_delalloc(
 	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	xfs_extlen_t		alen;
 	xfs_extlen_t		indlen;
+	uint64_t		fdblocks;
 	int			error;
 	xfs_fileoff_t		aoff = off;
 
@@ -4108,17 +4099,21 @@ xfs_bmapi_reserve_delalloc(
 	indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
 	ASSERT(indlen > 0);
 
-	error = xfs_mod_fdblocks(mp, -((int64_t)alen), false);
-	if (error)
-		goto out_unreserve_quota;
+	fdblocks = indlen;
+	if (XFS_IS_REALTIME_INODE(ip)) {
+		error = xfs_dec_frextents(mp, xfs_rtb_to_rtx(mp, alen));
+		if (error)
+			goto out_unreserve_quota;
+	} else {
+		fdblocks += alen;
+	}
 
-	error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false);
+	error = xfs_dec_fdblocks(mp, fdblocks, false);
 	if (error)
-		goto out_unreserve_blocks;
-
+		goto out_unreserve_frextents;
 
 	ip->i_delayed_blks += alen;
-	xfs_mod_delalloc(ip->i_mount, alen + indlen);
+	xfs_mod_delalloc(ip, alen, indlen);
 
 	got->br_startoff = aoff;
 	got->br_startblock = nullstartblock(indlen);
@@ -4139,8 +4134,9 @@ xfs_bmapi_reserve_delalloc(
 
 	return 0;
 
-out_unreserve_blocks:
-	xfs_mod_fdblocks(mp, alen, false);
+out_unreserve_frextents:
+	if (XFS_IS_REALTIME_INODE(ip))
+		xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, alen));
 out_unreserve_quota:
 	if (XFS_IS_QUOTA_ON(mp))
 		xfs_quota_unreserve_blkres(ip, alen);
@@ -4191,26 +4187,10 @@ xfs_bmapi_allocate(
 	struct xfs_mount	*mp = bma->ip->i_mount;
 	int			whichfork = xfs_bmapi_whichfork(bma->flags);
 	struct xfs_ifork	*ifp = xfs_ifork_ptr(bma->ip, whichfork);
-	int			tmp_logflags = 0;
 	int			error;
 
 	ASSERT(bma->length > 0);
-
-	/*
-	 * For the wasdelay case, we could also just allocate the stuff asked
-	 * for in this bmap call but that wouldn't be as good.
-	 */
-	if (bma->wasdel) {
-		bma->length = (xfs_extlen_t)bma->got.br_blockcount;
-		bma->offset = bma->got.br_startoff;
-		if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev))
-			bma->prev.br_startoff = NULLFILEOFF;
-	} else {
-		bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN);
-		if (!bma->eof)
-			bma->length = XFS_FILBLKS_MIN(bma->length,
-					bma->got.br_startoff - bma->offset);
-	}
+	ASSERT(bma->length <= XFS_MAX_BMBT_EXTLEN);
 
 	if (bma->flags & XFS_BMAPI_CONTIG)
 		bma->minlen = bma->length;
@@ -4226,8 +4206,15 @@ xfs_bmapi_allocate(
 	} else {
 		error = xfs_bmap_alloc_userdata(bma);
 	}
-	if (error || bma->blkno == NULLFSBLOCK)
+	if (error)
 		return error;
+	if (bma->blkno == NULLFSBLOCK)
+		return -ENOSPC;
+
+	if (WARN_ON_ONCE(!xfs_valid_startblock(bma->ip, bma->blkno))) {
+		xfs_bmap_mark_sick(bma->ip, whichfork);
+		return -EFSCORRUPTED;
+	}
 
 	if (bma->flags & XFS_BMAPI_ZERO) {
 		error = xfs_zero_extent(bma->ip, bma->blkno, bma->length);
@@ -4260,8 +4247,6 @@ xfs_bmapi_allocate(
 	error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, whichfork,
 			&bma->icur, &bma->cur, &bma->got, &bma->logflags,
 			bma->flags);
-
-	bma->logflags |= tmp_logflags;
 	if (error)
 		return error;
 
@@ -4406,6 +4391,15 @@ xfs_bmapi_finish(
 * extent state if necessary.  Details behaviour is controlled by the flags
 * parameter.  Only allocates blocks from a single allocation group, to avoid
 * locking problems.
+ *
+ * Returns 0 on success and places the extent mappings in mval.  nmaps is
+ * used as an input/output parameter where the caller specifies the maximum
+ * number of mappings that may be returned and xfs_bmapi_write passes back
+ * the number of mappings (including existing mappings) it found.
+ *
+ * Returns a negative error code on failure, including -ENOSPC when it could
+ * not allocate any blocks and -ENOSR when it did allocate blocks to convert
+ * a delalloc range, but those blocks were before the passed in range.
 */
 int
 xfs_bmapi_write(
@@ -4524,20 +4518,33 @@ xfs_bmapi_write(
			 * allocation length request (which can be 64 bits in
			 * length) and the bma length request, which is
			 * xfs_extlen_t and therefore 32 bits. Hence we have to
-			 * check for 32-bit overflows and handle them here.
+			 * be careful and do the min() using the larger type to
+			 * avoid overflows.
			 */
-			if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN)
-				bma.length = XFS_MAX_BMBT_EXTLEN;
-			else
-				bma.length = len;
+			bma.length = XFS_FILBLKS_MIN(len, XFS_MAX_BMBT_EXTLEN);
+
+			if (wasdelay) {
+				bma.length = XFS_FILBLKS_MIN(bma.length,
+					bma.got.br_blockcount -
+					(bno - bma.got.br_startoff));
+			} else {
+				if (!eof)
+					bma.length = XFS_FILBLKS_MIN(bma.length,
+						bma.got.br_startoff - bno);
+			}
 
-			ASSERT(len > 0);
 			ASSERT(bma.length > 0);
 			error = xfs_bmapi_allocate(&bma);
-			if (error)
+			if (error) {
+				/*
+				 * If we already allocated space in a previous
+				 * iteration return what we got so far when
+				 * running out of space.
+				 */
+				if (error == -ENOSPC && bma.nallocs)
+					break;
 				goto error0;
-			if (bma.blkno == NULLFSBLOCK)
-				break;
+			}
 
			/*
			 * If this is a CoW allocation, record the data in
@@ -4575,7 +4582,6 @@ xfs_bmapi_write(
 		if (!xfs_iext_next_extent(ifp, &bma.icur, &bma.got))
 			eof = true;
 	}
-	*nmap = n;
 
 	error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
 			whichfork);
@@ -4586,7 +4592,22 @@ xfs_bmapi_write(
 	       ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork));
 	xfs_bmapi_finish(&bma, whichfork, 0);
 	xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
-		orig_nmap, *nmap);
+		orig_nmap, n);
+
+	/*
+	 * When converting delayed allocations, xfs_bmapi_allocate ignores
+	 * the passed in bno and always converts from the start of the found
+	 * delalloc extent.
+	 *
+	 * To avoid a successful return with *nmap set to 0, return the magic
+	 * -ENOSR error code for this particular case so that the caller can
+	 * handle it.
+	 */
+	if (!n) {
+		ASSERT(bma.nallocs >= *nmap);
+		return -ENOSR;
+	}
+	*nmap = n;
 	return 0;
error0:
 	xfs_bmapi_finish(&bma, whichfork, error);
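A hypothetical caller under the contract documented above: a conversion that lands entirely before the requested range now comes back as -ENOSR rather than as a success with *nmap == 0, so the caller can simply loop. Sketch only; real callers would also bound the retries and handle -ENOSPC:

```c
int	nimaps;
int	error;

do {
	nimaps = 1;
	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
			XFS_BMAPI_CONVERT, 0, &imap, &nimaps);
} while (error == -ENOSR);	/* blocks allocated, but before offset_fsb */
```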
@@ -4599,8 +4620,8 @@ error0:
 * invocations to allocate the target offset if a large enough physical extent
 * is not available.
 */
-int
-xfs_bmapi_convert_delalloc(
+static int
+xfs_bmapi_convert_one_delalloc(
 	struct xfs_inode	*ip,
 	int			whichfork,
 	xfs_off_t		offset,
@@ -4630,11 +4651,8 @@ xfs_bmapi_convert_delalloc(
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, 0);
 
-	error = xfs_iext_count_may_overflow(ip, whichfork,
+	error = xfs_iext_count_extend(tp, ip, whichfork,
 			XFS_IEXT_ADD_NOSPLIT_CNT);
-	if (error == -EFBIG)
-		error = xfs_iext_count_upgrade(tp, ip,
-				XFS_IEXT_ADD_NOSPLIT_CNT);
 	if (error)
 		goto out_trans_cancel;
 
@@ -4657,19 +4675,25 @@ xfs_bmapi_convert_delalloc(
 	if (!isnullstartblock(bma.got.br_startblock)) {
 		xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
 				xfs_iomap_inode_sequence(ip, flags));
-		*seq = READ_ONCE(ifp->if_seq);
+		if (seq)
+			*seq = READ_ONCE(ifp->if_seq);
 		goto out_trans_cancel;
 	}
 
 	bma.tp = tp;
 	bma.ip = ip;
 	bma.wasdel = true;
-	bma.offset = bma.got.br_startoff;
-	bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount,
-			XFS_MAX_BMBT_EXTLEN);
 	bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
 
	/*
+	 * Always allocate convert from the start of the delalloc extent even
+	 * if that is outside the passed in range to create large contiguous
+	 * extents on disk.
+	 */
+	bma.offset = bma.got.br_startoff;
+	bma.length = bma.got.br_blockcount;
+
+	/*
	 * When we're converting the delalloc reservations backing dirty pages
	 * in the page cache, we must be careful about how we create the new
	 * extents:
@@ -4693,22 +4717,14 @@ xfs_bmapi_convert_delalloc(
 	if (error)
 		goto out_finish;
 
-	error = -ENOSPC;
-	if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
-		goto out_finish;
-	if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) {
-		xfs_bmap_mark_sick(ip, whichfork);
-		error = -EFSCORRUPTED;
-		goto out_finish;
-	}
-
 	XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
 	XFS_STATS_INC(mp, xs_xstrat_quick);
 
 	ASSERT(!isnullstartblock(bma.got.br_startblock));
 	xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
 				xfs_iomap_inode_sequence(ip, flags));
-	*seq = READ_ONCE(ifp->if_seq);
+	if (seq)
+		*seq = READ_ONCE(ifp->if_seq);
 
 	if (whichfork == XFS_COW_FORK)
 		xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
@@ -4731,6 +4747,36 @@ out_trans_cancel:
 	return error;
 }
 
+/*
+ * Pass in a delalloc extent and convert it to real extents, return the real
+ * extent that maps offset_fsb in iomap.
+ */
+int
+xfs_bmapi_convert_delalloc(
+	struct xfs_inode	*ip,
+	int			whichfork,
+	loff_t			offset,
+	struct iomap		*iomap,
+	unsigned int		*seq)
+{
+	int			error;
+
+	/*
+	 * Attempt to allocate whatever delalloc extent currently backs offset
+	 * and put the result into iomap.  Allocate in a loop because it may
+	 * take several attempts to allocate real blocks for a contiguous
+	 * delalloc extent if free space is sufficiently fragmented.
+	 */
+	do {
+		error = xfs_bmapi_convert_one_delalloc(ip, whichfork, offset,
+				iomap, seq);
+		if (error)
+			return error;
+	} while (iomap->offset + iomap->length <= offset);
+
+	return 0;
+}
+
 int
 xfs_bmapi_remap(
 	struct xfs_trans	*tp,
@@ -4822,32 +4868,18 @@ error0:
 * ores == 1). The number of stolen blocks is returned. The availability and
 * subsequent accounting of stolen blocks is the responsibility of the caller.
 */
-static xfs_filblks_t
+static void
 xfs_bmap_split_indlen(
 	xfs_filblks_t		ores,		/* original res. */
 	xfs_filblks_t		*indlen1,	/* ext1 worst indlen */
-	xfs_filblks_t		*indlen2,	/* ext2 worst indlen */
-	xfs_filblks_t		avail)		/* stealable blocks */
+	xfs_filblks_t		*indlen2)	/* ext2 worst indlen */
 {
 	xfs_filblks_t		len1 = *indlen1;
 	xfs_filblks_t		len2 = *indlen2;
 	xfs_filblks_t		nres = len1 + len2; /* new total res. */
-	xfs_filblks_t		stolen = 0;
 	xfs_filblks_t		resfactor;
 
	/*
-	 * Steal as many blocks as we can to try and satisfy the worst case
-	 * indlen for both new extents.
-	 */
-	if (ores < nres && avail)
-		stolen = XFS_FILBLKS_MIN(nres - ores, avail);
-	ores += stolen;
-
-	/* nothing else to do if we've satisfied the new reservation */
-	if (ores >= nres)
-		return stolen;
-
-	/*
	 * We can't meet the total required reservation for the two extents.
	 * Calculate the percent of the overall shortage between both extents
	 * and apply this percentage to each of the requested indlen values.
@@ -4891,11 +4923,9 @@ xfs_bmap_split_indlen(
 
 	*indlen1 = len1;
 	*indlen2 = len2;
-
-	return stolen;
 }
 
-int
+void
 xfs_bmap_del_extent_delay(
 	struct xfs_inode	*ip,
 	int			whichfork,
@@ -4908,9 +4938,9 @@ xfs_bmap_del_extent_delay(
 	struct xfs_bmbt_irec	new;
 	int64_t			da_old, da_new, da_diff = 0;
 	xfs_fileoff_t		del_endoff, got_endoff;
-	xfs_filblks_t		got_indlen, new_indlen, stolen;
+	xfs_filblks_t		got_indlen, new_indlen, stolen = 0;
 	uint32_t		state = xfs_bmap_fork_to_state(whichfork);
-	int			error = 0;
+	uint64_t		fdblocks;
 	bool			isrt;
 
 	XFS_STATS_INC(mp, xs_del_exlist);
@@ -4925,18 +4955,12 @@ xfs_bmap_del_extent_delay(
 	ASSERT(got->br_startoff <= del->br_startoff);
 	ASSERT(got_endoff >= del_endoff);
 
-	if (isrt)
-		xfs_mod_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount));
-
	/*
	 * Update the inode delalloc counter now and wait to update the
	 * sb counters as we might have to borrow some blocks for the
	 * indirect block accounting.
	 */
-	ASSERT(!isrt);
-	error = xfs_quota_unreserve_blkres(ip, del->br_blockcount);
-	if (error)
-		return error;
+	xfs_quota_unreserve_blkres(ip, del->br_blockcount);
 	ip->i_delayed_blks -= del->br_blockcount;
 
 	if (got->br_startoff == del->br_startoff)
@@ -4990,8 +5014,24 @@ xfs_bmap_del_extent_delay(
 		new_indlen = xfs_bmap_worst_indlen(ip, new.br_blockcount);
 
 		WARN_ON_ONCE(!got_indlen || !new_indlen);
-		stolen = xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen,
-						       del->br_blockcount);
+		/*
+		 * Steal as many blocks as we can to try and satisfy the worst
+		 * case indlen for both new extents.
+		 *
+		 * However, we can't just steal reservations from the data
+		 * blocks if this is an RT inode as the data and metadata
+		 * blocks come from different pools.  We'll have to live with
+		 * under-filled indirect reservation in this case.
+		 */
+		da_new = got_indlen + new_indlen;
+		if (da_new > da_old && !isrt) {
+			stolen = XFS_FILBLKS_MIN(da_new - da_old,
+					del->br_blockcount);
+			da_old += stolen;
+		}
+		if (da_new > da_old)
+			xfs_bmap_split_indlen(da_old, &got_indlen,
+					&new_indlen);
+		da_new = got_indlen + new_indlen;
 
 		got->br_startblock = nullstartblock((int)got_indlen);
 
@@ -5003,20 +5043,21 @@ xfs_bmap_del_extent_delay(
 		xfs_iext_next(ifp, icur);
 		xfs_iext_insert(ip, icur, &new, state);
 
-		da_new = got_indlen + new_indlen - stolen;
 		del->br_blockcount -= stolen;
 		break;
 	}
 
 	ASSERT(da_old >= da_new);
 	da_diff = da_old - da_new;
-	if (!isrt)
-		da_diff += del->br_blockcount;
-	if (da_diff) {
-		xfs_mod_fdblocks(mp, da_diff, false);
-		xfs_mod_delalloc(mp, -da_diff);
-	}
-	return error;
+	fdblocks = da_diff;
+
+	if (isrt)
+		xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount));
+	else
+		fdblocks += del->br_blockcount;
+
+	xfs_add_fdblocks(mp, fdblocks);
+	xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff);
 }
 
 void
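xfs_bmap_split_indlen() now only performs the pro-rata shrink; the block stealing moved to the caller, as seen just above. A standalone worked example of the proportional split with invented numbers:

```c
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t ores = 10;		/* indlen blocks actually reserved */
	uint64_t len1 = 8, len2 = 4;	/* worst-case needs; total 12 > 10 */
	uint64_t nres = len1 + len2;

	/* Shortage is applied pro rata via a whole-percent resfactor. */
	uint64_t resfactor = (ores * 100) / nres;	/* 83 */
	len1 = (len1 * resfactor) / 100;		/* 8 -> 6 */
	len2 = (len2 * resfactor) / 100;		/* 4 -> 3 */
	/* The kernel then hands the rounding slack (10 - 9 = 1) to one side. */

	printf("len1=%llu len2=%llu\n",
			(unsigned long long)len1, (unsigned long long)len2);
	return 0;
}
```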
+ */ + da_new = got_indlen + new_indlen; + if (da_new > da_old && !isrt) { + stolen = XFS_FILBLKS_MIN(da_new - da_old, + del->br_blockcount); + da_old += stolen; + } + if (da_new > da_old) + xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen); + da_new = got_indlen + new_indlen; got->br_startblock = nullstartblock((int)got_indlen); @@ -5003,20 +5043,21 @@ xfs_bmap_del_extent_delay( xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, &new, state); - da_new = got_indlen + new_indlen - stolen; del->br_blockcount -= stolen; break; } ASSERT(da_old >= da_new); da_diff = da_old - da_new; - if (!isrt) - da_diff += del->br_blockcount; - if (da_diff) { - xfs_mod_fdblocks(mp, da_diff, false); - xfs_mod_delalloc(mp, -da_diff); - } - return error; + fdblocks = da_diff; + + if (isrt) + xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount)); + else + fdblocks += del->br_blockcount; + + xfs_add_fdblocks(mp, fdblocks); + xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff); } void @@ -5107,8 +5148,7 @@ xfs_bmap_del_extent_real( { xfs_fsblock_t del_endblock=0; /* first block past del */ xfs_fileoff_t del_endoff; /* first offset past del */ - int do_fx; /* free extent at end of routine */ - int error; /* error return value */ + int error = 0; /* error return value */ struct xfs_bmbt_irec got; /* current extent entry */ xfs_fileoff_t got_endoff; /* first offset past got */ int i; /* temp state */ @@ -5151,20 +5191,10 @@ xfs_bmap_del_extent_real( return -ENOSPC; *logflagsp = XFS_ILOG_CORE; - if (xfs_ifork_is_realtime(ip, whichfork)) { - if (!(bflags & XFS_BMAPI_REMAP)) { - error = xfs_rtfree_blocks(tp, del->br_startblock, - del->br_blockcount); - if (error) - return error; - } - - do_fx = 0; + if (xfs_ifork_is_realtime(ip, whichfork)) qfield = XFS_TRANS_DQ_RTBCOUNT; - } else { - do_fx = 1; + else qfield = XFS_TRANS_DQ_BCOUNT; - } nblks = del->br_blockcount; del_endblock = del->br_startblock + del->br_blockcount; @@ -5312,18 +5342,29 @@ xfs_bmap_del_extent_real( /* * If we need to, add to list of extents to delete. */ - if (do_fx && !(bflags & XFS_BMAPI_REMAP)) { + if (!(bflags & XFS_BMAPI_REMAP)) { if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { xfs_refcount_decrease_extent(tp, del); + } else if (xfs_ifork_is_realtime(ip, whichfork)) { + /* + * Ensure the bitmap and summary inodes are locked + * and joined to the transaction before modifying them. + */ + if (!(tp->t_flags & XFS_TRANS_RTBITMAP_LOCKED)) { + tp->t_flags |= XFS_TRANS_RTBITMAP_LOCKED; + xfs_rtbitmap_lock(tp, mp); + } + error = xfs_rtfree_blocks(tp, del->br_startblock, + del->br_blockcount); } else { error = xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, XFS_AG_RESV_NONE, ((bflags & XFS_BMAPI_NODISCARD) || del->br_state == XFS_EXT_UNWRITTEN)); - if (error) - return error; } + if (error) + return error; } /* @@ -5414,16 +5455,6 @@ __xfs_bunmapi( } else cur = NULL; - if (isrt) { - /* - * Synchronize by locking the bitmap inode. 
- */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); - xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); - xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); - } - extno = 0; while (end != (xfs_fileoff_t)-1 && end >= start && (nexts == 0 || extno < nexts)) { @@ -5584,18 +5615,16 @@ __xfs_bunmapi( delete: if (wasdel) { - error = xfs_bmap_del_extent_delay(ip, whichfork, &icur, - &got, &del); + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del); } else { error = xfs_bmap_del_extent_real(ip, tp, &icur, cur, &del, &tmp_logflags, whichfork, flags); logflags |= tmp_logflags; + if (error) + goto error0; } - if (error) - goto error0; - end = del.br_startoff - 1; nodelete: /* diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index f7662595309d..667b0c2b33d1 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -158,7 +158,7 @@ static inline bool xfs_bmap_is_real_extent(const struct xfs_bmbt_irec *irec) * Return true if the extent is a real, allocated extent, or false if it is a * delayed allocation, and unwritten extent or a hole. */ -static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec) +static inline bool xfs_bmap_is_written_extent(const struct xfs_bmbt_irec *irec) { return xfs_bmap_is_real_extent(irec) && irec->br_state != XFS_EXT_UNWRITTEN; @@ -176,9 +176,16 @@ int xfs_bmap_longest_free_extent(struct xfs_perag *pag, void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp); -int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); +int xfs_bmap_add_attrfork(struct xfs_trans *tp, struct xfs_inode *ip, + int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork); +int xfs_bmap_local_to_extents(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_extlen_t total, int *logflagsp, int whichfork, + void (*init_fn)(struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_inode *ip, struct xfs_ifork *ifp, + void *priv), + void *priv); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); @@ -195,7 +202,7 @@ int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done); -int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, +void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); void xfs_bmap_del_extent_cow(struct xfs_inode *ip, diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 718d071bb21a..16a529a88780 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -252,6 +252,51 @@ xfs_da3_node_verify( return NULL; } +xfs_failaddr_t +xfs_da3_node_header_check( + struct xfs_buf *bp, + xfs_ino_t owner) +{ + struct xfs_mount *mp = bp->b_mount; + + if (xfs_has_crc(mp)) { + struct xfs_da3_blkinfo *hdr3 = bp->b_addr; + + if (hdr3->hdr.magic != cpu_to_be16(XFS_DA3_NODE_MAGIC)) + return __this_address; + + if (be64_to_cpu(hdr3->owner) != owner) + return __this_address; + } + + return NULL; +} + +xfs_failaddr_t +xfs_da3_header_check( + struct xfs_buf *bp, + xfs_ino_t owner) +{ + struct 
xfs_mount *mp = bp->b_mount; + struct xfs_da_blkinfo *hdr = bp->b_addr; + + if (!xfs_has_crc(mp)) + return NULL; + + switch (hdr->magic) { + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + return xfs_attr3_leaf_header_check(bp, owner); + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + return xfs_da3_node_header_check(bp, owner); + case cpu_to_be16(XFS_DIR3_LEAF1_MAGIC): + case cpu_to_be16(XFS_DIR3_LEAFN_MAGIC): + return xfs_dir3_leaf_header_check(bp, owner); + } + + ASSERT(0); + return NULL; +} + static void xfs_da3_node_write_verify( struct xfs_buf *bp) @@ -486,7 +531,7 @@ xfs_da3_node_create( memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr)); ichdr.magic = XFS_DA3_NODE_MAGIC; hdr3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); - hdr3->info.owner = cpu_to_be64(args->dp->i_ino); + hdr3->info.owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { ichdr.magic = XFS_DA_NODE_MAGIC; @@ -1199,6 +1244,7 @@ xfs_da3_root_join( struct xfs_da3_icnode_hdr oldroothdr; int error; struct xfs_inode *dp = state->args->dp; + xfs_failaddr_t fa; trace_xfs_da_root_join(state->args); @@ -1225,6 +1271,13 @@ xfs_da3_root_join( error = xfs_da3_node_read(args->trans, dp, child, &bp, args->whichfork); if (error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level); /* @@ -1259,6 +1312,7 @@ xfs_da3_node_toosmall( struct xfs_da_blkinfo *info; xfs_dablk_t blkno; struct xfs_buf *bp; + xfs_failaddr_t fa; struct xfs_da3_icnode_hdr nodehdr; int count; int forward; @@ -1333,6 +1387,13 @@ xfs_da3_node_toosmall( state->args->whichfork); if (error) return error; + fa = xfs_da3_node_header_check(bp, state->args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(state->args->trans, bp); + xfs_da_mark_sick(state->args); + return -EFSCORRUPTED; + } node = bp->b_addr; xfs_da3_node_hdr_from_disk(dp->i_mount, &thdr, node); @@ -1591,6 +1652,7 @@ xfs_da3_node_lookup_int( struct xfs_da_node_entry *btree; struct xfs_da3_icnode_hdr nodehdr; struct xfs_da_args *args; + xfs_failaddr_t fa; xfs_dablk_t blkno; xfs_dahash_t hashval; xfs_dahash_t btreehashval; @@ -1629,6 +1691,12 @@ xfs_da3_node_lookup_int( if (magic == XFS_ATTR_LEAF_MAGIC || magic == XFS_ATTR3_LEAF_MAGIC) { + fa = xfs_attr3_leaf_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } blk->magic = XFS_ATTR_LEAF_MAGIC; blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); break; @@ -1636,6 +1704,12 @@ xfs_da3_node_lookup_int( if (magic == XFS_DIR2_LEAFN_MAGIC || magic == XFS_DIR3_LEAFN_MAGIC) { + fa = xfs_dir3_leaf_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } blk->magic = XFS_DIR2_LEAFN_MAGIC; blk->hashval = xfs_dir2_leaf_lasthash(args->dp, blk->bp, NULL); @@ -1648,6 +1722,13 @@ xfs_da3_node_lookup_int( return -EFSCORRUPTED; } + fa = xfs_da3_node_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } + blk->magic = XFS_DA_NODE_MAGIC; /* @@ -1820,6 +1901,7 @@ xfs_da3_blk_link( struct xfs_da_blkinfo *tmp_info; struct xfs_da_args *args; struct xfs_buf *bp; + xfs_failaddr_t fa; int before = 0; int error; struct xfs_inode *dp = state->args->dp; @@ -1863,6 +1945,13 @@ 
xfs_da3_blk_link( &bp, args->whichfork); if (error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == old_info->magic); @@ -1884,6 +1973,13 @@ xfs_da3_blk_link( &bp, args->whichfork); if (error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == old_info->magic); @@ -1913,6 +2009,7 @@ xfs_da3_blk_unlink( struct xfs_da_blkinfo *tmp_info; struct xfs_da_args *args; struct xfs_buf *bp; + xfs_failaddr_t fa; int error; /* @@ -1943,6 +2040,13 @@ xfs_da3_blk_unlink( &bp, args->whichfork); if (error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == save_info->magic); @@ -1960,6 +2064,13 @@ xfs_da3_blk_unlink( &bp, args->whichfork); if (error) return error; + fa = xfs_da3_header_check(bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_trans_brelse(args->trans, bp); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } ASSERT(bp != NULL); tmp_info = bp->b_addr; ASSERT(tmp_info->magic == save_info->magic); @@ -1996,6 +2107,7 @@ xfs_da3_path_shift( struct xfs_da_node_entry *btree; struct xfs_da3_icnode_hdr nodehdr; struct xfs_buf *bp; + xfs_failaddr_t fa; xfs_dablk_t blkno = 0; int level; int error; @@ -2074,6 +2186,12 @@ xfs_da3_path_shift( switch (be16_to_cpu(info->magic)) { case XFS_DA_NODE_MAGIC: case XFS_DA3_NODE_MAGIC: + fa = xfs_da3_node_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } blk->magic = XFS_DA_NODE_MAGIC; xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, bp->b_addr); @@ -2087,6 +2205,12 @@ xfs_da3_path_shift( break; case XFS_ATTR_LEAF_MAGIC: case XFS_ATTR3_LEAF_MAGIC: + fa = xfs_attr3_leaf_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } blk->magic = XFS_ATTR_LEAF_MAGIC; ASSERT(level == path->active-1); blk->index = 0; @@ -2094,6 +2218,12 @@ xfs_da3_path_shift( break; case XFS_DIR2_LEAFN_MAGIC: case XFS_DIR3_LEAFN_MAGIC: + fa = xfs_dir3_leaf_header_check(blk->bp, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(blk->bp, fa); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } blk->magic = XFS_DIR2_LEAFN_MAGIC; ASSERT(level == path->active-1); blk->index = 0; @@ -2167,8 +2297,8 @@ xfs_da_grow_inode_int( struct xfs_inode *dp = args->dp; int w = args->whichfork; xfs_rfsblock_t nblks = dp->i_nblocks; - struct xfs_bmbt_irec map, *mapp; - int nmap, error, got, i, mapi; + struct xfs_bmbt_irec map, *mapp = &map; + int nmap, error, got, i, mapi = 1; /* * Find a spot in the file space to put the new block. 
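The owner-check sequence added throughout this file (read the buffer, verify the header owner, mark the buffer corrupt and the metadata sick on mismatch) is identical at every call site. A minimal sketch of that pattern as a standalone helper, purely illustrative and not part of this patch (xfs_da_mark_sick() is left to the caller since it needs the full xfs_da_args):

	static int
	xfs_da3_node_read_checked(
		struct xfs_trans	*tp,
		struct xfs_inode	*dp,
		xfs_dablk_t		bno,
		xfs_ino_t		owner,
		int			whichfork,
		struct xfs_buf		**bpp)
	{
		xfs_failaddr_t		fa;
		int			error;

		error = xfs_da3_node_read(tp, dp, bno, bpp, whichfork);
		if (error)
			return error;

		/* The verifier cannot know the owner; check it here. */
		fa = xfs_da3_header_check(*bpp, owner);
		if (fa) {
			__xfs_buf_mark_corrupt(*bpp, fa);
			xfs_trans_brelse(tp, *bpp);
			*bpp = NULL;
			return -EFSCORRUPTED;
		}
		return 0;
	}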
@@ -2184,14 +2314,7 @@ xfs_da_grow_inode_int( error = xfs_bmapi_write(tp, dp, *bno, count, xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, args->total, &map, &nmap); - if (error) - return error; - - ASSERT(nmap <= 1); - if (nmap == 1) { - mapp = &map; - mapi = 1; - } else if (nmap == 0 && count > 1) { + if (error == -ENOSPC && count > 1) { xfs_fileoff_t b; int c; @@ -2209,16 +2332,13 @@ xfs_da_grow_inode_int( args->total, &mapp[mapi], &nmap); if (error) goto out_free_map; - if (nmap < 1) - break; mapi += nmap; b = mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount; } - } else { - mapi = 0; - mapp = NULL; } + if (error) + goto out_free_map; /* * Count the blocks we got, make sure it matches the total. @@ -2290,6 +2410,7 @@ xfs_da3_swap_lastblock( struct xfs_buf *last_buf; struct xfs_buf *sib_buf; struct xfs_buf *par_buf; + xfs_failaddr_t fa; xfs_dahash_t dead_hash; xfs_fileoff_t lastoff; xfs_dablk_t dead_blkno; @@ -2326,6 +2447,14 @@ xfs_da3_swap_lastblock( error = xfs_da3_node_read(tp, dp, last_blkno, &last_buf, w); if (error) return error; + fa = xfs_da3_header_check(last_buf, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(last_buf, fa); + xfs_trans_brelse(tp, last_buf); + xfs_da_mark_sick(args); + return -EFSCORRUPTED; + } + /* * Copy the last block into the dead buffer and log it. */ @@ -2364,6 +2493,13 @@ xfs_da3_swap_lastblock( error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w); if (error) goto done; + fa = xfs_da3_header_check(sib_buf, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(sib_buf, fa); + xfs_da_mark_sick(args); + error = -EFSCORRUPTED; + goto done; + } sib_info = sib_buf->b_addr; if (XFS_IS_CORRUPT(mp, be32_to_cpu(sib_info->forw) != last_blkno || @@ -2385,6 +2521,13 @@ xfs_da3_swap_lastblock( error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w); if (error) goto done; + fa = xfs_da3_header_check(sib_buf, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(sib_buf, fa); + xfs_da_mark_sick(args); + error = -EFSCORRUPTED; + goto done; + } sib_info = sib_buf->b_addr; if (XFS_IS_CORRUPT(mp, be32_to_cpu(sib_info->back) != last_blkno || @@ -2408,6 +2551,13 @@ xfs_da3_swap_lastblock( error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w); if (error) goto done; + fa = xfs_da3_node_header_check(par_buf, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(par_buf, fa); + xfs_da_mark_sick(args); + error = -EFSCORRUPTED; + goto done; + } par_node = par_buf->b_addr; xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node); if (XFS_IS_CORRUPT(mp, @@ -2457,6 +2607,13 @@ xfs_da3_swap_lastblock( error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w); if (error) goto done; + fa = xfs_da3_node_header_check(par_buf, args->owner); + if (fa) { + __xfs_buf_mark_corrupt(par_buf, fa); + xfs_da_mark_sick(args); + error = -EFSCORRUPTED; + goto done; + } par_node = par_buf->b_addr; xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node); if (XFS_IS_CORRUPT(mp, par_hdr.level != level)) { diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 706baf36e175..354d5d65043e 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -54,17 +54,24 @@ enum xfs_dacmp { */ typedef struct xfs_da_args { struct xfs_da_geometry *geo; /* da block geometry */ - const uint8_t *name; /* string (maybe not NULL terminated) */ - int namelen; /* length of string (maybe no NULL) */ - uint8_t filetype; /* filetype of inode for directories */ + const uint8_t *name; /* string (maybe not NULL terminated) */ + const uint8_t *new_name; /* new attr 
name */ void *value; /* set of bytes (maybe contain NULLs) */ - int valuelen; /* length of value */ - unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */ - unsigned int attr_flags; /* XATTR_{CREATE,REPLACE} */ - xfs_dahash_t hashval; /* hash value of name */ - xfs_ino_t inumber; /* input/output inode number */ + void *new_value; /* new xattr value (may contain NULLs) */ struct xfs_inode *dp; /* directory inode to manipulate */ struct xfs_trans *trans; /* current trans (changes over time) */ + + xfs_ino_t inumber; /* input/output inode number */ + xfs_ino_t owner; /* inode that owns the dir/attr data */ + + int valuelen; /* length of value */ + int new_valuelen; /* length of new_value */ + uint8_t filetype; /* filetype of inode for directories */ + uint8_t op_flags; /* operation flags */ + uint8_t attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */ + short namelen; /* length of string (maybe no NULL) */ + short new_namelen; /* length of new attr name */ + xfs_dahash_t hashval; /* hash value of name */ xfs_extlen_t total; /* total blocks needed, for 1st bmap */ int whichfork; /* data or attribute fork */ xfs_dablk_t blkno; /* blkno of attr leaf of interest */ @@ -77,7 +84,6 @@ typedef struct xfs_da_args { xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ int rmtvaluelen2; /* remote attr value length in bytes */ - uint32_t op_flags; /* operation flags */ enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; @@ -89,10 +95,8 @@ typedef struct xfs_da_args { #define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */ #define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */ #define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */ -#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */ -#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */ -#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */ -#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */ +#define XFS_DA_OP_RECOVERY (1u << 5) /* Log recovery operation */ +#define XFS_DA_OP_LOGGED (1u << 6) /* Use intent items to track op */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ @@ -100,8 +104,6 @@ typedef struct xfs_da_args { { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ - { XFS_DA_OP_NOTIME, "NOTIME" }, \ - { XFS_DA_OP_REMOVE, "REMOVE" }, \ { XFS_DA_OP_RECOVERY, "RECOVERY" }, \ { XFS_DA_OP_LOGGED, "LOGGED" } @@ -235,6 +237,8 @@ void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp, struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from); void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp, struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from); +xfs_failaddr_t xfs_da3_header_check(struct xfs_buf *bp, xfs_ino_t owner); +xfs_failaddr_t xfs_da3_node_header_check(struct xfs_buf *bp, xfs_ino_t owner); extern struct kmem_cache *xfs_da_state_cache; diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 060e5c96b70f..86de99e2f757 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -714,12 +714,30 @@ struct xfs_attr3_leafblock { #define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ #define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ +#define XFS_ATTR_PARENT_BIT 3 /* parent pointer attrs */ #define XFS_ATTR_INCOMPLETE_BIT 7 /* attr 
in middle of create/delete */ #define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT) #define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT) #define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT) +#define XFS_ATTR_PARENT (1u << XFS_ATTR_PARENT_BIT) #define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT) -#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) + +#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | \ + XFS_ATTR_SECURE | \ + XFS_ATTR_PARENT) + +/* Private attr namespaces not exposed to userspace */ +#define XFS_ATTR_PRIVATE_NSP_MASK (XFS_ATTR_PARENT) + +#define XFS_ATTR_ONDISK_MASK (XFS_ATTR_NSP_ONDISK_MASK | \ + XFS_ATTR_LOCAL | \ + XFS_ATTR_INCOMPLETE) + +#define XFS_ATTR_NAMESPACE_STR \ + { XFS_ATTR_LOCAL, "local" }, \ + { XFS_ATTR_ROOT, "root" }, \ + { XFS_ATTR_SECURE, "secure" }, \ + { XFS_ATTR_PARENT, "parent" } /* * Alignment for namelist and valuelist entries (since they are mixed @@ -862,9 +880,7 @@ struct xfs_attr3_rmt_hdr { #define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc) -#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \ - ((bufsize) - (xfs_has_crc((mp)) ? \ - sizeof(struct xfs_attr3_rmt_hdr) : 0)) +unsigned int xfs_attr3_rmt_buf_space(struct xfs_mount *mp); /* Number of bytes in a directory block. */ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp) @@ -875,4 +891,17 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp) xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp, struct xfs_da3_blkinfo *hdr3); +/* + * Parent pointer attribute format definition + * + * The xattr name contains the dirent name. + * The xattr value encodes the parent inode number and generation to ease + * opening parents by handle. + * The xattr hashval is xfs_dir2_namehash() ^ p_ino + */ +struct xfs_parent_rec { + __be64 p_ino; + __be32 p_gen; +} __packed; + #endif /* __XFS_DA_FORMAT_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index c13276095cc0..4a078e07e1a0 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -27,6 +27,7 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_trans_priv.h" +#include "xfs_exchmaps.h" static struct kmem_cache *xfs_defer_pending_cache; @@ -1091,7 +1092,11 @@ xfs_defer_ops_continue( ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); /* Lock the captured resources to the new transaction. 
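* Inodes are relocked in ascending inode number order: with parent * pointers a rename can capture up to five inodes here, so the list is * sorted before locking to keep the lock ordering consistent.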
*/ - if (dfc->dfc_held.dr_inos == 2) + if (dfc->dfc_held.dr_inos > 2) { + xfs_sort_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos); + xfs_lock_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos, + XFS_ILOCK_EXCL); + } else if (dfc->dfc_held.dr_inos == 2) xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL, dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL); else if (dfc->dfc_held.dr_inos == 1) @@ -1176,6 +1181,10 @@ xfs_defer_init_item_caches(void) error = xfs_attr_intent_init_cache(); if (error) goto err; + error = xfs_exchmaps_intent_init_cache(); + if (error) + goto err; + return 0; err: xfs_defer_destroy_item_caches(); @@ -1186,6 +1195,7 @@ err: void xfs_defer_destroy_item_caches(void) { + xfs_exchmaps_intent_destroy_cache(); xfs_attr_intent_destroy_cache(); xfs_extfree_intent_destroy_cache(); xfs_bmap_intent_destroy_cache(); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 18a9fb92dde8..8b338031e487 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -72,12 +72,18 @@ extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; extern const struct xfs_defer_op_type xfs_attr_defer_type; - +extern const struct xfs_defer_op_type xfs_exchmaps_defer_type; /* * Deferred operation item relogging limits. */ -#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */ + +/* + * Rename w/ parent pointers can require up to 5 inodes with deferred ops to + * be joined to the transaction: src_dp, target_dp, src_ip, target_ip, and wip. + * These inodes are locked in sorted order by their inode numbers + */ +#define XFS_DEFER_OPS_NR_INODES 5 #define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */ /* Resources that must be held across a transaction roll. 
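* (At most XFS_DEFER_OPS_NR_INODES inodes and XFS_DEFER_OPS_NR_BUFS * buffers may be captured, per the limits defined above.)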
*/ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 4821519efad4..457f9a38f850 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -250,11 +250,68 @@ xfs_dir_init( args->geo = dp->i_mount->m_dir_geo; args->dp = dp; args->trans = tp; + args->owner = dp->i_ino; error = xfs_dir2_sf_create(args, pdp->i_ino); kfree(args); return error; } +enum xfs_dir2_fmt +xfs_dir2_format( + struct xfs_da_args *args, + int *error) +{ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = mp->m_dir_geo; + xfs_fileoff_t eof; + + xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + *error = 0; + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) + return XFS_DIR2_FMT_SF; + + *error = xfs_bmap_last_offset(dp, &eof, XFS_DATA_FORK); + if (*error) + return XFS_DIR2_FMT_ERROR; + + if (eof == XFS_B_TO_FSB(mp, geo->blksize)) { + if (XFS_IS_CORRUPT(mp, dp->i_disk_size != geo->blksize)) { + xfs_da_mark_sick(args); + *error = -EFSCORRUPTED; + return XFS_DIR2_FMT_ERROR; + } + return XFS_DIR2_FMT_BLOCK; + } + if (eof == geo->leafblk + geo->fsbcount) + return XFS_DIR2_FMT_LEAF; + return XFS_DIR2_FMT_NODE; +} + +int +xfs_dir_createname_args( + struct xfs_da_args *args) +{ + int error; + + if (!args->inumber) + args->op_flags |= XFS_DA_OP_JUSTCHECK; + + switch (xfs_dir2_format(args, &error)) { + case XFS_DIR2_FMT_SF: + return xfs_dir2_sf_addname(args); + case XFS_DIR2_FMT_BLOCK: + return xfs_dir2_block_addname(args); + case XFS_DIR2_FMT_LEAF: + return xfs_dir2_leaf_addname(args); + case XFS_DIR2_FMT_NODE: + return xfs_dir2_node_addname(args); + default: + return error; + } +} + /* * Enter a name in a directory, or check for available space. * If inum is 0, only the available space test is performed. @@ -269,7 +326,6 @@ xfs_dir_createname( { struct xfs_da_args *args; int rval; - bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -295,31 +351,9 @@ xfs_dir_createname( args->whichfork = XFS_DATA_FORK; args->trans = tp; args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; - if (!inum) - args->op_flags |= XFS_DA_OP_JUSTCHECK; - - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - rval = xfs_dir2_sf_addname(args); - goto out_free; - } - - rval = xfs_dir2_isblock(args, &v); - if (rval) - goto out_free; - if (v) { - rval = xfs_dir2_block_addname(args); - goto out_free; - } + args->owner = dp->i_ino; - rval = xfs_dir2_isleaf(args, &v); - if (rval) - goto out_free; - if (v) - rval = xfs_dir2_leaf_addname(args); - else - rval = xfs_dir2_node_addname(args); - -out_free: + rval = xfs_dir_createname_args(args); kfree(args); return rval; } @@ -350,6 +384,34 @@ xfs_dir_cilookup_result( return -EEXIST; } +int +xfs_dir_lookup_args( + struct xfs_da_args *args) +{ + int error; + + switch (xfs_dir2_format(args, &error)) { + case XFS_DIR2_FMT_SF: + error = xfs_dir2_sf_lookup(args); + break; + case XFS_DIR2_FMT_BLOCK: + error = xfs_dir2_block_lookup(args); + break; + case XFS_DIR2_FMT_LEAF: + error = xfs_dir2_leaf_lookup(args); + break; + case XFS_DIR2_FMT_NODE: + error = xfs_dir2_node_lookup(args); + break; + default: + break; + } + + if (error != -EEXIST) + return error; + return 0; +} + /* * Lookup a name in a directory, give back the inode number. 
* If ci_name is not NULL, returns the actual name in ci_name if it differs @@ -366,7 +428,6 @@ xfs_dir_lookup( { struct xfs_da_args *args; int rval; - bool v; int lock_mode; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -383,34 +444,12 @@ xfs_dir_lookup( args->whichfork = XFS_DATA_FORK; args->trans = tp; args->op_flags = XFS_DA_OP_OKNOENT; + args->owner = dp->i_ino; if (ci_name) args->op_flags |= XFS_DA_OP_CILOOKUP; lock_mode = xfs_ilock_data_map_shared(dp); - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - rval = xfs_dir2_sf_lookup(args); - goto out_check_rval; - } - - rval = xfs_dir2_isblock(args, &v); - if (rval) - goto out_free; - if (v) { - rval = xfs_dir2_block_lookup(args); - goto out_check_rval; - } - - rval = xfs_dir2_isleaf(args, &v); - if (rval) - goto out_free; - if (v) - rval = xfs_dir2_leaf_lookup(args); - else - rval = xfs_dir2_node_lookup(args); - -out_check_rval: - if (rval == -EEXIST) - rval = 0; + rval = xfs_dir_lookup_args(args); if (!rval) { *inum = args->inumber; if (ci_name) { @@ -418,12 +457,31 @@ out_check_rval: ci_name->len = args->valuelen; } } -out_free: xfs_iunlock(dp, lock_mode); kfree(args); return rval; } +int +xfs_dir_removename_args( + struct xfs_da_args *args) +{ + int error; + + switch (xfs_dir2_format(args, &error)) { + case XFS_DIR2_FMT_SF: + return xfs_dir2_sf_removename(args); + case XFS_DIR2_FMT_BLOCK: + return xfs_dir2_block_removename(args); + case XFS_DIR2_FMT_LEAF: + return xfs_dir2_leaf_removename(args); + case XFS_DIR2_FMT_NODE: + return xfs_dir2_node_removename(args); + default: + return error; + } +} + /* * Remove an entry from a directory. */ @@ -431,13 +489,12 @@ int xfs_dir_removename( struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, + const struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; int rval; - bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); @@ -456,30 +513,30 @@ xfs_dir_removename( args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; + args->owner = dp->i_ino; + rval = xfs_dir_removename_args(args); + kfree(args); + return rval; +} - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - rval = xfs_dir2_sf_removename(args); - goto out_free; - } +int +xfs_dir_replace_args( + struct xfs_da_args *args) +{ + int error; - rval = xfs_dir2_isblock(args, &v); - if (rval) - goto out_free; - if (v) { - rval = xfs_dir2_block_removename(args); - goto out_free; + switch (xfs_dir2_format(args, &error)) { + case XFS_DIR2_FMT_SF: + return xfs_dir2_sf_replace(args); + case XFS_DIR2_FMT_BLOCK: + return xfs_dir2_block_replace(args); + case XFS_DIR2_FMT_LEAF: + return xfs_dir2_leaf_replace(args); + case XFS_DIR2_FMT_NODE: + return xfs_dir2_node_replace(args); + default: + return error; } - - rval = xfs_dir2_isleaf(args, &v); - if (rval) - goto out_free; - if (v) - rval = xfs_dir2_leaf_removename(args); - else - rval = xfs_dir2_node_removename(args); -out_free: - kfree(args); - return rval; } /* @@ -495,7 +552,6 @@ xfs_dir_replace( { struct xfs_da_args *args; int rval; - bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -517,28 +573,8 @@ xfs_dir_replace( args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; - - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - rval = xfs_dir2_sf_replace(args); - goto out_free; - } - - rval = xfs_dir2_isblock(args, &v); - if (rval) - goto out_free; - if (v) { - rval = xfs_dir2_block_replace(args); - goto out_free; - } - - rval = xfs_dir2_isleaf(args, &v); - if 
(rval) - goto out_free; - if (v) - rval = xfs_dir2_leaf_replace(args); - else - rval = xfs_dir2_node_replace(args); -out_free: + args->owner = dp->i_ino; + rval = xfs_dir_replace_args(args); kfree(args); return rval; } @@ -607,57 +643,6 @@ xfs_dir2_grow_inode( } /* - * See if the directory is a single-block form directory. - */ -int -xfs_dir2_isblock( - struct xfs_da_args *args, - bool *isblock) -{ - struct xfs_mount *mp = args->dp->i_mount; - xfs_fileoff_t eof; - int error; - - error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); - if (error) - return error; - - *isblock = false; - if (XFS_FSB_TO_B(mp, eof) != args->geo->blksize) - return 0; - - *isblock = true; - if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) { - xfs_da_mark_sick(args); - return -EFSCORRUPTED; - } - return 0; -} - -/* - * See if the directory is a single-leaf form directory. - */ -int -xfs_dir2_isleaf( - struct xfs_da_args *args, - bool *isleaf) -{ - xfs_fileoff_t eof; - int error; - - error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); - if (error) - return error; - - *isleaf = false; - if (eof != args->geo->leafblk + args->geo->fsbcount) - return 0; - - *isleaf = true; - return 0; -} - -/* * Remove the given block from the directory. * This routine is used for data and free blocks, leaf/node are done * by xfs_da_shrink_inode. diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 8497d041f316..6dbe6e9ecb49 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -36,6 +36,16 @@ xfs_dir2_samename( return !memcmp(n1->name, n2->name, n1->len); } +enum xfs_dir2_fmt { + XFS_DIR2_FMT_SF, + XFS_DIR2_FMT_BLOCK, + XFS_DIR2_FMT_LEAF, + XFS_DIR2_FMT_NODE, + XFS_DIR2_FMT_ERROR, +}; + +enum xfs_dir2_fmt xfs_dir2_format(struct xfs_da_args *args, int *error); + /* * Convert inode mode to directory entry filetype */ @@ -58,7 +68,7 @@ extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t *inum, struct xfs_name *ci_name); extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t ino, + const struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t tot); extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t inum, @@ -66,6 +76,11 @@ extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name); +int xfs_dir_lookup_args(struct xfs_da_args *args); +int xfs_dir_createname_args(struct xfs_da_args *args); +int xfs_dir_removename_args(struct xfs_da_args *args); +int xfs_dir_replace_args(struct xfs_da_args *args); + /* * Direct call from the bmap code, bypassing the generic directory layer. 
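* (xfs_bmap_add_attrfork_local() calls xfs_dir2_sf_to_block() directly * when adding an attr fork leaves no room for a shortform directory in * the inode literal area.)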
*/ @@ -74,8 +89,6 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); /* * Interface routines used by userspace utilities */ -extern int xfs_dir2_isblock(struct xfs_da_args *args, bool *isblock); -extern int xfs_dir2_isleaf(struct xfs_da_args *args, bool *isleaf); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); @@ -101,6 +114,10 @@ extern struct xfs_dir2_data_free *xfs_dir2_data_freefind( extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); +xfs_failaddr_t xfs_dir3_leaf_header_check(struct xfs_buf *bp, xfs_ino_t owner); +xfs_failaddr_t xfs_dir3_data_header_check(struct xfs_buf *bp, xfs_ino_t owner); +xfs_failaddr_t xfs_dir3_block_header_check(struct xfs_buf *bp, xfs_ino_t owner); + extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index a2da007adb46..0f93ed1a4a74 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -115,17 +115,20 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .verify_struct = xfs_dir3_block_verify, }; -static xfs_failaddr_t +xfs_failaddr_t xfs_dir3_block_header_check( - struct xfs_inode *dp, - struct xfs_buf *bp) + struct xfs_buf *bp, + xfs_ino_t owner) { - struct xfs_mount *mp = dp->i_mount; + struct xfs_mount *mp = bp->b_mount; if (xfs_has_crc(mp)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; - if (be64_to_cpu(hdr3->owner) != dp->i_ino) + if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) + return __this_address; + + if (be64_to_cpu(hdr3->owner) != owner) return __this_address; } @@ -136,6 +139,7 @@ int xfs_dir3_block_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, struct xfs_buf **bpp) { struct xfs_mount *mp = dp->i_mount; @@ -148,7 +152,7 @@ xfs_dir3_block_read( return err; /* Check things that we can't do in the verifier. */ - fa = xfs_dir3_block_header_check(dp, *bpp); + fa = xfs_dir3_block_header_check(*bpp, owner); if (fa) { __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); @@ -163,12 +167,13 @@ xfs_dir3_block_read( static void xfs_dir3_block_init( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_buf *bp, - struct xfs_inode *dp) + struct xfs_da_args *args, + struct xfs_buf *bp) { - struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; bp->b_ops = &xfs_dir3_block_buf_ops; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF); @@ -177,7 +182,7 @@ xfs_dir3_block_init( memset(hdr3, 0, sizeof(*hdr3)); hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); - hdr3->owner = cpu_to_be64(dp->i_ino); + hdr3->owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); return; @@ -382,7 +387,7 @@ xfs_dir2_block_addname( tp = args->trans; /* Read the (one and only) directory block into bp. 
*/ - error = xfs_dir3_block_read(tp, dp, &bp); + error = xfs_dir3_block_read(tp, dp, args->owner, &bp); if (error) return error; @@ -697,7 +702,7 @@ xfs_dir2_block_lookup_int( dp = args->dp; tp = args->trans; - error = xfs_dir3_block_read(tp, dp, &bp); + error = xfs_dir3_block_read(tp, dp, args->owner, &bp); if (error) return error; @@ -981,7 +986,8 @@ xfs_dir2_leaf_to_block( * Read the data block if we don't already have it, give up if it fails. */ if (!dbp) { - error = xfs_dir3_data_read(tp, dp, args->geo->datablk, 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + args->geo->datablk, 0, &dbp); if (error) return error; } @@ -1009,7 +1015,7 @@ xfs_dir2_leaf_to_block( /* * Start converting it to block form. */ - xfs_dir3_block_init(mp, tp, dbp, dp); + xfs_dir3_block_init(args, dbp); needlog = 1; needscan = 0; @@ -1129,7 +1135,7 @@ xfs_dir2_sf_to_block( error = xfs_dir3_data_init(args, blkno, &bp); if (error) goto out_free; - xfs_dir3_block_init(mp, tp, bp, dp); + xfs_dir3_block_init(args, bp); hdr = bp->b_addr; /* @@ -1169,7 +1175,7 @@ xfs_dir2_sf_to_block( * Create entry for . */ dep = bp->b_addr + offset; - dep->inumber = cpu_to_be64(dp->i_ino); + dep->inumber = cpu_to_be64(args->owner); dep->namelen = 1; dep->name[0] = '.'; xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR); diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 7a6d965bea71..ea0b9628df18 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -395,17 +395,20 @@ static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { .verify_write = xfs_dir3_data_write_verify, }; -static xfs_failaddr_t +xfs_failaddr_t xfs_dir3_data_header_check( - struct xfs_inode *dp, - struct xfs_buf *bp) + struct xfs_buf *bp, + xfs_ino_t owner) { - struct xfs_mount *mp = dp->i_mount; + struct xfs_mount *mp = bp->b_mount; if (xfs_has_crc(mp)) { struct xfs_dir3_data_hdr *hdr3 = bp->b_addr; - if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) + if (hdr3->hdr.magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) + return __this_address; + + if (be64_to_cpu(hdr3->hdr.owner) != owner) return __this_address; } @@ -416,6 +419,7 @@ int xfs_dir3_data_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp) @@ -429,7 +433,7 @@ xfs_dir3_data_read( return err; /* Check things that we can't do in the verifier. 
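* The buffer verifier has no inode context, so the expected owner can * only be compared against the on-disk header once the read completes.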
*/ - fa = xfs_dir3_data_header_check(dp, *bpp); + fa = xfs_dir3_data_header_check(*bpp, owner); if (fa) { __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); @@ -725,7 +729,7 @@ xfs_dir3_data_init( memset(hdr3, 0, sizeof(*hdr3)); hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); - hdr3->owner = cpu_to_be64(dp->i_ino); + hdr3->owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); } else diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 08dda5ce9d91..71c2f22a3f6e 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -208,6 +208,29 @@ xfs_dir3_leaf_verify( return xfs_dir3_leaf_check_int(mp, &leafhdr, bp->b_addr, true); } +xfs_failaddr_t +xfs_dir3_leaf_header_check( + struct xfs_buf *bp, + xfs_ino_t owner) +{ + struct xfs_mount *mp = bp->b_mount; + + if (xfs_has_crc(mp)) { + struct xfs_dir3_leaf *hdr3 = bp->b_addr; + + if (hdr3->hdr.info.hdr.magic != + cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) && + hdr3->hdr.info.hdr.magic != + cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) + return __this_address; + + if (be64_to_cpu(hdr3->hdr.info.owner) != owner) + return __this_address; + } + + return NULL; +} + static void xfs_dir3_leaf_read_verify( struct xfs_buf *bp) @@ -271,32 +294,60 @@ int xfs_dir3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp) { + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); - if (!err && tp && *bpp) + if (err || !(*bpp)) + return err; + + fa = xfs_dir3_leaf_header_check(*bpp, owner); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); + return -EFSCORRUPTED; + } + + if (tp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); - return err; + return 0; } int xfs_dir3_leafn_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp) { + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); - if (!err && tp && *bpp) + if (err || !(*bpp)) + return err; + + fa = xfs_dir3_leaf_header_check(*bpp, owner); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); + return -EFSCORRUPTED; + } + + if (tp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); - return err; + return 0; } /* @@ -304,12 +355,12 @@ xfs_dir3_leafn_read( */ static void xfs_dir3_leaf_init( - struct xfs_mount *mp, - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp, - xfs_ino_t owner, uint16_t type) { + struct xfs_mount *mp = args->dp->i_mount; + struct xfs_trans *tp = args->trans; struct xfs_dir2_leaf *leaf = bp->b_addr; ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC); @@ -323,7 +374,7 @@ xfs_dir3_leaf_init( ? 
cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); leaf3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); - leaf3->info.owner = cpu_to_be64(owner); + leaf3->info.owner = cpu_to_be64(args->owner); uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { memset(leaf, 0, sizeof(*leaf)); @@ -356,7 +407,6 @@ xfs_dir3_leaf_get_buf( { struct xfs_inode *dp = args->dp; struct xfs_trans *tp = args->trans; - struct xfs_mount *mp = dp->i_mount; struct xfs_buf *bp; int error; @@ -369,7 +419,7 @@ xfs_dir3_leaf_get_buf( if (error) return error; - xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic); + xfs_dir3_leaf_init(args, bp, magic); xfs_dir3_leaf_log_header(args, bp); if (magic == XFS_DIR2_LEAF1_MAGIC) xfs_dir3_leaf_log_tail(args, bp); @@ -647,7 +697,8 @@ xfs_dir2_leaf_addname( trace_xfs_dir2_leaf_addname(args); - error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp); + error = xfs_dir3_leaf_read(tp, dp, args->owner, args->geo->leafblk, + &lbp); if (error) return error; @@ -834,9 +885,9 @@ xfs_dir2_leaf_addname( * Already had space in some data block. * Just read that one in. */ - error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(args->geo, use_block), - 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(args->geo, use_block), 0, + &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1238,7 +1289,8 @@ xfs_dir2_leaf_lookup_int( tp = args->trans; mp = dp->i_mount; - error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp); + error = xfs_dir3_leaf_read(tp, dp, args->owner, args->geo->leafblk, + &lbp); if (error) return error; @@ -1276,9 +1328,9 @@ xfs_dir2_leaf_lookup_int( if (newdb != curdb) { if (dbp) xfs_trans_brelse(tp, dbp); - error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(args->geo, newdb), - 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(args->geo, newdb), 0, + &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1318,9 +1370,9 @@ xfs_dir2_leaf_lookup_int( ASSERT(cidb != -1); if (cidb != curdb) { xfs_trans_brelse(tp, dbp); - error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(args->geo, cidb), - 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(args->geo, cidb), 0, + &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1614,7 +1666,8 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. */ - error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(geo, db), 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(geo, db), 0, &dbp); if (error) return error; @@ -1753,7 +1806,8 @@ xfs_dir2_node_to_leaf( /* * Read the freespace block. */ - error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp); + error = xfs_dir2_free_read(tp, dp, args->owner, args->geo->freeblk, + &fbp); if (error) return error; xfs_dir2_free_hdr_from_disk(mp, &freehdr, fbp->b_addr); diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index be0b8834028c..fe8d4fa13128 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -175,11 +175,11 @@ const struct xfs_buf_ops xfs_dir3_free_buf_ops = { /* Everything ok in the free block header? 
*/ static xfs_failaddr_t xfs_dir3_free_header_check( - struct xfs_inode *dp, - xfs_dablk_t fbno, - struct xfs_buf *bp) + struct xfs_buf *bp, + xfs_ino_t owner, + xfs_dablk_t fbno) { - struct xfs_mount *mp = dp->i_mount; + struct xfs_mount *mp = bp->b_mount; int maxbests = mp->m_dir_geo->free_max_bests; unsigned int firstdb; @@ -195,7 +195,7 @@ xfs_dir3_free_header_check( return __this_address; if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused)) return __this_address; - if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) + if (be64_to_cpu(hdr3->hdr.owner) != owner) return __this_address; } else { struct xfs_dir2_free_hdr *hdr = bp->b_addr; @@ -214,6 +214,7 @@ static int __xfs_dir3_free_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, unsigned int flags, struct xfs_buf **bpp) @@ -227,7 +228,7 @@ __xfs_dir3_free_read( return err; /* Check things that we can't do in the verifier. */ - fa = xfs_dir3_free_header_check(dp, fbno, *bpp); + fa = xfs_dir3_free_header_check(*bpp, owner, fbno); if (fa) { __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); @@ -299,20 +300,23 @@ int xfs_dir2_free_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir3_free_read(tp, dp, fbno, 0, bpp); + return __xfs_dir3_free_read(tp, dp, owner, fbno, 0, bpp); } static int xfs_dir2_free_try_read( struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir3_free_read(tp, dp, fbno, XFS_DABUF_MAP_HOLE_OK, bpp); + return __xfs_dir3_free_read(tp, dp, owner, fbno, XFS_DABUF_MAP_HOLE_OK, + bpp); } static int @@ -349,7 +353,7 @@ xfs_dir3_free_get_buf( hdr.magic = XFS_DIR3_FREE_MAGIC; hdr3->hdr.blkno = cpu_to_be64(xfs_buf_daddr(bp)); - hdr3->hdr.owner = cpu_to_be64(dp->i_ino); + hdr3->hdr.owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid); } else hdr.magic = XFS_DIR2_FREE_MAGIC; @@ -717,7 +721,7 @@ xfs_dir2_leafn_lookup_for_addname( if (curbp) xfs_trans_brelse(tp, curbp); - error = xfs_dir2_free_read(tp, dp, + error = xfs_dir2_free_read(tp, dp, args->owner, xfs_dir2_db_to_da(args->geo, newfdb), &curbp); @@ -863,7 +867,7 @@ xfs_dir2_leafn_lookup_for_entry( ASSERT(state->extravalid); curbp = state->extrablk.bp; } else { - error = xfs_dir3_data_read(tp, dp, + error = xfs_dir3_data_read(tp, dp, args->owner, xfs_dir2_db_to_da(args->geo, newdb), 0, &curbp); @@ -1356,8 +1360,8 @@ xfs_dir2_leafn_remove( * read in the free block. */ fdb = xfs_dir2_db_to_fdb(geo, db); - error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(geo, fdb), - &fbp); + error = xfs_dir2_free_read(tp, dp, args->owner, + xfs_dir2_db_to_da(geo, fdb), &fbp); if (error) return error; free = fbp->b_addr; @@ -1562,7 +1566,8 @@ xfs_dir2_leafn_toosmall( /* * Read the sibling leaf block. */ - error = xfs_dir3_leafn_read(state->args->trans, dp, blkno, &bp); + error = xfs_dir3_leafn_read(state->args->trans, dp, + state->args->owner, blkno, &bp); if (error) return error; @@ -1715,7 +1720,7 @@ xfs_dir2_node_add_datablk( * that was just allocated. */ fbno = xfs_dir2_db_to_fdb(args->geo, *dbno); - error = xfs_dir2_free_try_read(tp, dp, + error = xfs_dir2_free_try_read(tp, dp, args->owner, xfs_dir2_db_to_da(args->geo, fbno), &fbp); if (error) return error; @@ -1862,7 +1867,7 @@ xfs_dir2_node_find_freeblk( * so this might not succeed. This should be really rare, so * there's no reason to avoid it. 
*/ - error = xfs_dir2_free_try_read(tp, dp, + error = xfs_dir2_free_try_read(tp, dp, args->owner, xfs_dir2_db_to_da(args->geo, fbno), &fbp); if (error) @@ -1948,9 +1953,8 @@ xfs_dir2_node_addname_int( &freehdr, &findex); } else { /* Read the data block in. */ - error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(args->geo, dbno), - 0, &dbp); + error = xfs_dir3_data_read(tp, dp, args->owner, + xfs_dir2_db_to_da(args->geo, dbno), 0, &dbp); } if (error) return error; @@ -2302,7 +2306,7 @@ xfs_dir2_node_trim_free( /* * Read the freespace block. */ - error = xfs_dir2_free_try_read(tp, dp, fo, &bp); + error = xfs_dir2_free_try_read(tp, dp, args->owner, fo, &bp); if (error) return error; /* diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 1db2e60ba827..3befb32509fa 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -50,8 +50,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args, /* xfs_dir2_block.c */ -extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_buf **bpp); +int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, struct xfs_buf **bpp); extern int xfs_dir2_block_addname(struct xfs_da_args *args); extern int xfs_dir2_block_lookup(struct xfs_da_args *args); extern int xfs_dir2_block_removename(struct xfs_da_args *args); @@ -78,7 +78,8 @@ extern void xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); extern xfs_failaddr_t __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp); + xfs_ino_t owner, xfs_dablk_t bno, unsigned int flags, + struct xfs_buf **bpp); int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno, unsigned int flags); @@ -95,9 +96,9 @@ void xfs_dir2_leaf_hdr_from_disk(struct xfs_mount *mp, void xfs_dir2_leaf_hdr_to_disk(struct xfs_mount *mp, struct xfs_dir2_leaf *to, struct xfs_dir3_icleaf_hdr *from); int xfs_dir3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t fbno, struct xfs_buf **bpp); + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp); int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t fbno, struct xfs_buf **bpp); + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, struct xfs_buf *dbp); extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); @@ -154,8 +155,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args); extern int xfs_dir2_node_replace(struct xfs_da_args *args); extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, int *rvalp); -extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t fbno, struct xfs_buf **bpp); +int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_ino_t owner, xfs_dablk_t fbno, struct xfs_buf **bpp); /* xfs_dir2_sf.c */ xfs_ino_t xfs_dir2_sf_get_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 01a9e86b3037..7002d7676a78 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -63,7 +63,8 @@ #define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41 #define XFS_ERRTAG_WB_DELAY_MS 42 #define XFS_ERRTAG_WRITE_DELAY_MS 43 -#define XFS_ERRTAG_MAX 44 +#define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44 +#define XFS_ERRTAG_MAX 45 /* * Random factors for above 
tags, 1 means always, 2 means 1/2 time, etc. @@ -111,5 +112,6 @@ #define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 #define XFS_RANDOM_WB_DELAY_MS 3000 #define XFS_RANDOM_WRITE_DELAY_MS 3000 +#define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_exchmaps.c b/fs/xfs/libxfs/xfs_exchmaps.c new file mode 100644 index 000000000000..2021396651de --- /dev/null +++ b/fs/xfs/libxfs/xfs_exchmaps.c @@ -0,0 +1,1235 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_bmap.h" +#include "xfs_icache.h" +#include "xfs_quota.h" +#include "xfs_exchmaps.h" +#include "xfs_trace.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_error.h" +#include "xfs_errortag.h" +#include "xfs_health.h" +#include "xfs_exchmaps_item.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr.h" +#include "xfs_dir2_priv.h" +#include "xfs_dir2.h" +#include "xfs_symlink_remote.h" + +struct kmem_cache *xfs_exchmaps_intent_cache; + +/* bmbt mappings adjacent to a pair of records. */ +struct xfs_exchmaps_adjacent { + struct xfs_bmbt_irec left1; + struct xfs_bmbt_irec right1; + struct xfs_bmbt_irec left2; + struct xfs_bmbt_irec right2; +}; + +#define ADJACENT_INIT { \ + .left1 = { .br_startblock = HOLESTARTBLOCK }, \ + .right1 = { .br_startblock = HOLESTARTBLOCK }, \ + .left2 = { .br_startblock = HOLESTARTBLOCK }, \ + .right2 = { .br_startblock = HOLESTARTBLOCK }, \ +} + +/* Information to reset reflink flag / CoW fork state after an exchange. */ + +/* + * If the reflink flag is set on either inode, make sure it has an incore CoW + * fork, since all reflink inodes must have them. If there's a CoW fork and it + * has mappings in it, make sure the inodes are tagged appropriately so that + * speculative preallocations can be GC'd if we run low on space. + */ +static inline void +xfs_exchmaps_ensure_cowfork( + struct xfs_inode *ip) +{ + struct xfs_ifork *cfork; + + if (xfs_is_reflink_inode(ip)) + xfs_ifork_init_cow(ip); + + cfork = xfs_ifork_ptr(ip, XFS_COW_FORK); + if (!cfork) + return; + if (cfork->if_bytes > 0) + xfs_inode_set_cowblocks_tag(ip); + else + xfs_inode_clear_cowblocks_tag(ip); +} + +/* + * Adjust the on-disk inode size upwards if needed so that we never add + * mappings into the file past EOF. This is crucial so that log recovery won't + * get confused by the sudden appearance of post-eof mappings. + */ +STATIC void +xfs_exchmaps_update_size( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + xfs_fsize_t new_isize) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_fsize_t len; + + if (new_isize < 0) + return; + + len = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount), + new_isize); + + if (len <= ip->i_disk_size) + return; + + trace_xfs_exchmaps_update_inode_size(ip, len); + + ip->i_disk_size = len; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* Advance the incore state tracking after exchanging a mapping. 
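+ * Both file offsets advance by the length just exchanged and + * xmi_blockcount shrinks by the same amount; the exchange is complete + * once it reaches zero.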
*/ +static inline void +xmi_advance( + struct xfs_exchmaps_intent *xmi, + const struct xfs_bmbt_irec *irec) +{ + xmi->xmi_startoff1 += irec->br_blockcount; + xmi->xmi_startoff2 += irec->br_blockcount; + xmi->xmi_blockcount -= irec->br_blockcount; +} + +/* Do we still have more mappings to exchange? */ +static inline bool +xmi_has_more_exchange_work(const struct xfs_exchmaps_intent *xmi) +{ + return xmi->xmi_blockcount > 0; +} + +/* Do we have post-operation cleanups to perform? */ +static inline bool +xmi_has_postop_work(const struct xfs_exchmaps_intent *xmi) +{ + return xmi->xmi_flags & (XFS_EXCHMAPS_CLEAR_INO1_REFLINK | + XFS_EXCHMAPS_CLEAR_INO2_REFLINK | + __XFS_EXCHMAPS_INO2_SHORTFORM); +} + +/* Check all mappings to make sure we can actually exchange them. */ +int +xfs_exchmaps_check_forks( + struct xfs_mount *mp, + const struct xfs_exchmaps_req *req) +{ + struct xfs_ifork *ifp1, *ifp2; + int whichfork = xfs_exchmaps_reqfork(req); + + /* No fork? */ + ifp1 = xfs_ifork_ptr(req->ip1, whichfork); + ifp2 = xfs_ifork_ptr(req->ip2, whichfork); + if (!ifp1 || !ifp2) + return -EINVAL; + + /* We don't know how to exchange local format forks. */ + if (ifp1->if_format == XFS_DINODE_FMT_LOCAL || + ifp2->if_format == XFS_DINODE_FMT_LOCAL) + return -EINVAL; + + return 0; +} + +#ifdef CONFIG_XFS_QUOTA +/* Log the actual updates to the quota accounting. */ +static inline void +xfs_exchmaps_update_quota( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi, + struct xfs_bmbt_irec *irec1, + struct xfs_bmbt_irec *irec2) +{ + int64_t ip1_delta = 0, ip2_delta = 0; + unsigned int qflag; + + qflag = XFS_IS_REALTIME_INODE(xmi->xmi_ip1) ? XFS_TRANS_DQ_RTBCOUNT : + XFS_TRANS_DQ_BCOUNT; + + if (xfs_bmap_is_real_extent(irec1)) { + ip1_delta -= irec1->br_blockcount; + ip2_delta += irec1->br_blockcount; + } + + if (xfs_bmap_is_real_extent(irec2)) { + ip1_delta += irec2->br_blockcount; + ip2_delta -= irec2->br_blockcount; + } + + xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip1, qflag, ip1_delta); + xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip2, qflag, ip2_delta); +} +#else +# define xfs_exchmaps_update_quota(tp, xmi, irec1, irec2) ((void)0) +#endif + +/* Decide if we want to skip this mapping from file1. */ +static inline bool +xfs_exchmaps_can_skip_mapping( + struct xfs_exchmaps_intent *xmi, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = xmi->xmi_ip1->i_mount; + + /* Do not skip this mapping if the caller did not tell us to. */ + if (!(xmi->xmi_flags & XFS_EXCHMAPS_INO1_WRITTEN)) + return false; + + /* Do not skip mapped, written mappings. */ + if (xfs_bmap_is_written_extent(irec)) + return false; + + /* + * The mapping is unwritten or a hole. It cannot be a delalloc + * reservation because we already excluded those. It cannot be an + * unwritten extent with dirty page cache because we flushed the page + * cache. For files where the allocation unit is 1FSB (files on the + * data dev, rt files if the extent size is 1FSB), we can safely + * skip this mapping. + */ + if (!xfs_inode_has_bigrtalloc(xmi->xmi_ip1)) + return true; + + /* + * For a realtime file with a multi-fsb allocation unit, the decision + * is trickier because we can only swap full allocation units. + * Unwritten mappings can appear in the middle of an rtx if the rtx is + * partially written, but they can also appear for preallocations. + * + * If the mapping is a hole, skip it entirely. Holes should align with + * rtx boundaries. + */ + if (!xfs_bmap_is_real_extent(irec)) + return true; + + /* + * All mappings below this point are unwritten. 
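+ * + * (Worked example for the first rule below, assuming an rtx size of 8 + * blocks: an unwritten mapping at startoff 5 with length 12 is trimmed + * to the three blocks 5-7 so that it ends on the rtx boundary at block + * 8, and that short piece is exchanged rather than skipped.)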
+	 *
+	 * - If the beginning is not aligned to an rtx, trim the end of the
+	 *   mapping so that it does not cross an rtx boundary, and swap it.
+	 *
+	 * - If both ends are aligned to an rtx, skip the entire mapping.
+	 */
+	if (!isaligned_64(irec->br_startoff, mp->m_sb.sb_rextsize)) {
+		xfs_fileoff_t	new_end;
+
+		new_end = roundup_64(irec->br_startoff, mp->m_sb.sb_rextsize);
+		irec->br_blockcount = min(irec->br_blockcount,
+					  new_end - irec->br_startoff);
+		return false;
+	}
+	if (isaligned_64(irec->br_blockcount, mp->m_sb.sb_rextsize))
+		return true;
+
+	/*
+	 * All mappings below this point are unwritten, start on an rtx
+	 * boundary, and do not end on an rtx boundary.
+	 *
+	 * - If the mapping is longer than one rtx, trim the end of the mapping
+	 *   down to an rtx boundary and skip it.
+	 *
+	 * - The mapping is shorter than one rtx.  Swap it.
+	 */
+	if (irec->br_blockcount > mp->m_sb.sb_rextsize) {
+		xfs_fileoff_t	new_end;
+
+		new_end = rounddown_64(irec->br_startoff + irec->br_blockcount,
+				       mp->m_sb.sb_rextsize);
+		irec->br_blockcount = new_end - irec->br_startoff;
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Walk forward through the file ranges in @xmi until we find two different
+ * mappings to exchange.  If there is work to do, return the mappings;
+ * otherwise we've reached the end of the range and xmi_blockcount will be
+ * zero.
+ *
+ * If the walk skips over a pair of mappings to the same storage, save them as
+ * the left records in @adj (if provided) so that the simulation phase can
+ * avoid an extra lookup.
+ */
+static int
+xfs_exchmaps_find_mappings(
+	struct xfs_exchmaps_intent	*xmi,
+	struct xfs_bmbt_irec		*irec1,
+	struct xfs_bmbt_irec		*irec2,
+	struct xfs_exchmaps_adjacent	*adj)
+{
+	int				nimaps;
+	int				bmap_flags;
+	int				error;
+
+	bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_whichfork(xmi));
+
+	for (; xmi_has_more_exchange_work(xmi); xmi_advance(xmi, irec1)) {
+		/* Read mapping from the first file */
+		nimaps = 1;
+		error = xfs_bmapi_read(xmi->xmi_ip1, xmi->xmi_startoff1,
+				xmi->xmi_blockcount, irec1, &nimaps,
+				bmap_flags);
+		if (error)
+			return error;
+		if (nimaps != 1 ||
+		    irec1->br_startblock == DELAYSTARTBLOCK ||
+		    irec1->br_startoff != xmi->xmi_startoff1) {
+			/*
+			 * We should never get no mapping or a delalloc mapping
+			 * or something that doesn't match what we asked for,
+			 * since the caller flushed both inodes and we hold the
+			 * ILOCKs for both inodes.
+			 */
+			ASSERT(0);
+			return -EINVAL;
+		}
+
+		if (xfs_exchmaps_can_skip_mapping(xmi, irec1)) {
+			trace_xfs_exchmaps_mapping1_skip(xmi->xmi_ip1, irec1);
+			continue;
+		}
+
+		/* Read mapping from the second file */
+		nimaps = 1;
+		error = xfs_bmapi_read(xmi->xmi_ip2, xmi->xmi_startoff2,
+				irec1->br_blockcount, irec2, &nimaps,
+				bmap_flags);
+		if (error)
+			return error;
+		if (nimaps != 1 ||
+		    irec2->br_startblock == DELAYSTARTBLOCK ||
+		    irec2->br_startoff != xmi->xmi_startoff2) {
+			/*
+			 * We should never get no mapping or a delalloc mapping
+			 * or something that doesn't match what we asked for,
+			 * since the caller flushed both inodes and we hold the
+			 * ILOCKs for both inodes.
+			 */
+			ASSERT(0);
+			return -EINVAL;
+		}
+
+		/*
+		 * We can only exchange as many blocks as the smaller of the
+		 * two mappings.
+		 */
+		irec1->br_blockcount = min(irec1->br_blockcount,
+					   irec2->br_blockcount);
+
+		trace_xfs_exchmaps_mapping1(xmi->xmi_ip1, irec1);
+		trace_xfs_exchmaps_mapping2(xmi->xmi_ip2, irec2);
+
+		/* We found something to exchange, so return it. */
+		if (irec1->br_startblock != irec2->br_startblock)
+			return 0;
+
+		/*
+		 * Two mappings pointing to the same physical block must not
+		 * have different states; that's filesystem corruption.  Move
+		 * on to the next mapping if they're both holes or both point
+		 * to the same physical space extent.
+		 */
+		if (irec1->br_state != irec2->br_state) {
+			xfs_bmap_mark_sick(xmi->xmi_ip1,
+					xfs_exchmaps_whichfork(xmi));
+			xfs_bmap_mark_sick(xmi->xmi_ip2,
+					xfs_exchmaps_whichfork(xmi));
+			return -EFSCORRUPTED;
+		}
+
+		/*
+		 * Save the mappings if we're estimating work and skipping
+		 * these identical mappings.
+		 */
+		if (adj) {
+			memcpy(&adj->left1, irec1, sizeof(*irec1));
+			memcpy(&adj->left2, irec2, sizeof(*irec2));
+		}
+	}
+
+	return 0;
+}
+
+/* Exchange these two mappings. */
+static void
+xfs_exchmaps_one_step(
+	struct xfs_trans		*tp,
+	struct xfs_exchmaps_intent	*xmi,
+	struct xfs_bmbt_irec		*irec1,
+	struct xfs_bmbt_irec		*irec2)
+{
+	int				whichfork = xfs_exchmaps_whichfork(xmi);
+
+	xfs_exchmaps_update_quota(tp, xmi, irec1, irec2);
+
+	/* Remove both mappings. */
+	xfs_bmap_unmap_extent(tp, xmi->xmi_ip1, whichfork, irec1);
+	xfs_bmap_unmap_extent(tp, xmi->xmi_ip2, whichfork, irec2);
+
+	/*
+	 * Re-add both mappings.  We exchange the file offsets between the two
+	 * maps and add the opposite map, which has the effect of filling the
+	 * logical offsets we just unmapped, but with the physical mapping
+	 * information exchanged.
+	 */
+	swap(irec1->br_startoff, irec2->br_startoff);
+	xfs_bmap_map_extent(tp, xmi->xmi_ip1, whichfork, irec2);
+	xfs_bmap_map_extent(tp, xmi->xmi_ip2, whichfork, irec1);
+
+	/* Make sure we're not adding mappings past EOF. */
+	if (whichfork == XFS_DATA_FORK) {
+		xfs_exchmaps_update_size(tp, xmi->xmi_ip1, irec2,
+				xmi->xmi_isize1);
+		xfs_exchmaps_update_size(tp, xmi->xmi_ip2, irec1,
+				xmi->xmi_isize2);
+	}
+
+	/*
+	 * Advance our cursor and exit.  The caller (either defer ops or log
+	 * recovery) will log the XMD item, and if the remaining
+	 * xmi_blockcount is nonzero, it will log a new XMI item for the
+	 * remainder and call us back.
+	 */
+	xmi_advance(xmi, irec1);
+}
+
+/* Convert inode2's leaf attr fork back to shortform, if possible. */
+STATIC int
+xfs_exchmaps_attr_to_sf(
+	struct xfs_trans		*tp,
+	struct xfs_exchmaps_intent	*xmi)
+{
+	struct xfs_da_args	args = {
+		.dp		= xmi->xmi_ip2,
+		.geo		= tp->t_mountp->m_attr_geo,
+		.whichfork	= XFS_ATTR_FORK,
+		.trans		= tp,
+		.owner		= xmi->xmi_ip2->i_ino,
+	};
+	struct xfs_buf		*bp;
+	int			forkoff;
+	int			error;
+
+	if (!xfs_attr_is_leaf(xmi->xmi_ip2))
+		return 0;
+
+	error = xfs_attr3_leaf_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, 0,
+			&bp);
+	if (error)
+		return error;
+
+	forkoff = xfs_attr_shortform_allfit(bp, xmi->xmi_ip2);
+	if (forkoff == 0)
+		return 0;
+
+	return xfs_attr3_leaf_to_shortform(bp, &args, forkoff);
+}
+
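+/*
+ * (Editorial sketch, not part of the patch: the exchange step above reduces
+ * to an unmap/remap with the file offsets swapped.  Assuming hypothetical
+ * mappings irec1 = {startoff 0, startblock 100, blockcount 4} in file1 and
+ * irec2 = {startoff 0, startblock 200, blockcount 4} in file2, the sequence
+ *
+ *	unmap(ip1, irec1);  unmap(ip2, irec2);
+ *	swap(irec1.br_startoff, irec2.br_startoff);
+ *	map(ip1, irec2);    map(ip2, irec1);
+ *
+ * leaves file1's range backed by block 200 and file2's by block 100, with a
+ * quota delta of zero because both mappings are real and equal in length.)
+ */
+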
+/* Convert inode2's block dir fork back to shortform, if possible. */
+STATIC int
+xfs_exchmaps_dir_to_sf(
+	struct xfs_trans		*tp,
+	struct xfs_exchmaps_intent	*xmi)
+{
+	struct xfs_da_args	args = {
+		.dp		= xmi->xmi_ip2,
+		.geo		= tp->t_mountp->m_dir_geo,
+		.whichfork	= XFS_DATA_FORK,
+		.trans		= tp,
+		.owner		= xmi->xmi_ip2->i_ino,
+	};
+	struct xfs_dir2_sf_hdr	sfh;
+	struct xfs_buf		*bp;
+	int			size;
+	int			error = 0;
+
+	if (xfs_dir2_format(&args, &error) != XFS_DIR2_FMT_BLOCK)
+		return error;
+
+	error = xfs_dir3_block_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, &bp);
+	if (error)
+		return error;
+
+	size = xfs_dir2_block_sfsize(xmi->xmi_ip2, bp->b_addr, &sfh);
+	if (size > xfs_inode_data_fork_size(xmi->xmi_ip2))
+		return 0;
+
+	return xfs_dir2_block_to_sf(&args, bp, size, &sfh);
+}
+
+/* Convert inode2's remote symlink target back to shortform, if possible. */
+STATIC int
+xfs_exchmaps_link_to_sf(
+	struct xfs_trans		*tp,
+	struct xfs_exchmaps_intent	*xmi)
+{
+	struct xfs_inode	*ip = xmi->xmi_ip2;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+	char			*buf;
+	int			error;
+
+	if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
+	    ip->i_disk_size > xfs_inode_data_fork_size(ip))
+		return 0;
+
+	/* Read the current symlink target into a buffer. */
+	buf = kmalloc(ip->i_disk_size + 1,
+			GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
+	if (!buf) {
+		ASSERT(0);
+		return -ENOMEM;
+	}
+
+	error = xfs_symlink_remote_read(ip, buf);
+	if (error)
+		goto free;
+
+	/* Remove the blocks. */
+	error = xfs_symlink_remote_truncate(tp, ip);
+	if (error)
+		goto free;
+
+	/* Convert fork to local format and log our changes. */
+	xfs_idestroy_fork(ifp);
+	ifp->if_bytes = 0;
+	ifp->if_format = XFS_DINODE_FMT_LOCAL;
+	xfs_init_local_fork(ip, XFS_DATA_FORK, buf, ip->i_disk_size);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
+free:
+	kfree(buf);
+	return error;
+}
+
+/* Clear the reflink flag after an exchange. */
+static inline void
+xfs_exchmaps_clear_reflink(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
+{
+	trace_xfs_reflink_unset_inode_flag(ip);
+
+	ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/* Finish whatever work might come after an exchange operation. */
+static int
+xfs_exchmaps_do_postop_work(
+	struct xfs_trans		*tp,
+	struct xfs_exchmaps_intent	*xmi)
+{
+	if (xmi->xmi_flags & __XFS_EXCHMAPS_INO2_SHORTFORM) {
+		int			error = 0;
+
+		if (xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)
+			error = xfs_exchmaps_attr_to_sf(tp, xmi);
+		else if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode))
+			error = xfs_exchmaps_dir_to_sf(tp, xmi);
+		else if (S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode))
+			error = xfs_exchmaps_link_to_sf(tp, xmi);
+		xmi->xmi_flags &= ~__XFS_EXCHMAPS_INO2_SHORTFORM;
+		if (error)
+			return error;
+	}
+
+	if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO1_REFLINK) {
+		xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip1);
+		xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO1_REFLINK;
+	}
+
+	if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO2_REFLINK) {
+		xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip2);
+		xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO2_REFLINK;
+	}
+
+	return 0;
+}
+
+/* Finish one step in a mapping exchange operation, possibly relogging. */
+int
+xfs_exchmaps_finish_one(
+	struct xfs_trans		*tp,
+	struct xfs_exchmaps_intent	*xmi)
+{
+	struct xfs_bmbt_irec		irec1, irec2;
+	int				error;
+
+	if (xmi_has_more_exchange_work(xmi)) {
+		/*
+		 * If the operation state says that some range of the files
+		 * has not yet been exchanged, look for mappings in that range
+		 * to exchange.  If we find some mappings, exchange them.
+ */ + error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, NULL); + if (error) + return error; + + if (xmi_has_more_exchange_work(xmi)) + xfs_exchmaps_one_step(tp, xmi, &irec1, &irec2); + + /* + * If the caller asked us to exchange the file sizes after the + * exchange and either we just exchanged the last mappings in + * the range or we didn't find anything to exchange, update the + * ondisk file sizes. + */ + if ((xmi->xmi_flags & XFS_EXCHMAPS_SET_SIZES) && + !xmi_has_more_exchange_work(xmi)) { + xmi->xmi_ip1->i_disk_size = xmi->xmi_isize1; + xmi->xmi_ip2->i_disk_size = xmi->xmi_isize2; + + xfs_trans_log_inode(tp, xmi->xmi_ip1, XFS_ILOG_CORE); + xfs_trans_log_inode(tp, xmi->xmi_ip2, XFS_ILOG_CORE); + } + } else if (xmi_has_postop_work(xmi)) { + /* + * Now that we're finished with the exchange operation, + * complete the post-op cleanup work. + */ + error = xfs_exchmaps_do_postop_work(tp, xmi); + if (error) + return error; + } + + if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE)) + return -EIO; + + /* If we still have work to do, ask for a new transaction. */ + if (xmi_has_more_exchange_work(xmi) || xmi_has_postop_work(xmi)) { + trace_xfs_exchmaps_defer(tp->t_mountp, xmi); + return -EAGAIN; + } + + /* + * If we reach here, we've finished all the exchange work and the post + * operation work. The last thing we need to do before returning to + * the caller is to make sure that COW forks are set up correctly. + */ + if (!(xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)) { + xfs_exchmaps_ensure_cowfork(xmi->xmi_ip1); + xfs_exchmaps_ensure_cowfork(xmi->xmi_ip2); + } + + return 0; +} + +/* + * Compute the amount of bmbt blocks we should reserve for each file. In the + * worst case, each exchange will fill a hole with a new mapping, which could + * result in a btree split every time we add a new leaf block. + */ +static inline uint64_t +xfs_exchmaps_bmbt_blocks( + struct xfs_mount *mp, + const struct xfs_exchmaps_req *req) +{ + return howmany_64(req->nr_exchanges, + XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp)) * + XFS_EXTENTADD_SPACE_RES(mp, xfs_exchmaps_reqfork(req)); +} + +/* Compute the space we should reserve for the rmap btree expansions. */ +static inline uint64_t +xfs_exchmaps_rmapbt_blocks( + struct xfs_mount *mp, + const struct xfs_exchmaps_req *req) +{ + if (!xfs_has_rmapbt(mp)) + return 0; + if (XFS_IS_REALTIME_INODE(req->ip1)) + return 0; + + return howmany_64(req->nr_exchanges, + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * + XFS_RMAPADD_SPACE_RES(mp); +} + +/* Estimate the bmbt and rmapbt overhead required to exchange mappings. */ +int +xfs_exchmaps_estimate_overhead( + struct xfs_exchmaps_req *req) +{ + struct xfs_mount *mp = req->ip1->i_mount; + xfs_filblks_t bmbt_blocks; + xfs_filblks_t rmapbt_blocks; + xfs_filblks_t resblks = req->resblks; + + /* + * Compute the number of bmbt and rmapbt blocks we might need to handle + * the estimated number of exchanges. + */ + bmbt_blocks = xfs_exchmaps_bmbt_blocks(mp, req); + rmapbt_blocks = xfs_exchmaps_rmapbt_blocks(mp, req); + + trace_xfs_exchmaps_overhead(mp, bmbt_blocks, rmapbt_blocks); + + /* Make sure the change in file block count doesn't overflow. */ + if (check_add_overflow(req->ip1_bcount, bmbt_blocks, &req->ip1_bcount)) + return -EFBIG; + if (check_add_overflow(req->ip2_bcount, bmbt_blocks, &req->ip2_bcount)) + return -EFBIG; + + /* + * Add together the number of blocks we need to handle btree growth, + * then add it to the number of blocks we need to reserve to this + * transaction. 
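+	 *
+	 * (Editorial aside with made-up geometry: if nr_exchanges is 1000
+	 * and a hypothetical XFS_MAX_CONTIG_BMAPS_PER_BLOCK is 125, each
+	 * file needs howmany_64(1000, 125) = 8 units of
+	 * XFS_EXTENTADD_SPACE_RES worth of bmbt blocks; the bmbt term is
+	 * therefore added once per file below, and likewise for the rmapbt
+	 * term.)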
+	 */
+	if (check_add_overflow(resblks, bmbt_blocks, &resblks))
+		return -ENOSPC;
+	if (check_add_overflow(resblks, bmbt_blocks, &resblks))
+		return -ENOSPC;
+	if (check_add_overflow(resblks, rmapbt_blocks, &resblks))
+		return -ENOSPC;
+	if (check_add_overflow(resblks, rmapbt_blocks, &resblks))
+		return -ENOSPC;
+
+	/* Can't actually reserve more than UINT_MAX blocks. */
+	if (req->resblks > UINT_MAX)
+		return -ENOSPC;
+
+	req->resblks = resblks;
+	trace_xfs_exchmaps_final_estimate(req);
+	return 0;
+}
+
+/* Decide if we can merge two real mappings. */
+static inline bool
+xmi_can_merge(
+	const struct xfs_bmbt_irec	*b1,
+	const struct xfs_bmbt_irec	*b2)
+{
+	/* Don't merge holes. */
+	if (b1->br_startblock == HOLESTARTBLOCK ||
+	    b2->br_startblock == HOLESTARTBLOCK)
+		return false;
+
+	/* We don't merge delalloc reservations either. */
+	if (!xfs_bmap_is_real_extent(b1) || !xfs_bmap_is_real_extent(b2))
+		return false;
+
+	if (b1->br_startoff + b1->br_blockcount == b2->br_startoff &&
+	    b1->br_startblock + b1->br_blockcount == b2->br_startblock &&
+	    b1->br_state == b2->br_state &&
+	    b1->br_blockcount + b2->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+		return true;
+
+	return false;
+}
+
+/*
+ * Decide if we can merge three mappings.  The caller must ensure that none
+ * of the three mappings are holes or delalloc reservations.
+ */
+static inline bool
+xmi_can_merge_all(
+	const struct xfs_bmbt_irec	*l,
+	const struct xfs_bmbt_irec	*m,
+	const struct xfs_bmbt_irec	*r)
+{
+	xfs_filblks_t			new_len;
+
+	new_len = l->br_blockcount + m->br_blockcount + r->br_blockcount;
+	return new_len <= XFS_MAX_BMBT_EXTLEN;
+}
+
+#define CLEFT_CONTIG	0x01
+#define CRIGHT_CONTIG	0x02
+#define CHOLE		0x04
+#define CBOTH_CONTIG	(CLEFT_CONTIG | CRIGHT_CONTIG)
+
+#define NLEFT_CONTIG	0x10
+#define NRIGHT_CONTIG	0x20
+#define NHOLE		0x40
+#define NBOTH_CONTIG	(NLEFT_CONTIG | NRIGHT_CONTIG)
+
+/* Estimate the effect of a single exchange on mapping count. */
+static inline int
+xmi_delta_nextents_step(
+	struct xfs_mount		*mp,
+	const struct xfs_bmbt_irec	*left,
+	const struct xfs_bmbt_irec	*curr,
+	const struct xfs_bmbt_irec	*new,
+	const struct xfs_bmbt_irec	*right)
+{
+	bool				lhole, rhole, chole, nhole;
+	unsigned int			state = 0;
+	int				ret = 0;
+
+	lhole = left->br_startblock == HOLESTARTBLOCK;
+	rhole = right->br_startblock == HOLESTARTBLOCK;
+	chole = curr->br_startblock == HOLESTARTBLOCK;
+	nhole = new->br_startblock == HOLESTARTBLOCK;
+
+	if (chole)
+		state |= CHOLE;
+	if (!lhole && !chole && xmi_can_merge(left, curr))
+		state |= CLEFT_CONTIG;
+	if (!rhole && !chole && xmi_can_merge(curr, right))
+		state |= CRIGHT_CONTIG;
+	if ((state & CBOTH_CONTIG) == CBOTH_CONTIG &&
+	    !xmi_can_merge_all(left, curr, right))
+		state &= ~CRIGHT_CONTIG;
+
+	if (nhole)
+		state |= NHOLE;
+	if (!lhole && !nhole && xmi_can_merge(left, new))
+		state |= NLEFT_CONTIG;
+	if (!rhole && !nhole && xmi_can_merge(new, right))
+		state |= NRIGHT_CONTIG;
+	if ((state & NBOTH_CONTIG) == NBOTH_CONTIG &&
+	    !xmi_can_merge_all(left, new, right))
+		state &= ~NRIGHT_CONTIG;
+
+	switch (state & (CLEFT_CONTIG | CRIGHT_CONTIG | CHOLE)) {
+	case CLEFT_CONTIG | CRIGHT_CONTIG:
+		/*
+		 * left/curr/right are the same mapping, so deleting curr
+		 * causes 2 new mappings to be created.
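+		 *
+		 * (Editorial aside: e.g. if one bmbt record covers blocks
+		 * 0-11 and curr is the middle third, replacing curr leaves
+		 * left, new, and right as three separate records where
+		 * there was one, a net gain of 2.)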
+ */ + ret += 2; + break; + case 0: + /* + * curr is not contiguous with any mapping, so we remove curr + * completely + */ + ret--; + break; + case CHOLE: + /* hole, do nothing */ + break; + case CLEFT_CONTIG: + case CRIGHT_CONTIG: + /* trim either left or right, no change */ + break; + } + + switch (state & (NLEFT_CONTIG | NRIGHT_CONTIG | NHOLE)) { + case NLEFT_CONTIG | NRIGHT_CONTIG: + /* + * left/curr/right will become the same mapping, so adding + * curr causes the deletion of right. + */ + ret--; + break; + case 0: + /* new is not contiguous with any mapping */ + ret++; + break; + case NHOLE: + /* hole, do nothing. */ + break; + case NLEFT_CONTIG: + case NRIGHT_CONTIG: + /* new is absorbed into left or right, no change */ + break; + } + + trace_xfs_exchmaps_delta_nextents_step(mp, left, curr, new, right, ret, + state); + return ret; +} + +/* Make sure we don't overflow the extent (mapping) counters. */ +static inline int +xmi_ensure_delta_nextents( + struct xfs_exchmaps_req *req, + struct xfs_inode *ip, + int64_t delta) +{ + struct xfs_mount *mp = ip->i_mount; + int whichfork = xfs_exchmaps_reqfork(req); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + uint64_t new_nextents; + xfs_extnum_t max_nextents; + + if (delta < 0) + return 0; + + /* + * It's always an error if the delta causes integer overflow. delta + * needs an explicit cast here to avoid warnings about implicit casts + * coded into the overflow check. + */ + if (check_add_overflow(ifp->if_nextents, (uint64_t)delta, + &new_nextents)) + return -EFBIG; + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && + new_nextents > 10) + return -EFBIG; + + /* + * We always promote both inodes to have large extent counts if the + * superblock feature is enabled, so we only need to check against the + * theoretical maximum. + */ + max_nextents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp), + whichfork); + if (new_nextents > max_nextents) + return -EFBIG; + + return 0; +} + +/* Find the next mapping after irec. */ +static inline int +xmi_next( + struct xfs_inode *ip, + int bmap_flags, + const struct xfs_bmbt_irec *irec, + struct xfs_bmbt_irec *nrec) +{ + xfs_fileoff_t off; + xfs_filblks_t blockcount; + int nimaps = 1; + int error; + + off = irec->br_startoff + irec->br_blockcount; + blockcount = XFS_MAX_FILEOFF - off; + error = xfs_bmapi_read(ip, off, blockcount, nrec, &nimaps, bmap_flags); + if (error) + return error; + if (nrec->br_startblock == DELAYSTARTBLOCK || + nrec->br_startoff != off) { + /* + * If we don't get the mapping we want, return a zero-length + * mapping, which our estimator function will pretend is a hole. + * We shouldn't get delalloc reservations. + */ + nrec->br_startblock = HOLESTARTBLOCK; + } + + return 0; +} + +int __init +xfs_exchmaps_intent_init_cache(void) +{ + xfs_exchmaps_intent_cache = kmem_cache_create("xfs_exchmaps_intent", + sizeof(struct xfs_exchmaps_intent), + 0, 0, NULL); + + return xfs_exchmaps_intent_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_exchmaps_intent_destroy_cache(void) +{ + kmem_cache_destroy(xfs_exchmaps_intent_cache); + xfs_exchmaps_intent_cache = NULL; +} + +/* + * Decide if we will exchange the reflink flags between the two files after the + * exchange. The only time we want to do this is if we're exchanging all + * mappings under EOF and the inode reflink flags have different states. 
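+ *
+ * (Editorial aside: if only file1 is reflinked, reflink_state is 1 and
+ * hweight32() returns 1, so a whole-file exchange sets
+ * XFS_EXCHMAPS_CLEAR_INO1_REFLINK.  xfs_exchmaps_ensure_reflink flags
+ * file2 during the operation, so the reflink flag effectively follows
+ * the shared blocks from file1 to file2.)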
+ */ +static inline bool +xmi_can_exchange_reflink_flags( + const struct xfs_exchmaps_req *req, + unsigned int reflink_state) +{ + struct xfs_mount *mp = req->ip1->i_mount; + + if (hweight32(reflink_state) != 1) + return false; + if (req->startoff1 != 0 || req->startoff2 != 0) + return false; + if (req->blockcount != XFS_B_TO_FSB(mp, req->ip1->i_disk_size)) + return false; + if (req->blockcount != XFS_B_TO_FSB(mp, req->ip2->i_disk_size)) + return false; + return true; +} + + +/* Allocate and initialize a new incore intent item from a request. */ +struct xfs_exchmaps_intent * +xfs_exchmaps_init_intent( + const struct xfs_exchmaps_req *req) +{ + struct xfs_exchmaps_intent *xmi; + unsigned int rs = 0; + + xmi = kmem_cache_zalloc(xfs_exchmaps_intent_cache, + GFP_NOFS | __GFP_NOFAIL); + INIT_LIST_HEAD(&xmi->xmi_list); + xmi->xmi_ip1 = req->ip1; + xmi->xmi_ip2 = req->ip2; + xmi->xmi_startoff1 = req->startoff1; + xmi->xmi_startoff2 = req->startoff2; + xmi->xmi_blockcount = req->blockcount; + xmi->xmi_isize1 = xmi->xmi_isize2 = -1; + xmi->xmi_flags = req->flags & XFS_EXCHMAPS_PARAMS; + + if (xfs_exchmaps_whichfork(xmi) == XFS_ATTR_FORK) { + xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM; + return xmi; + } + + if (req->flags & XFS_EXCHMAPS_SET_SIZES) { + xmi->xmi_flags |= XFS_EXCHMAPS_SET_SIZES; + xmi->xmi_isize1 = req->ip2->i_disk_size; + xmi->xmi_isize2 = req->ip1->i_disk_size; + } + + /* Record the state of each inode's reflink flag before the op. */ + if (xfs_is_reflink_inode(req->ip1)) + rs |= 1; + if (xfs_is_reflink_inode(req->ip2)) + rs |= 2; + + /* + * Figure out if we're clearing the reflink flags (which effectively + * exchanges them) after the operation. + */ + if (xmi_can_exchange_reflink_flags(req, rs)) { + if (rs & 1) + xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO1_REFLINK; + if (rs & 2) + xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO2_REFLINK; + } + + if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode) || + S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode)) + xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM; + + return xmi; +} + +/* + * Estimate the number of exchange operations and the number of file blocks + * in each file that will be affected by the exchange operation. + */ +int +xfs_exchmaps_estimate( + struct xfs_exchmaps_req *req) +{ + struct xfs_exchmaps_intent *xmi; + struct xfs_bmbt_irec irec1, irec2; + struct xfs_exchmaps_adjacent adj = ADJACENT_INIT; + xfs_filblks_t ip1_blocks = 0, ip2_blocks = 0; + int64_t d_nexts1, d_nexts2; + int bmap_flags; + int error; + + ASSERT(!(req->flags & ~XFS_EXCHMAPS_PARAMS)); + + bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_reqfork(req)); + xmi = xfs_exchmaps_init_intent(req); + + /* + * To guard against the possibility of overflowing the extent counters, + * we have to estimate an upper bound on the potential increase in that + * counter. We can split the mapping at each end of the range, and for + * each step of the exchange we can split the mapping that we're + * working on if the mappings do not align. + */ + d_nexts1 = d_nexts2 = 3; + + while (xmi_has_more_exchange_work(xmi)) { + /* + * Walk through the file ranges until we find something to + * exchange. Because we're simulating the exchange, pass in + * adj to capture skipped mappings for correct estimation of + * bmbt record merges. + */ + error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, &adj); + if (error) + goto out_free; + if (!xmi_has_more_exchange_work(xmi)) + break; + + /* Update accounting. 
*/ + if (xfs_bmap_is_real_extent(&irec1)) + ip1_blocks += irec1.br_blockcount; + if (xfs_bmap_is_real_extent(&irec2)) + ip2_blocks += irec2.br_blockcount; + req->nr_exchanges++; + + /* Read the next mappings from both files. */ + error = xmi_next(req->ip1, bmap_flags, &irec1, &adj.right1); + if (error) + goto out_free; + + error = xmi_next(req->ip2, bmap_flags, &irec2, &adj.right2); + if (error) + goto out_free; + + /* Update extent count deltas. */ + d_nexts1 += xmi_delta_nextents_step(req->ip1->i_mount, + &adj.left1, &irec1, &irec2, &adj.right1); + + d_nexts2 += xmi_delta_nextents_step(req->ip1->i_mount, + &adj.left2, &irec2, &irec1, &adj.right2); + + /* Now pretend we exchanged the mappings. */ + if (xmi_can_merge(&adj.left2, &irec1)) + adj.left2.br_blockcount += irec1.br_blockcount; + else + memcpy(&adj.left2, &irec1, sizeof(irec1)); + + if (xmi_can_merge(&adj.left1, &irec2)) + adj.left1.br_blockcount += irec2.br_blockcount; + else + memcpy(&adj.left1, &irec2, sizeof(irec2)); + + xmi_advance(xmi, &irec1); + } + + /* Account for the blocks that are being exchanged. */ + if (XFS_IS_REALTIME_INODE(req->ip1) && + xfs_exchmaps_reqfork(req) == XFS_DATA_FORK) { + req->ip1_rtbcount = ip1_blocks; + req->ip2_rtbcount = ip2_blocks; + } else { + req->ip1_bcount = ip1_blocks; + req->ip2_bcount = ip2_blocks; + } + + /* + * Make sure that both forks have enough slack left in their extent + * counters that the exchange operation will not overflow. + */ + trace_xfs_exchmaps_delta_nextents(req, d_nexts1, d_nexts2); + if (req->ip1 == req->ip2) { + error = xmi_ensure_delta_nextents(req, req->ip1, + d_nexts1 + d_nexts2); + } else { + error = xmi_ensure_delta_nextents(req, req->ip1, d_nexts1); + if (error) + goto out_free; + error = xmi_ensure_delta_nextents(req, req->ip2, d_nexts2); + } + if (error) + goto out_free; + + trace_xfs_exchmaps_initial_estimate(req); + error = xfs_exchmaps_estimate_overhead(req); +out_free: + kmem_cache_free(xfs_exchmaps_intent_cache, xmi); + return error; +} + +/* Set the reflink flag before an operation. */ +static inline void +xfs_exchmaps_set_reflink( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + trace_xfs_reflink_set_inode_flag(ip); + + ip->i_diflags2 |= XFS_DIFLAG2_REFLINK; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* + * If either file has shared blocks and we're exchanging data forks, we must + * flag the other file as having shared blocks so that we get the shared-block + * rmap functions if we need to fix up the rmaps. + */ +void +xfs_exchmaps_ensure_reflink( + struct xfs_trans *tp, + const struct xfs_exchmaps_intent *xmi) +{ + unsigned int rs = 0; + + if (xfs_is_reflink_inode(xmi->xmi_ip1)) + rs |= 1; + if (xfs_is_reflink_inode(xmi->xmi_ip2)) + rs |= 2; + + if ((rs & 1) && !xfs_is_reflink_inode(xmi->xmi_ip2)) + xfs_exchmaps_set_reflink(tp, xmi->xmi_ip2); + + if ((rs & 2) && !xfs_is_reflink_inode(xmi->xmi_ip1)) + xfs_exchmaps_set_reflink(tp, xmi->xmi_ip1); +} + +/* Set the large extent count flag before an operation if needed. */ +static inline void +xfs_exchmaps_ensure_large_extent_counts( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + if (xfs_inode_has_large_extent_counts(ip)) + return; + + ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* Widen the extent counter fields of both inodes if necessary. 
+void
+xfs_exchmaps_upgrade_extent_counts(
+	struct xfs_trans		*tp,
+	const struct xfs_exchmaps_intent *xmi)
+{
+	if (!xfs_has_large_extent_counts(tp->t_mountp))
+		return;
+
+	xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip1);
+	xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip2);
+}
+
+/*
+ * Schedule an exchange of a range of mappings from one inode to another.
+ *
+ * The use of file mapping exchange log intent items ensures the operation can
+ * be resumed even if the system goes down.  The caller must commit the
+ * transaction to start the work.
+ *
+ * The caller must ensure the inodes are joined to the transaction and
+ * ILOCKed; they will still be joined to the transaction at exit.
+ */
+void
+xfs_exchange_mappings(
+	struct xfs_trans		*tp,
+	const struct xfs_exchmaps_req	*req)
+{
+	struct xfs_exchmaps_intent	*xmi;
+
+	BUILD_BUG_ON(XFS_EXCHMAPS_INTERNAL_FLAGS & XFS_EXCHMAPS_LOGGED_FLAGS);
+
+	xfs_assert_ilocked(req->ip1, XFS_ILOCK_EXCL);
+	xfs_assert_ilocked(req->ip2, XFS_ILOCK_EXCL);
+	ASSERT(!(req->flags & ~XFS_EXCHMAPS_LOGGED_FLAGS));
+	if (req->flags & XFS_EXCHMAPS_SET_SIZES)
+		ASSERT(!(req->flags & XFS_EXCHMAPS_ATTR_FORK));
+	ASSERT(xfs_has_exchange_range(tp->t_mountp));
+
+	if (req->blockcount == 0)
+		return;
+
+	xmi = xfs_exchmaps_init_intent(req);
+	xfs_exchmaps_defer_add(tp, xmi);
+	xfs_exchmaps_ensure_reflink(tp, xmi);
+	xfs_exchmaps_upgrade_extent_counts(tp, xmi);
+}
diff --git a/fs/xfs/libxfs/xfs_exchmaps.h b/fs/xfs/libxfs/xfs_exchmaps.h
new file mode 100644
index 000000000000..fa822dff202a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_exchmaps.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_EXCHMAPS_H__
+#define __XFS_EXCHMAPS_H__
+
+/* In-core deferred operation info about a file mapping exchange request. */
+struct xfs_exchmaps_intent {
+	/* List of other incore deferred work. */
+	struct list_head	xmi_list;
+
+	/* Inodes participating in the operation. */
+	struct xfs_inode	*xmi_ip1;
+	struct xfs_inode	*xmi_ip2;
+
+	/* File offset range information. */
+	xfs_fileoff_t		xmi_startoff1;
+	xfs_fileoff_t		xmi_startoff2;
+	xfs_filblks_t		xmi_blockcount;
+
+	/* Set these file sizes after the operation, unless negative. */
+	xfs_fsize_t		xmi_isize1;
+	xfs_fsize_t		xmi_isize2;
+
+	uint64_t		xmi_flags;	/* XFS_EXCHMAPS_* flags */
+};
+
+/* Try to convert inode2 from block to short format at the end, if possible. */
+#define __XFS_EXCHMAPS_INO2_SHORTFORM	(1ULL << 63)
+
+#define XFS_EXCHMAPS_INTERNAL_FLAGS	(__XFS_EXCHMAPS_INO2_SHORTFORM)
+
+/* flags that can be passed to xfs_exchmaps_{estimate,mappings} */
+#define XFS_EXCHMAPS_PARAMS		(XFS_EXCHMAPS_ATTR_FORK | \
+					 XFS_EXCHMAPS_SET_SIZES | \
+					 XFS_EXCHMAPS_INO1_WRITTEN)
+
+static inline int
+xfs_exchmaps_whichfork(const struct xfs_exchmaps_intent *xmi)
+{
+	if (xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)
+		return XFS_ATTR_FORK;
+	return XFS_DATA_FORK;
+}
+
+/* Parameters for a mapping exchange request. */
+struct xfs_exchmaps_req {
+	/* Inodes participating in the operation. */
+	struct xfs_inode	*ip1;
+	struct xfs_inode	*ip2;
+
+	/* File offset range information. */
+	xfs_fileoff_t		startoff1;
+	xfs_fileoff_t		startoff2;
+	xfs_filblks_t		blockcount;
+
+	/* XFS_EXCHMAPS_* operation flags */
+	uint64_t		flags;
+
+	/*
+	 * Fields below this line are filled out by xfs_exchmaps_estimate;
+	 * callers should initialize this part of the struct to zero.
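+	 *
+	 * (Editorial sketch of the expected call sequence, pieced together
+	 * from this header and not normative:
+	 *
+	 *	struct xfs_exchmaps_req	req = { .ip1 = ip1, .ip2 = ip2,
+	 *			.startoff1 = 0, .startoff2 = 0,
+	 *			.blockcount = len, .flags = 0 };
+	 *
+	 *	error = xfs_exchmaps_estimate(&req);	// fills resblks
+	 *	// allocate a transaction with req.resblks, then ILOCK and
+	 *	// join both inodes, then:
+	 *	xfs_exchange_mappings(tp, &req);
+	 *	// commit; the log intent items drive the rest to
+	 *	// completion, even across a crash.)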
+ */ + + /* + * Data device blocks to be moved out of ip1, and free space needed to + * handle the bmbt changes. + */ + xfs_filblks_t ip1_bcount; + + /* + * Data device blocks to be moved out of ip2, and free space needed to + * handle the bmbt changes. + */ + xfs_filblks_t ip2_bcount; + + /* rt blocks to be moved out of ip1. */ + xfs_filblks_t ip1_rtbcount; + + /* rt blocks to be moved out of ip2. */ + xfs_filblks_t ip2_rtbcount; + + /* Free space needed to handle the bmbt changes */ + unsigned long long resblks; + + /* Number of exchanges needed to complete the operation */ + unsigned long long nr_exchanges; +}; + +static inline int +xfs_exchmaps_reqfork(const struct xfs_exchmaps_req *req) +{ + if (req->flags & XFS_EXCHMAPS_ATTR_FORK) + return XFS_ATTR_FORK; + return XFS_DATA_FORK; +} + +int xfs_exchmaps_estimate_overhead(struct xfs_exchmaps_req *req); +int xfs_exchmaps_estimate(struct xfs_exchmaps_req *req); + +extern struct kmem_cache *xfs_exchmaps_intent_cache; + +int __init xfs_exchmaps_intent_init_cache(void); +void xfs_exchmaps_intent_destroy_cache(void); + +struct xfs_exchmaps_intent *xfs_exchmaps_init_intent( + const struct xfs_exchmaps_req *req); +void xfs_exchmaps_ensure_reflink(struct xfs_trans *tp, + const struct xfs_exchmaps_intent *xmi); +void xfs_exchmaps_upgrade_extent_counts(struct xfs_trans *tp, + const struct xfs_exchmaps_intent *xmi); + +int xfs_exchmaps_finish_one(struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi); + +int xfs_exchmaps_check_forks(struct xfs_mount *mp, + const struct xfs_exchmaps_req *req); + +void xfs_exchange_mappings(struct xfs_trans *tp, + const struct xfs_exchmaps_req *req); + +#endif /* __XFS_EXCHMAPS_H__ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 2b2f9050fbfb..61f51becff4f 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -367,19 +367,23 @@ xfs_sb_has_ro_compat_feature( return (sbp->sb_features_ro_compat & feature) != 0; } -#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ -#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ -#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ -#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */ -#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */ -#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */ +#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ +#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ +#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ +#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */ +#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */ +#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */ +#define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */ +#define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */ #define XFS_SB_FEAT_INCOMPAT_ALL \ - (XFS_SB_FEAT_INCOMPAT_FTYPE| \ - XFS_SB_FEAT_INCOMPAT_SPINODES| \ - XFS_SB_FEAT_INCOMPAT_META_UUID| \ - XFS_SB_FEAT_INCOMPAT_BIGTIME| \ - XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \ - XFS_SB_FEAT_INCOMPAT_NREXT64) + (XFS_SB_FEAT_INCOMPAT_FTYPE | \ + XFS_SB_FEAT_INCOMPAT_SPINODES | \ + XFS_SB_FEAT_INCOMPAT_META_UUID | \ + XFS_SB_FEAT_INCOMPAT_BIGTIME | \ + XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR | \ + XFS_SB_FEAT_INCOMPAT_NREXT64 | \ + XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \ + XFS_SB_FEAT_INCOMPAT_PARENT) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN 
~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -898,6 +902,12 @@ static inline uint xfs_dinode_size(int version) #define XFS_MAXLINK ((1U << 31) - 1U) /* + * Any file that hits the maximum ondisk link count should be pinned to avoid + * a use-after-free situation. + */ +#define XFS_NLINK_PINNED (~0U) + +/* * Values for di_format * * This enum is used in string mapping in xfs_trace.h; please keep the diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index ca1b17d01437..97996cb79aaa 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -239,6 +239,8 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */ #define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */ #define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */ +#define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */ +#define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */ /* * Minimum and maximum sizes need for growth checks. @@ -409,6 +411,7 @@ struct xfs_bulkstat { #define XFS_BS_SICK_XATTR (1 << 5) /* extended attributes */ #define XFS_BS_SICK_SYMLINK (1 << 6) /* symbolic link remote target */ #define XFS_BS_SICK_PARENT (1 << 7) /* parent pointers */ +#define XFS_BS_SICK_DIRTREE (1 << 8) /* directory tree structure */ /* * Project quota id helpers (previously projid was 16bit only @@ -632,7 +635,9 @@ typedef struct xfs_fsop_attrmulti_handlereq { /* * per machine unique filesystem identifier types. */ -typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */ +typedef struct xfs_fsid { + __u32 val[2]; /* file system id type */ +} xfs_fsid_t; typedef struct xfs_fid { __u16 fid_len; /* length of remainder */ @@ -715,9 +720,19 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_QUOTACHECK 25 /* quota counters */ #define XFS_SCRUB_TYPE_NLINKS 26 /* inode link counts */ #define XFS_SCRUB_TYPE_HEALTHY 27 /* everything checked out ok */ +#define XFS_SCRUB_TYPE_DIRTREE 28 /* directory tree structure */ /* Number of scrub subcommands. */ -#define XFS_SCRUB_TYPE_NR 28 +#define XFS_SCRUB_TYPE_NR 29 + +/* + * This special type code only applies to the vectored scrub implementation. + * + * If any of the previous scrub vectors recorded runtime errors or have + * sv_flags bits set that match the OFLAG bits in the barrier vector's + * sv_flags, set the barrier's sv_ret to -ECANCELED and return to userspace. + */ +#define XFS_SCRUB_TYPE_BARRIER (0xFFFFFFFF) /* i: Repair this metadata. */ #define XFS_SCRUB_IFLAG_REPAIR (1u << 0) @@ -763,6 +778,29 @@ struct xfs_scrub_metadata { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED) #define XFS_SCRUB_FLAGS_ALL (XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT) +/* Vectored scrub calls to reduce the number of kernel transitions. */ + +struct xfs_scrub_vec { + __u32 sv_type; /* XFS_SCRUB_TYPE_* */ + __u32 sv_flags; /* XFS_SCRUB_FLAGS_* */ + __s32 sv_ret; /* 0 or a negative error code */ + __u32 sv_reserved; /* must be zero */ +}; + +/* Vectored metadata scrub control structure. */ +struct xfs_scrub_vec_head { + __u64 svh_ino; /* inode number. */ + __u32 svh_gen; /* inode generation. */ + __u32 svh_agno; /* ag number. 
 */
+	__u32 svh_flags;	/* XFS_SCRUB_VEC_FLAGS_* */
+	__u16 svh_rest_us;	/* wait this much time between vector items */
+	__u16 svh_nr;		/* number of svh_vectors */
+	__u64 svh_reserved;	/* must be zero */
+	__u64 svh_vectors;	/* pointer to buffer of xfs_scrub_vec */
+};
+
+#define XFS_SCRUB_VEC_FLAGS_ALL		(0)
+
 /*
  * ioctl limits
  */
@@ -772,6 +810,118 @@ struct xfs_scrub_metadata {
 #  define XFS_XATTR_LIST_MAX 65536
 #endif
 
+/*
+ * Exchange part of file1 with part of the file that this ioctl is being
+ * called against (which we'll call file2).  Filesystems must be able to
+ * restart and complete the operation even after the system goes down.
+ */
+struct xfs_exchange_range {
+	__s32		file1_fd;
+	__u32		pad;		/* must be zeroes */
+	__u64		file1_offset;	/* file1 offset, bytes */
+	__u64		file2_offset;	/* file2 offset, bytes */
+	__u64		length;		/* bytes to exchange */
+
+	__u64		flags;		/* see XFS_EXCHANGE_RANGE_* below */
+};
+
+/*
+ * Exchange file data all the way to the ends of both files, and then exchange
+ * the file sizes.  This flag can be used to replace a file's contents with a
+ * different amount of data.  length will be ignored.
+ */
+#define XFS_EXCHANGE_RANGE_TO_EOF	(1ULL << 0)
+
+/* Flush all changes in file data and file metadata to disk before returning. */
+#define XFS_EXCHANGE_RANGE_DSYNC	(1ULL << 1)
+
+/* Dry run; do all the parameter verification but do not change anything. */
+#define XFS_EXCHANGE_RANGE_DRY_RUN	(1ULL << 2)
+
+/*
+ * Exchange only the parts of the two files where the file allocation units
+ * mapped to file1's range have been written to.  This can accelerate
+ * scatter-gather atomic writes with a temp file if all writes are aligned to
+ * the file allocation unit.
+ */
+#define XFS_EXCHANGE_RANGE_FILE1_WRITTEN (1ULL << 3)
+
+#define XFS_EXCHANGE_RANGE_ALL_FLAGS	(XFS_EXCHANGE_RANGE_TO_EOF | \
+					 XFS_EXCHANGE_RANGE_DSYNC | \
+					 XFS_EXCHANGE_RANGE_DRY_RUN | \
+					 XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
+
+/* Iterating parent pointers of files. */
+
+/* target was the root directory */
+#define XFS_GETPARENTS_OFLAG_ROOT	(1U << 0)
+
+/* Cursor is done iterating pptrs */
+#define XFS_GETPARENTS_OFLAG_DONE	(1U << 1)
+
+#define XFS_GETPARENTS_OFLAGS_ALL	(XFS_GETPARENTS_OFLAG_ROOT | \
+					 XFS_GETPARENTS_OFLAG_DONE)
+
+#define XFS_GETPARENTS_IFLAGS_ALL	(0)
+
+struct xfs_getparents_rec {
+	struct xfs_handle	gpr_parent; /* Handle to parent */
+	__u32			gpr_reclen; /* Length of entire record */
+	__u32			gpr_reserved; /* zero */
+	char			gpr_name[]; /* Null-terminated filename */
+};
+
+/* Iterate through this file's directory parent pointers */
+struct xfs_getparents {
+	/*
+	 * Structure to track progress in iterating the parent pointers.
+	 * Must be initialized to zeroes before the first ioctl call, and
+	 * not touched by callers after that.
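+	 *
+	 * (Editorial sketch of a userspace caller, using only the
+	 * structures and helpers defined in this header; error handling
+	 * elided:
+	 *
+	 *	struct xfs_getparents	gp = { 0 };
+	 *	gp.gp_buffer = (uintptr_t)buf;
+	 *	gp.gp_bufsize = bufsize;
+	 *	do {
+	 *		ioctl(fd, XFS_IOC_GETPARENTS, &gp);
+	 *		for (rec = xfs_getparents_first_rec(&gp); rec;
+	 *		     rec = xfs_getparents_next_rec(&gp, rec))
+	 *			printf("%s\n", rec->gpr_name);
+	 *	} while (!(gp.gp_oflags & XFS_GETPARENTS_OFLAG_DONE));
+	 *
+	 * See the record helpers defined just below.)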
+ */ + struct xfs_attrlist_cursor gp_cursor; + + /* Input flags: XFS_GETPARENTS_IFLAG* */ + __u16 gp_iflags; + + /* Output flags: XFS_GETPARENTS_OFLAG* */ + __u16 gp_oflags; + + /* Size of the gp_buffer in bytes */ + __u32 gp_bufsize; + + /* Must be set to zero */ + __u64 gp_reserved; + + /* Pointer to a buffer in which to place xfs_getparents_rec */ + __u64 gp_buffer; +}; + +static inline struct xfs_getparents_rec * +xfs_getparents_first_rec(struct xfs_getparents *gp) +{ + return (struct xfs_getparents_rec *)(uintptr_t)gp->gp_buffer; +} + +static inline struct xfs_getparents_rec * +xfs_getparents_next_rec(struct xfs_getparents *gp, + struct xfs_getparents_rec *gpr) +{ + void *next = ((void *)gpr + gpr->gpr_reclen); + void *end = (void *)(uintptr_t)(gp->gp_buffer + gp->gp_bufsize); + + if (next >= end) + return NULL; + + return next; +} + +/* Iterate through this file handle's directory parent pointers. */ +struct xfs_getparents_by_handle { + /* Handle to file whose parents we want. */ + struct xfs_handle gph_handle; + + struct xfs_getparents gph_request; +}; /* * ioctl commands that are used by Linux filesystems @@ -808,6 +958,9 @@ struct xfs_scrub_metadata { /* XFS_IOC_GETFSMAP ------ hoisted 59 */ #define XFS_IOC_SCRUB_METADATA _IOWR('X', 60, struct xfs_scrub_metadata) #define XFS_IOC_AG_GEOMETRY _IOWR('X', 61, struct xfs_ag_geometry) +#define XFS_IOC_GETPARENTS _IOWR('X', 62, struct xfs_getparents) +#define XFS_IOC_GETPARENTS_BY_HANDLE _IOWR('X', 63, struct xfs_getparents_by_handle) +#define XFS_IOC_SCRUBV_METADATA _IOWR('X', 64, struct xfs_scrub_vec_head) /* * ioctl commands that replace IRIX syssgi()'s @@ -843,6 +996,7 @@ struct xfs_scrub_metadata { #define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom) #define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req) #define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req) +#define XFS_IOC_EXCHANGE_RANGE _IOWR('X', 129, struct xfs_exchange_range) /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 3c64b5f9bd68..b0edb4288e59 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -95,6 +95,7 @@ struct xfs_da_args; /* Don't propagate sick status to ag health summary during inactivation */ #define XFS_SICK_INO_FORGET (1 << 12) +#define XFS_SICK_INO_DIRTREE (1 << 13) /* directory tree structure */ /* Primary evidence of health problems in a given group. */ #define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \ @@ -125,7 +126,8 @@ struct xfs_da_args; XFS_SICK_INO_DIR | \ XFS_SICK_INO_XATTR | \ XFS_SICK_INO_SYMLINK | \ - XFS_SICK_INO_PARENT) + XFS_SICK_INO_PARENT | \ + XFS_SICK_INO_DIRTREE) #define XFS_SICK_INO_ZAPPED (XFS_SICK_INO_BMBTD_ZAPPED | \ XFS_SICK_INO_BMBTA_ZAPPED | \ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index e5ac3e5430c4..14c81f227c5b 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1058,6 +1058,33 @@ xfs_inobt_first_free_inode( } /* + * If this AG has corrupt inodes, check if allocating this inode would fail + * with corruption errors. Returns 0 if we're clear, or EAGAIN to try again + * somewhere else. 
+ */ +static int +xfs_dialloc_check_ino( + struct xfs_perag *pag, + struct xfs_trans *tp, + xfs_ino_t ino) +{ + struct xfs_imap imap; + struct xfs_buf *bp; + int error; + + error = xfs_imap(pag, tp, ino, &imap, 0); + if (error) + return -EAGAIN; + + error = xfs_imap_to_bp(pag->pag_mount, tp, &imap, &bp); + if (error) + return -EAGAIN; + + xfs_trans_brelse(tp, bp); + return 0; +} + +/* * Allocate an inode using the inobt-only algorithm. */ STATIC int @@ -1309,6 +1336,13 @@ alloc_inode: ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); + + if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) { + error = xfs_dialloc_check_ino(pag, tp, ino); + if (error) + goto error0; + } + rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; error = xfs_inobt_update(cur, &rec); @@ -1584,6 +1618,12 @@ xfs_dialloc_ag( XFS_INODES_PER_CHUNK) == 0); ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); + if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) { + error = xfs_dialloc_check_ino(pag, tp, ino); + if (error) + goto error_cur; + } + /* * Modify or remove the finobt record. */ @@ -1699,7 +1739,7 @@ xfs_dialloc_good_ag( return false; if (!xfs_perag_initialised_agi(pag)) { - error = xfs_ialloc_read_agi(pag, tp, NULL); + error = xfs_ialloc_read_agi(pag, tp, 0, NULL); if (error) return false; } @@ -1768,7 +1808,7 @@ xfs_dialloc_try_ag( * Then read in the AGI buffer and recheck with the AGI buffer * lock held. */ - error = xfs_ialloc_read_agi(pag, *tpp, &agbp); + error = xfs_ialloc_read_agi(pag, *tpp, 0, &agbp); if (error) return error; @@ -2286,7 +2326,7 @@ xfs_difree( /* * Get the allocation group header. */ - error = xfs_ialloc_read_agi(pag, tp, &agbp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) { xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", __func__, error); @@ -2332,7 +2372,7 @@ xfs_imap_lookup( int error; int i; - error = xfs_ialloc_read_agi(pag, tp, &agbp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) { xfs_alert(mp, "%s: xfs_ialloc_read_agi() returned error %d, agno %d", @@ -2675,6 +2715,7 @@ int xfs_read_agi( struct xfs_perag *pag, struct xfs_trans *tp, + xfs_buf_flags_t flags, struct xfs_buf **agibpp) { struct xfs_mount *mp = pag->pag_mount; @@ -2684,7 +2725,7 @@ xfs_read_agi( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops); + XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops); if (xfs_metadata_is_sick(error)) xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); if (error) @@ -2704,6 +2745,7 @@ int xfs_ialloc_read_agi( struct xfs_perag *pag, struct xfs_trans *tp, + int flags, struct xfs_buf **agibpp) { struct xfs_buf *agibp; @@ -2712,7 +2754,9 @@ xfs_ialloc_read_agi( trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno); - error = xfs_read_agi(pag, tp, &agibp); + error = xfs_read_agi(pag, tp, + (flags & XFS_IALLOC_FLAG_TRYLOCK) ? 
XBF_TRYLOCK : 0, + &agibp); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index f1412183bb44..b549627e3a61 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -63,10 +63,11 @@ xfs_ialloc_log_agi( struct xfs_buf *bp, /* allocation group header buffer */ uint32_t fields); /* bitmask of fields to log */ -int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, +int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, xfs_buf_flags_t flags, struct xfs_buf **agibpp); int xfs_ialloc_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, - struct xfs_buf **agibpp); + int flags, struct xfs_buf **agibpp); +#define XFS_IALLOC_FLAG_TRYLOCK (1U << 0) /* use trylock for buffer locking */ /* * Lookup a record by ino in the btree given by cur. diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index cc661fca6ff5..42e9fd47f6c7 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -745,7 +745,7 @@ xfs_finobt_count_blocks( struct xfs_btree_cur *cur; int error; - error = xfs_ialloc_read_agi(pag, tp, &agbp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) return error; @@ -768,7 +768,7 @@ xfs_finobt_read_blocks( struct xfs_agi *agi; int error; - error = xfs_ialloc_read_agi(pag, tp, &agbp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index d0dcce462bf4..d79002343d0b 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -491,6 +491,14 @@ xfs_dinode_verify( return __this_address; } + if (dip->di_version > 1) { + if (dip->di_onlink) + return __this_address; + } else { + if (dip->di_nlink) + return __this_address; + } + /* don't allow invalid i_size */ di_size = be64_to_cpu(dip->di_size); if (di_size & (1ULL << 63)) diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 7d660a973909..9d11ae015909 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -765,53 +765,46 @@ xfs_ifork_verify_local_attr( return 0; } +/* + * Check if the inode fork supports adding nr_to_add more extents. + * + * If it doesn't but we can upgrade it to large extent counters, do the upgrade. + * If we can't upgrade or are already using big counters but still can't fit the + * additional extents, return -EFBIG. + */ int -xfs_iext_count_may_overflow( +xfs_iext_count_extend( + struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, - int nr_to_add) + uint nr_to_add) { + struct xfs_mount *mp = ip->i_mount; + bool has_large = + xfs_inode_has_large_extent_counts(ip); struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); - uint64_t max_exts; uint64_t nr_exts; + ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR); + if (whichfork == XFS_COW_FORK) return 0; - max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip), - whichfork); - - if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) - max_exts = 10; - + /* no point in upgrading if if_nextents overflows */ nr_exts = ifp->if_nextents + nr_to_add; - if (nr_exts < ifp->if_nextents || nr_exts > max_exts) + if (nr_exts < ifp->if_nextents) return -EFBIG; - return 0; -} - -/* - * Upgrade this inode's extent counter fields to be able to handle a potential - * increase in the extent count by nr_to_add. Normally this is the same - * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG. 
- */
-int
-xfs_iext_count_upgrade(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip,
-	uint			nr_to_add)
-{
-	ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR);
-
-	if (!xfs_has_large_extent_counts(ip->i_mount) ||
-	    xfs_inode_has_large_extent_counts(ip) ||
-	    XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
+	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
+	    nr_exts > 10)
 		return -EFBIG;
 
-	ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
+	if (nr_exts > xfs_iext_max_nextents(has_large, whichfork)) {
+		if (has_large || !xfs_has_large_extent_counts(mp))
+			return -EFBIG;
+		ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	}
 	return 0;
 }
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index bd53eb951b65..2373d12fd474 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -256,10 +256,8 @@ extern void xfs_ifork_init_cow(struct xfs_inode *ip);
 int xfs_ifork_verify_local_data(struct xfs_inode *ip);
 int xfs_ifork_verify_local_attr(struct xfs_inode *ip);
-int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork,
-		int nr_to_add);
-int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
-		uint nr_to_add);
+int xfs_iext_count_extend(struct xfs_trans *tp, struct xfs_inode *ip,
+		int whichfork, uint nr_to_add);
 bool xfs_ifork_is_realtime(struct xfs_inode *ip, int whichfork);
 
 /* returns true if the fork has extents but they are not read in yet. */
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 16872972e1e9..3e6682ed656b 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -115,10 +115,13 @@ struct xfs_unmount_log_format {
 #define XLOG_REG_TYPE_BUD_FORMAT	26
 #define XLOG_REG_TYPE_ATTRI_FORMAT	27
 #define XLOG_REG_TYPE_ATTRD_FORMAT	28
-#define XLOG_REG_TYPE_ATTR_NAME	29
+#define XLOG_REG_TYPE_ATTR_NAME		29
 #define XLOG_REG_TYPE_ATTR_VALUE	30
-#define XLOG_REG_TYPE_MAX		30
-
+#define XLOG_REG_TYPE_XMI_FORMAT	31
+#define XLOG_REG_TYPE_XMD_FORMAT	32
+#define XLOG_REG_TYPE_ATTR_NEWNAME	33
+#define XLOG_REG_TYPE_ATTR_NEWVALUE	34
+#define XLOG_REG_TYPE_MAX		34
 
 /*
  * Flags to log operation header
@@ -243,6 +246,8 @@ typedef struct xfs_trans_header {
 #define	XFS_LI_BUD		0x1245
 #define	XFS_LI_ATTRI		0x1246  /* attr set/remove intent*/
 #define	XFS_LI_ATTRD		0x1247  /* attr set/remove done */
+#define	XFS_LI_XMI		0x1248  /* mapping exchange intent */
+#define	XFS_LI_XMD		0x1249  /* mapping exchange done */
 
 #define XFS_LI_TYPE_DESC \
 	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
@@ -260,7 +265,9 @@ typedef struct xfs_trans_header {
 	{ XFS_LI_BUI,		"XFS_LI_BUI" }, \
 	{ XFS_LI_BUD,		"XFS_LI_BUD" }, \
 	{ XFS_LI_ATTRI,		"XFS_LI_ATTRI" }, \
-	{ XFS_LI_ATTRD,		"XFS_LI_ATTRD" }
+	{ XFS_LI_ATTRD,		"XFS_LI_ATTRD" }, \
+	{ XFS_LI_XMI,		"XFS_LI_XMI" }, \
+	{ XFS_LI_XMD,		"XFS_LI_XMD" }
 
 /*
  * Inode Log Item Format definitions.
@@ -879,6 +886,61 @@ struct xfs_bud_log_format {
 };
 
 /*
+ * XMI/XMD (file mapping exchange) log format definitions
+ */
+
+/* This is the structure used to lay out a mapping exchange log item. */
+struct xfs_xmi_log_format {
+	uint16_t		xmi_type;	/* xmi log item type */
+	uint16_t		xmi_size;	/* size of this item */
+	uint32_t		__pad;		/* must be zero */
+	uint64_t		xmi_id;		/* xmi identifier */
+
+	uint64_t		xmi_inode1;	/* inumber of first file */
+	uint64_t		xmi_inode2;	/* inumber of second file */
+	uint32_t		xmi_igen1;	/* generation of first file */
+	uint32_t		xmi_igen2;	/* generation of second file */
+	uint64_t		xmi_startoff1;	/* block offset into file1 */
+	uint64_t		xmi_startoff2;	/* block offset into file2 */
+	uint64_t		xmi_blockcount;	/* number of blocks */
+	uint64_t		xmi_flags;	/* XFS_EXCHMAPS_* */
+	uint64_t		xmi_isize1;	/* intended file1 size */
+	uint64_t		xmi_isize2;	/* intended file2 size */
+};
+
+/* Exchange mappings between extended attribute forks instead of data forks. */
+#define XFS_EXCHMAPS_ATTR_FORK		(1ULL << 0)
+
+/* Set the file sizes when finished. */
+#define XFS_EXCHMAPS_SET_SIZES		(1ULL << 1)
+
+/*
+ * Exchange the mappings of the two files only if the file allocation units
+ * mapped to file1's range have been written.
+ */
+#define XFS_EXCHMAPS_INO1_WRITTEN	(1ULL << 2)
+
+/* Clear the reflink flag from inode1 after the operation. */
+#define XFS_EXCHMAPS_CLEAR_INO1_REFLINK	(1ULL << 3)
+
+/* Clear the reflink flag from inode2 after the operation. */
+#define XFS_EXCHMAPS_CLEAR_INO2_REFLINK	(1ULL << 4)
+
+#define XFS_EXCHMAPS_LOGGED_FLAGS	(XFS_EXCHMAPS_ATTR_FORK | \
+					 XFS_EXCHMAPS_SET_SIZES | \
+					 XFS_EXCHMAPS_INO1_WRITTEN | \
+					 XFS_EXCHMAPS_CLEAR_INO1_REFLINK | \
+					 XFS_EXCHMAPS_CLEAR_INO2_REFLINK)
+
+/* This is the structure used to lay out a mapping exchange done log item. */
+struct xfs_xmd_log_format {
+	uint16_t		xmd_type;	/* xmd log item type */
+	uint16_t		xmd_size;	/* size of this item */
+	uint32_t		__pad;
+	uint64_t		xmd_xmi_id;	/* id of corresponding xmi */
+};
+
+/*
 * Dquot Log format definitions.
 *
 * The first two fields must be the type and size fitting into
@@ -966,6 +1028,9 @@ struct xfs_icreate_log {
 #define XFS_ATTRI_OP_FLAGS_SET		1	/* Set the attribute */
 #define XFS_ATTRI_OP_FLAGS_REMOVE	2	/* Remove the attribute */
 #define XFS_ATTRI_OP_FLAGS_REPLACE	3	/* Replace the attribute */
+#define XFS_ATTRI_OP_FLAGS_PPTR_SET	4	/* Set parent pointer */
+#define XFS_ATTRI_OP_FLAGS_PPTR_REMOVE	5	/* Remove parent pointer */
+#define XFS_ATTRI_OP_FLAGS_PPTR_REPLACE	6	/* Replace parent pointer */
 #define XFS_ATTRI_OP_FLAGS_TYPE_MASK	0xFF	/* Flags type mask */
 
 /*
@@ -974,6 +1039,7 @@ struct xfs_icreate_log {
 */
 #define XFS_ATTRI_FILTER_MASK		(XFS_ATTR_ROOT | \
 					 XFS_ATTR_SECURE | \
+					 XFS_ATTR_PARENT | \
 					 XFS_ATTR_INCOMPLETE)
 
 /*
@@ -983,11 +1049,22 @@ struct xfs_icreate_log {
 struct xfs_attri_log_format {
 	uint16_t	alfi_type;	/* attri log item type */
 	uint16_t	alfi_size;	/* size of this item */
-	uint32_t	__pad;		/* pad to 64 bit aligned */
+	uint32_t	alfi_igen;	/* generation of alfi_ino for pptr ops */
 	uint64_t	alfi_id;	/* attri identifier */
 	uint64_t	alfi_ino;	/* the inode for this attr operation */
 	uint32_t	alfi_op_flags;	/* marks the op as a set or remove */
-	uint32_t	alfi_name_len;	/* attr name length */
+	union {
+		uint32_t	alfi_name_len;	/* attr name length */
+		struct {
+			/*
+			 * For PPTR_REPLACE, these are the lengths of the old
+			 * and new attr names.  The new and old values must
+			 * have the same length.
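+			 *
+			 * (Editorial aside: for a rename of "foo" to "quux",
+			 * alfi_old_name_len would be 3 and alfi_new_name_len
+			 * would be 4; both values are 12-byte xfs_parent_rec
+			 * structures, which is why the value lengths always
+			 * match.)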
+			 */
+			uint16_t	alfi_old_name_len;
+			uint16_t	alfi_new_name_len;
+		};
+	};
 	uint32_t	alfi_value_len;	/* attr value length */
 	uint32_t	alfi_attr_filter;/* attr filter flags */
 };
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 9fe7a9564bca..521d327e4c89 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -75,6 +75,8 @@ extern const struct xlog_recover_item_ops xlog_cui_item_ops;
 extern const struct xlog_recover_item_ops xlog_cud_item_ops;
 extern const struct xlog_recover_item_ops xlog_attri_item_ops;
 extern const struct xlog_recover_item_ops xlog_attrd_item_ops;
+extern const struct xlog_recover_item_ops xlog_xmi_item_ops;
+extern const struct xlog_recover_item_ops xlog_xmd_item_ops;
 
 /*
  * Macros, structures, prototypes for internal log manager use.
@@ -121,6 +123,8 @@ bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len);
 
 int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino,
 		struct xfs_inode **ipp);
+int xlog_recover_iget_handle(struct xfs_mount *mp, xfs_ino_t ino, uint32_t gen,
+		struct xfs_inode **ipp);
 void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type,
 		uint64_t intent_id);
 int xlog_alloc_buf_cancel_table(struct xlog *log);
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index 9975b93a7412..d3bd6a86c8fe 100644
--- a/fs/xfs/libxfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -17,6 +17,34 @@
 #include "xfs_trace.h"
 
 /*
+ * Shortly after enabling the large extent counts feature in 2023,
+ * longstanding bugs were found in the code that computes the minimum log
+ * size.  Luckily, the bugs resulted in over-estimates of that size, so
+ * there's no impact to existing users.  However, we don't want to reduce
+ * the minimum log size because that can create the situation where a newer
+ * mkfs writes a new filesystem that an older kernel won't mount.
+ *
+ * Several years prior, we also discovered that the transaction reservations
+ * for rmap and reflink operations were unnecessarily large.  That was fixed,
+ * but the minimum log size computation was left alone to avoid the
+ * compatibility problems noted above.  Fix that too.
+ *
+ * Therefore, we may only correct the computation starting with filesystem
+ * features that didn't exist in 2023.  In other words, only turn this on if
+ * the filesystem has parent pointers.
+ *
+ * This function can be called before the XFS_HAS_* flags have been set up
+ * (e.g. in mkfs), so we must check the ondisk superblock.
+ */
+static inline bool
+xfs_want_minlogsize_fixes(
+	struct xfs_sb	*sb)
+{
+	return xfs_sb_is_v5(sb) &&
+	       xfs_sb_has_incompat_feature(sb, XFS_SB_FEAT_INCOMPAT_PARENT);
+}
+
+/*
 * Calculate the maximum length in bytes that would be required for a local
 * attribute value as large attributes out of line are not logged.
 */
@@ -31,6 +59,15 @@ xfs_log_calc_max_attrsetm_res(
 			MAXNAMELEN - 1;
 	nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
 	nblks += XFS_B_TO_FSB(mp, size);
+
+	/*
+	 * If the feature set is new enough, correct a unit conversion error in
+	 * the xattr transaction reservation code that resulted in oversized
+	 * minimum log size computations.
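+	 *
+	 * (Editorial aside: the old code fed a byte count straight into
+	 * XFS_NEXTENTADD_SPACE_RES(); with hypothetical 4k blocks, a 64k
+	 * value was costed as 65536 "blocks" instead of the 16 fsblocks
+	 * that the XFS_B_TO_FSB() conversion below yields.)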
+ */ + if (xfs_want_minlogsize_fixes(&mp->m_sb)) + size = XFS_B_TO_FSB(mp, size); + nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK); return M_RES(mp)->tr_attrsetm.tr_logres + @@ -49,6 +86,15 @@ xfs_log_calc_trans_resv_for_minlogblocks( unsigned int rmap_maxlevels = mp->m_rmap_maxlevels; /* + * If the feature set is new enough, drop the oversized minimum log + * size computation introduced by the original reflink code. + */ + if (xfs_want_minlogsize_fixes(&mp->m_sb)) { + xfs_trans_resv_calc(mp, resv); + return; + } + + /* * In the early days of rmap+reflink, we always set the rmap maxlevels * to 9 even if the AG was small enough that it would never grow to * that height. Transaction reservation sizes influence the minimum diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 81885a6a028e..e8cdd77d03fa 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -119,6 +119,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1); XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3); XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); + XFS_CHECK_STRUCT_SIZE(struct xfs_parent_rec, 12); /* log structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88); @@ -155,6 +156,11 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(struct xfs_efi_log_format_32, efi_extents, 16); XFS_CHECK_OFFSET(struct xfs_efi_log_format_64, efi_extents, 16); + /* parent pointer ioctls */ + XFS_CHECK_STRUCT_SIZE(struct xfs_getparents_rec, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_getparents, 40); + XFS_CHECK_STRUCT_SIZE(struct xfs_getparents_by_handle, 64); + /* * The v5 superblock format extended several v4 header structures with * additional data. While new fields are only accessible on v5 diff --git a/fs/xfs/libxfs/xfs_parent.c b/fs/xfs/libxfs/xfs_parent.c new file mode 100644 index 000000000000..69366c44a701 --- /dev/null +++ b/fs/xfs/libxfs/xfs_parent.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022-2024 Oracle. + * All rights reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_da_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_attr_sf.h" +#include "xfs_bmap.h" +#include "xfs_defer.h" +#include "xfs_log.h" +#include "xfs_xattr.h" +#include "xfs_parent.h" +#include "xfs_trans_space.h" +#include "xfs_attr_item.h" +#include "xfs_health.h" + +struct kmem_cache *xfs_parent_args_cache; + +/* + * Parent pointer attribute handling. + * + * Because the attribute name is a filename component, it will never be longer + * than 255 bytes and must not contain nulls or slashes. These are roughly the + * same constraints that apply to attribute names. + * + * The attribute value must always be a struct xfs_parent_rec. This means the + * attribute will never be in remote format because 12 bytes is nowhere near + * xfs_attr_leaf_entsize_local_max() (~75% of block size). + * + * Creating a new parent attribute will always create a new attribute - there + * should never, ever be an existing attribute in the tree for a new inode. 
+ * ENOSPC behavior is problematic - creating the inode without the parent + * pointer is effectively a corruption, so we allow parent attribute creation + * to dip into the reserve block pool to avoid unexpected ENOSPC errors from + * occurring. + */ + +/* Return true if parent pointer attr name is valid. */ +bool +xfs_parent_namecheck( + unsigned int attr_flags, + const void *name, + size_t length) +{ + /* + * Parent pointers always use logged operations, so there should never + * be incomplete xattrs. + */ + if (attr_flags & XFS_ATTR_INCOMPLETE) + return false; + + return xfs_dir2_namecheck(name, length); +} + +/* Return true if parent pointer attr value is valid. */ +bool +xfs_parent_valuecheck( + struct xfs_mount *mp, + const void *value, + size_t valuelen) +{ + const struct xfs_parent_rec *rec = value; + + if (!xfs_has_parent(mp)) + return false; + + /* The xattr value must be a parent record. */ + if (valuelen != sizeof(struct xfs_parent_rec)) + return false; + + /* The parent record must be local. */ + if (value == NULL) + return false; + + /* The parent inumber must be valid. */ + if (!xfs_verify_dir_ino(mp, be64_to_cpu(rec->p_ino))) + return false; + + return true; +} + +/* Compute the attribute name hash for a parent pointer. */ +xfs_dahash_t +xfs_parent_hashval( + struct xfs_mount *mp, + const uint8_t *name, + int namelen, + xfs_ino_t parent_ino) +{ + struct xfs_name xname = { + .name = name, + .len = namelen, + }; + + /* + * Use the same dirent name hash as would be used on the directory, but + * mix in the parent inode number to avoid collisions on hardlinked + * files with identical names but different parents. + */ + return xfs_dir2_hashname(mp, &xname) ^ + upper_32_bits(parent_ino) ^ lower_32_bits(parent_ino); +} + +/* Compute the attribute name hash from the xattr components. */ +xfs_dahash_t +xfs_parent_hashattr( + struct xfs_mount *mp, + const uint8_t *name, + int namelen, + const void *value, + int valuelen) +{ + const struct xfs_parent_rec *rec = value; + + /* Requires a local attr value in xfs_parent_rec format */ + if (valuelen != sizeof(struct xfs_parent_rec)) { + ASSERT(valuelen == sizeof(struct xfs_parent_rec)); + return 0; + } + + if (!value) { + ASSERT(value != NULL); + return 0; + } + + return xfs_parent_hashval(mp, name, namelen, be64_to_cpu(rec->p_ino)); +} + +/* + * Initialize the parent pointer arguments structure. Caller must have zeroed + * the contents of @args. @tp is only required for updates. + */ +static void +xfs_parent_da_args_init( + struct xfs_da_args *args, + struct xfs_trans *tp, + struct xfs_parent_rec *rec, + struct xfs_inode *child, + xfs_ino_t owner, + const struct xfs_name *parent_name) +{ + args->geo = child->i_mount->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->attr_filter = XFS_ATTR_PARENT; + args->op_flags = XFS_DA_OP_LOGGED | XFS_DA_OP_OKNOENT; + args->trans = tp; + args->dp = child; + args->owner = owner; + args->name = parent_name->name; + args->namelen = parent_name->len; + args->value = rec; + args->valuelen = sizeof(struct xfs_parent_rec); + xfs_attr_sethash(args); +} + +/* Make sure the incore state is ready for a parent pointer query/update. */ +static inline int +xfs_parent_iread_extents( + struct xfs_trans *tp, + struct xfs_inode *child) +{ + /* Parent pointers require that the attr fork must exist. 
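xfs_parent_namecheck() above ultimately enforces the dirent name rules stated in the file comment: 1 to 255 bytes, with no NUL bytes and no slashes. A standalone restatement of that predicate (the kernel delegates to xfs_dir2_namecheck(); this sketch is equivalent only for the rules quoted here):

	#include <stdbool.h>
	#include <stddef.h>

	/* Dirent-style name check: 1..255 bytes, no NULs, no '/' separators. */
	static bool pptr_name_ok(const unsigned char *name, size_t len)
	{
		size_t i;

		if (len == 0 || len > 255)	/* MAXNAMELEN - 1 */
			return false;
		for (i = 0; i < len; i++)
			if (name[i] == 0 || name[i] == '/')
				return false;
		return true;
	}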
*/ + if (XFS_IS_CORRUPT(child->i_mount, !xfs_inode_has_attr_fork(child))) { + xfs_inode_mark_sick(child, XFS_SICK_INO_PARENT); + return -EFSCORRUPTED; + } + + return xfs_iread_extents(tp, child, XFS_ATTR_FORK); +} + +/* Add a parent pointer to reflect a dirent addition. */ +int +xfs_parent_addname( + struct xfs_trans *tp, + struct xfs_parent_args *ppargs, + struct xfs_inode *dp, + const struct xfs_name *parent_name, + struct xfs_inode *child) +{ + int error; + + error = xfs_parent_iread_extents(tp, child); + if (error) + return error; + + xfs_inode_to_parent_rec(&ppargs->rec, dp); + xfs_parent_da_args_init(&ppargs->args, tp, &ppargs->rec, child, + child->i_ino, parent_name); + xfs_attr_defer_add(&ppargs->args, XFS_ATTR_DEFER_SET); + return 0; +} + +/* Remove a parent pointer to reflect a dirent removal. */ +int +xfs_parent_removename( + struct xfs_trans *tp, + struct xfs_parent_args *ppargs, + struct xfs_inode *dp, + const struct xfs_name *parent_name, + struct xfs_inode *child) +{ + int error; + + error = xfs_parent_iread_extents(tp, child); + if (error) + return error; + + xfs_inode_to_parent_rec(&ppargs->rec, dp); + xfs_parent_da_args_init(&ppargs->args, tp, &ppargs->rec, child, + child->i_ino, parent_name); + xfs_attr_defer_add(&ppargs->args, XFS_ATTR_DEFER_REMOVE); + return 0; +} + +/* Replace one parent pointer with another to reflect a rename. */ +int +xfs_parent_replacename( + struct xfs_trans *tp, + struct xfs_parent_args *ppargs, + struct xfs_inode *old_dp, + const struct xfs_name *old_name, + struct xfs_inode *new_dp, + const struct xfs_name *new_name, + struct xfs_inode *child) +{ + int error; + + error = xfs_parent_iread_extents(tp, child); + if (error) + return error; + + xfs_inode_to_parent_rec(&ppargs->rec, old_dp); + xfs_parent_da_args_init(&ppargs->args, tp, &ppargs->rec, child, + child->i_ino, old_name); + + xfs_inode_to_parent_rec(&ppargs->new_rec, new_dp); + ppargs->args.new_name = new_name->name; + ppargs->args.new_namelen = new_name->len; + ppargs->args.new_value = &ppargs->new_rec; + ppargs->args.new_valuelen = sizeof(struct xfs_parent_rec); + xfs_attr_defer_add(&ppargs->args, XFS_ATTR_DEFER_REPLACE); + return 0; +} + +/* + * Extract parent pointer information from any parent pointer xattr into + * @parent_ino/gen. The last two parameters can be NULL pointers. + * + * Returns 0 if this is not a parent pointer xattr at all; or -EFSCORRUPTED for + * garbage. + */ +int +xfs_parent_from_attr( + struct xfs_mount *mp, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + xfs_ino_t *parent_ino, + uint32_t *parent_gen) +{ + const struct xfs_parent_rec *rec = value; + + ASSERT(attr_flags & XFS_ATTR_PARENT); + + if (!xfs_parent_namecheck(attr_flags, name, namelen)) + return -EFSCORRUPTED; + if (!xfs_parent_valuecheck(mp, value, valuelen)) + return -EFSCORRUPTED; + + if (parent_ino) + *parent_ino = be64_to_cpu(rec->p_ino); + if (parent_gen) + *parent_gen = be32_to_cpu(rec->p_gen); + return 0; +} + +/* + * Look up a parent pointer record (@parent_name -> @pptr) of @ip. + * + * Caller must hold at least ILOCK_SHARED. The scratchpad need not be + * initialized. + * + * Returns 0 if the pointer is found, -ENOATTR if there is no match, or a + * negative errno. 
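The lookup machinery above depends on the attr name hash being stable across hardlinks: xfs_parent_hashval(), shown earlier, XORs both 32-bit halves of the parent inumber into the ordinary dirent name hash so that identically named links under different parents hash differently. A toy restatement with upper_32_bits()/lower_32_bits() expanded by hand:

	#include <stdint.h>

	/* Mix a 32-bit dirent name hash with the 64-bit parent inumber. */
	static uint32_t pptr_hashval(uint32_t name_hash, uint64_t parent_ino)
	{
		return name_hash ^ (uint32_t)(parent_ino >> 32) ^
				(uint32_t)parent_ino;
	}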
+ */ +int +xfs_parent_lookup( + struct xfs_trans *tp, + struct xfs_inode *ip, + const struct xfs_name *parent_name, + struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch) +{ + memset(scratch, 0, sizeof(struct xfs_da_args)); + xfs_parent_da_args_init(scratch, tp, pptr, ip, ip->i_ino, parent_name); + return xfs_attr_get_ilocked(scratch); +} + +/* Sanity-check a parent pointer before we try to perform repairs. */ +static inline bool +xfs_parent_sanity_check( + struct xfs_mount *mp, + const struct xfs_name *parent_name, + const struct xfs_parent_rec *pptr) +{ + if (!xfs_parent_namecheck(XFS_ATTR_PARENT, parent_name->name, + parent_name->len)) + return false; + + if (!xfs_parent_valuecheck(mp, pptr, sizeof(*pptr))) + return false; + + return true; +} + + +/* + * Attach the parent pointer (@parent_name -> @pptr) to @ip immediately. + * Caller must not have a transaction or hold the ILOCK. This is for + * specialized repair functions only. The scratchpad need not be initialized. + */ +int +xfs_parent_set( + struct xfs_inode *ip, + xfs_ino_t owner, + const struct xfs_name *parent_name, + struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch) +{ + if (!xfs_parent_sanity_check(ip->i_mount, parent_name, pptr)) { + ASSERT(0); + return -EFSCORRUPTED; + } + + memset(scratch, 0, sizeof(struct xfs_da_args)); + xfs_parent_da_args_init(scratch, NULL, pptr, ip, owner, parent_name); + return xfs_attr_set(scratch, XFS_ATTRUPDATE_CREATE, false); +} + +/* + * Remove the parent pointer (@parent_name -> @pptr) from @ip immediately. + * Caller must not have a transaction or hold the ILOCK. This is for + * specialized repair functions only. The scratchpad need not be initialized. + */ +int +xfs_parent_unset( + struct xfs_inode *ip, + xfs_ino_t owner, + const struct xfs_name *parent_name, + struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch) +{ + if (!xfs_parent_sanity_check(ip->i_mount, parent_name, pptr)) { + ASSERT(0); + return -EFSCORRUPTED; + } + + memset(scratch, 0, sizeof(struct xfs_da_args)); + xfs_parent_da_args_init(scratch, NULL, pptr, ip, owner, parent_name); + return xfs_attr_set(scratch, XFS_ATTRUPDATE_REMOVE, false); +} diff --git a/fs/xfs/libxfs/xfs_parent.h b/fs/xfs/libxfs/xfs_parent.h new file mode 100644 index 000000000000..b8036527cdc7 --- /dev/null +++ b/fs/xfs/libxfs/xfs_parent.h @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022-2024 Oracle. + * All Rights Reserved. + */ +#ifndef __XFS_PARENT_H__ +#define __XFS_PARENT_H__ + +/* Metadata validators */ +bool xfs_parent_namecheck(unsigned int attr_flags, const void *name, + size_t length); +bool xfs_parent_valuecheck(struct xfs_mount *mp, const void *value, + size_t valuelen); + +xfs_dahash_t xfs_parent_hashval(struct xfs_mount *mp, const uint8_t *name, + int namelen, xfs_ino_t parent_ino); +xfs_dahash_t xfs_parent_hashattr(struct xfs_mount *mp, const uint8_t *name, + int namelen, const void *value, int valuelen); + +/* Initializes a xfs_parent_rec to be stored as an attribute name. */ +static inline void +xfs_parent_rec_init( + struct xfs_parent_rec *rec, + xfs_ino_t ino, + uint32_t gen) +{ + rec->p_ino = cpu_to_be64(ino); + rec->p_gen = cpu_to_be32(gen); +} + +/* Initializes a xfs_parent_rec to be stored as an attribute name. 
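xfs_parent_rec_init() above fills the 12-byte on-disk value whose size xfs_ondisk.h asserts: a big-endian 64-bit inumber followed by a big-endian 32-bit generation, with no padding. A userspace sketch of the same packing done by hand (the kernel uses cpu_to_be64()/cpu_to_be32() on the struct fields):

	#include <stdint.h>

	/* Pack a parent pointer value: be64 inumber then be32 generation. */
	static void pack_parent_rec(unsigned char out[12], uint64_t ino,
				    uint32_t gen)
	{
		int i;

		for (i = 0; i < 8; i++)
			out[i] = (unsigned char)(ino >> (56 - 8 * i));
		for (i = 0; i < 4; i++)
			out[8 + i] = (unsigned char)(gen >> (24 - 8 * i));
	}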
*/ +static inline void +xfs_inode_to_parent_rec( + struct xfs_parent_rec *rec, + const struct xfs_inode *dp) +{ + xfs_parent_rec_init(rec, dp->i_ino, VFS_IC(dp)->i_generation); +} + +extern struct kmem_cache *xfs_parent_args_cache; + +/* + * Parent pointer information needed to pass around the deferred xattr update + * machinery. + */ +struct xfs_parent_args { + struct xfs_parent_rec rec; + struct xfs_parent_rec new_rec; + struct xfs_da_args args; +}; + +/* + * Start a parent pointer update by allocating the context object we need to + * perform a parent pointer update. + */ +static inline int +xfs_parent_start( + struct xfs_mount *mp, + struct xfs_parent_args **ppargsp) +{ + if (!xfs_has_parent(mp)) { + *ppargsp = NULL; + return 0; + } + + *ppargsp = kmem_cache_zalloc(xfs_parent_args_cache, GFP_KERNEL); + if (!*ppargsp) + return -ENOMEM; + return 0; +} + +/* Finish a parent pointer update by freeing the context object. */ +static inline void +xfs_parent_finish( + struct xfs_mount *mp, + struct xfs_parent_args *ppargs) +{ + if (ppargs) + kmem_cache_free(xfs_parent_args_cache, ppargs); +} + +int xfs_parent_addname(struct xfs_trans *tp, struct xfs_parent_args *ppargs, + struct xfs_inode *dp, const struct xfs_name *parent_name, + struct xfs_inode *child); +int xfs_parent_removename(struct xfs_trans *tp, struct xfs_parent_args *ppargs, + struct xfs_inode *dp, const struct xfs_name *parent_name, + struct xfs_inode *child); +int xfs_parent_replacename(struct xfs_trans *tp, + struct xfs_parent_args *ppargs, + struct xfs_inode *old_dp, const struct xfs_name *old_name, + struct xfs_inode *new_dp, const struct xfs_name *new_name, + struct xfs_inode *child); + +int xfs_parent_from_attr(struct xfs_mount *mp, unsigned int attr_flags, + const unsigned char *name, unsigned int namelen, + const void *value, unsigned int valuelen, + xfs_ino_t *parent_ino, uint32_t *parent_gen); + +/* Repair functions */ +int xfs_parent_lookup(struct xfs_trans *tp, struct xfs_inode *ip, + const struct xfs_name *name, struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch); +int xfs_parent_set(struct xfs_inode *ip, xfs_ino_t owner, + const struct xfs_name *name, struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch); +int xfs_parent_unset(struct xfs_inode *ip, xfs_ino_t owner, + const struct xfs_name *name, struct xfs_parent_rec *pptr, + struct xfs_da_args *scratch); + +#endif /* __XFS_PARENT_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index f246d6dbf4ec..386b672c5058 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1168,3 +1168,60 @@ xfs_rtsummary_wordcount( blocks = xfs_rtsummary_blockcount(mp, rsumlevels, rbmblocks); return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG; } + +/* + * Lock both realtime free space metadata inodes for a freespace update. If a + * transaction is given, the inodes will be joined to the transaction and the + * ILOCKs will be released on transaction commit. + */ +void +xfs_rtbitmap_lock( + struct xfs_trans *tp, + struct xfs_mount *mp) +{ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); + if (tp) + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); + if (tp) + xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); +} + +/* Unlock both realtime free space metadata inodes after a freespace update. 
*/ +void +xfs_rtbitmap_unlock( + struct xfs_mount *mp) +{ + xfs_iunlock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); +} + +/* + * Lock the realtime free space metadata inodes for a freespace scan. Callers + * must walk metadata blocks in order of increasing file offset. + */ +void +xfs_rtbitmap_lock_shared( + struct xfs_mount *mp, + unsigned int rbmlock_flags) +{ + if (rbmlock_flags & XFS_RBMLOCK_BITMAP) + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + + if (rbmlock_flags & XFS_RBMLOCK_SUMMARY) + xfs_ilock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM); +} + +/* Unlock the realtime free space metadata inodes after a freespace scan. */ +void +xfs_rtbitmap_unlock_shared( + struct xfs_mount *mp, + unsigned int rbmlock_flags) +{ + if (rbmlock_flags & XFS_RBMLOCK_SUMMARY) + xfs_iunlock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM); + + if (rbmlock_flags & XFS_RBMLOCK_BITMAP) + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); +} diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index 152a66750af5..6186585f2c37 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -360,6 +360,19 @@ xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp, unsigned int rsumlevels, xfs_extlen_t rbmblocks); unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp, unsigned int rsumlevels, xfs_extlen_t rbmblocks); + +void xfs_rtbitmap_lock(struct xfs_trans *tp, struct xfs_mount *mp); +void xfs_rtbitmap_unlock(struct xfs_mount *mp); + +/* Lock the rt bitmap inode in shared mode */ +#define XFS_RBMLOCK_BITMAP (1U << 0) +/* Lock the rt summary inode in shared mode */ +#define XFS_RBMLOCK_SUMMARY (1U << 1) + +void xfs_rtbitmap_lock_shared(struct xfs_mount *mp, + unsigned int rbmlock_flags); +void xfs_rtbitmap_unlock_shared(struct xfs_mount *mp, + unsigned int rbmlock_flags); #else /* CONFIG_XFS_RT */ # define xfs_rtfree_extent(t,b,l) (-ENOSYS) # define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS) @@ -378,6 +391,10 @@ xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) # define xfs_rtbitmap_wordcount(mp, r) (0) # define xfs_rtsummary_blockcount(mp, l, b) (0) # define xfs_rtsummary_wordcount(mp, l, b) (0) +# define xfs_rtbitmap_lock(tp, mp) do { } while (0) +# define xfs_rtbitmap_unlock(mp) do { } while (0) +# define xfs_rtbitmap_lock_shared(mp, lf) do { } while (0) +# define xfs_rtbitmap_unlock_shared(mp, lf) do { } while (0) #endif /* CONFIG_XFS_RT */ #endif /* __XFS_RTBITMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 73a4b895de67..09e4bf949bf8 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -26,6 +26,7 @@ #include "xfs_health.h" #include "xfs_ag.h" #include "xfs_rtbitmap.h" +#include "xfs_exchrange.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. 
@@ -175,6 +176,10 @@ xfs_sb_version_to_features( features |= XFS_FEAT_NEEDSREPAIR; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64) features |= XFS_FEAT_NREXT64; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_EXCHRANGE) + features |= XFS_FEAT_EXCHANGE_RANGE; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_PARENT) + features |= XFS_FEAT_PARENT; return features; } @@ -1251,6 +1256,8 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME; if (xfs_has_inobtcounts(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT; + if (xfs_has_parent(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_PARENT; if (xfs_has_sector(mp)) { geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR; geo->logsectsize = sbp->sb_logsectsize; @@ -1259,6 +1266,8 @@ xfs_fs_geometry( } if (xfs_has_large_extent_counts(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64; + if (xfs_has_exchange_range(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE; geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index dfd61fa8332e..34f104ed372c 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -124,7 +124,6 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_TRANS_RES_FDBLKS (1u << 6) /* Transaction contains an intent done log item */ #define XFS_TRANS_HAS_INTENT_DONE (1u << 7) - /* * LOWMODE is used by the allocator to activate the lowspace algorithm - when * free space is running low the extent allocator may choose to allocate an @@ -136,7 +135,10 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, * for free space from AG 0. If the correct transaction reservations have been * made then this algorithm will eventually find all the space it needs. */ -#define XFS_TRANS_LOWMODE 0x100 /* allocate in low space mode */ +#define XFS_TRANS_LOWMODE (1u << 8) + +/* Transaction has locked the rtbitmap and rtsum inodes */ +#define XFS_TRANS_RTBITMAP_LOCKED (1u << 9) /* * Field values for xfs_trans_mod_sb. diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index ffb1317a9212..f228127a88ff 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -169,7 +169,8 @@ xfs_symlink_local_to_remote( struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, - struct xfs_ifork *ifp) + struct xfs_ifork *ifp, + void *priv) { struct xfs_mount *mp = ip->i_mount; char *buf; @@ -310,6 +311,7 @@ int xfs_symlink_write_target( struct xfs_trans *tp, struct xfs_inode *ip, + xfs_ino_t owner, const char *target_path, int pathlen, xfs_fsblock_t fs_blocks, @@ -364,8 +366,7 @@ xfs_symlink_write_target( byte_cnt = min(byte_cnt, pathlen); buf = bp->b_addr; - buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, byte_cnt, - bp); + buf += xfs_symlink_hdr_set(mp, owner, offset, byte_cnt, bp); memcpy(buf, cur_chunk, byte_cnt); @@ -380,3 +381,50 @@ xfs_symlink_write_target( ASSERT(pathlen == 0); return 0; } + +/* Remove all the blocks from a symlink and invalidate buffers. */ +int +xfs_symlink_remote_truncate( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *bp; + int nmaps = XFS_SYMLINK_MAPS; + int done = 0; + int i; + int error; + + /* Read mappings and invalidate buffers. 
*/ + error = xfs_bmapi_read(ip, 0, XFS_MAX_FILEOFF, mval, &nmaps, 0); + if (error) + return error; + + for (i = 0; i < nmaps; i++) { + if (!xfs_bmap_is_real_extent(&mval[i])) + break; + + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), + XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0, + &bp); + if (error) + return error; + + xfs_trans_binval(tp, bp); + } + + /* Unmap the remote blocks. */ + error = xfs_bunmapi(tp, ip, 0, XFS_MAX_FILEOFF, 0, nmaps, &done); + if (error) + return error; + if (!done) { + ASSERT(done); + xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); + return -EFSCORRUPTED; + } + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_symlink_remote.h b/fs/xfs/libxfs/xfs_symlink_remote.h index a63bd38ae4fa..c1672fe1f17b 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.h +++ b/fs/xfs/libxfs/xfs_symlink_remote.h @@ -16,11 +16,13 @@ int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, uint32_t size, struct xfs_buf *bp); void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_inode *ip, struct xfs_ifork *ifp); + struct xfs_inode *ip, struct xfs_ifork *ifp, + void *priv); xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size); int xfs_symlink_remote_read(struct xfs_inode *ip, char *link); int xfs_symlink_write_target(struct xfs_trans *tp, struct xfs_inode *ip, - const char *target_path, int pathlen, xfs_fsblock_t fs_blocks, - uint resblks); + xfs_ino_t owner, const char *target_path, int pathlen, + xfs_fsblock_t fs_blocks, uint resblks); +int xfs_symlink_remote_truncate(struct xfs_trans *tp, struct xfs_inode *ip); #endif /* __XFS_SYMLINK_REMOTE_H */ diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 6cd45e8c118d..6dbe6e7251e7 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -20,6 +20,9 @@ #include "xfs_qm.h" #include "xfs_trans_space.h" #include "xfs_rtbitmap.h" +#include "xfs_attr_item.h" +#include "xfs_log.h" +#include "xfs_da_format.h" #define _ALLOC true #define _FREE false @@ -422,29 +425,110 @@ xfs_calc_itruncate_reservation_minlogsize( return xfs_calc_itruncate_reservation(mp, true); } +static inline unsigned int xfs_calc_pptr_link_overhead(void) +{ + return sizeof(struct xfs_attri_log_format) + + xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) + + xlog_calc_iovec_len(MAXNAMELEN - 1); +} +static inline unsigned int xfs_calc_pptr_unlink_overhead(void) +{ + return sizeof(struct xfs_attri_log_format) + + xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) + + xlog_calc_iovec_len(MAXNAMELEN - 1); +} +static inline unsigned int xfs_calc_pptr_replace_overhead(void) +{ + return sizeof(struct xfs_attri_log_format) + + xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) + + xlog_calc_iovec_len(MAXNAMELEN - 1) + + xlog_calc_iovec_len(sizeof(struct xfs_parent_rec)) + + xlog_calc_iovec_len(MAXNAMELEN - 1); +} + /* * In renaming files we can modify: * the five inodes involved: 5 * inode size * the two directory btrees: 2 * (max depth + v2) * dir block size * the two directory bmap btrees: 2 * max depth * block size * And the bmap_finish transaction can free dir and bmap blocks (two sets - * of bmap blocks) giving: + * of bmap blocks) giving (t2): * the agf for the ags in which the blocks live: 3 * sector size * the agfl for the ags in which the blocks live: 3 * sector size * the superblock for the free block count: 
sector size * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size + * If parent pointers are enabled (t3), then each transaction in the chain + * must be capable of setting or removing the extended attribute + * containing the parent information. It must also be able to handle + * the three xattr intent items that track the progress of the parent + * pointer update. */ STATIC uint xfs_calc_rename_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 5) + - xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3), - XFS_FSB_TO_B(mp, 1)))); + unsigned int overhead = XFS_DQUOT_LOGRES(mp); + struct xfs_trans_resv *resp = M_RES(mp); + unsigned int t1, t2, t3 = 0; + + t1 = xfs_calc_inode_res(mp, 5) + + xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1)); + + t2 = xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3), + XFS_FSB_TO_B(mp, 1)); + + if (xfs_has_parent(mp)) { + unsigned int rename_overhead, exchange_overhead; + + t3 = max(resp->tr_attrsetm.tr_logres, + resp->tr_attrrm.tr_logres); + + /* + * For a standard rename, the three xattr intent log items + * are (1) replacing the pptr for the source file; (2) + * removing the pptr on the dest file; and (3) adding a + * pptr for the whiteout file in the src dir. + * + * For a RENAME_EXCHANGE, there are two xattr intent + * items to replace the pptr for both src and dest + * files. Link counts don't change and there is no + * whiteout. + * + * In the worst case we can end up relogging all log + * intent items to allow the log tail to move ahead, so + * they become overhead added to each transaction in a + * processing chain. + */ + rename_overhead = xfs_calc_pptr_replace_overhead() + + xfs_calc_pptr_unlink_overhead() + + xfs_calc_pptr_link_overhead(); + exchange_overhead = 2 * xfs_calc_pptr_replace_overhead(); + + overhead += max(rename_overhead, exchange_overhead); + } + + return overhead + max3(t1, t2, t3); +} + +static inline unsigned int +xfs_rename_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + /* One for the rename, one more for freeing blocks */ + unsigned int ret = XFS_RENAME_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to remove or add one parent pointer. + */ + if (xfs_has_parent(mp)) + ret += max(resp->tr_attrsetm.tr_logcount, + resp->tr_attrrm.tr_logcount); + + return ret; } /* @@ -461,6 +545,23 @@ xfs_calc_iunlink_remove_reservation( 2 * M_IGEO(mp)->inode_cluster_size; } +static inline unsigned int +xfs_link_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_LINK_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to add one parent pointer. 
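All of the converted reservations in this file now share one shape: a fixed overhead term (dquots plus, with parent pointers, the worst-case relogging cost of the pptr intent items), which is paid regardless of which leg of the transaction runs, added to the largest of the mutually exclusive legs t1/t2/t3. Schematically (an illustrative helper, not a kernel function):

	/* overhead applies to every leg; t1/t2/t3 are alternative legs. */
	static unsigned int calc_resv(unsigned int overhead, unsigned int t1,
				      unsigned int t2, unsigned int t3)
	{
		unsigned int m = t1 > t2 ? t1 : t2;

		return overhead + (m > t3 ? m : t3);	/* overhead + max3() */
	}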
+ */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrsetm.tr_logcount; + + return ret; +} + /* * For creating a link to an inode: * the parent directory inode: inode size @@ -477,14 +578,23 @@ STATIC uint xfs_calc_link_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - xfs_calc_iunlink_remove_reservation(mp) + - max((xfs_calc_inode_res(mp, 2) + - xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), - XFS_FSB_TO_B(mp, 1)))); + unsigned int overhead = XFS_DQUOT_LOGRES(mp); + struct xfs_trans_resv *resp = M_RES(mp); + unsigned int t1, t2, t3 = 0; + + overhead += xfs_calc_iunlink_remove_reservation(mp); + t1 = xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); + t2 = xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), + XFS_FSB_TO_B(mp, 1)); + + if (xfs_has_parent(mp)) { + t3 = resp->tr_attrsetm.tr_logres; + overhead += xfs_calc_pptr_link_overhead(); + } + + return overhead + max3(t1, t2, t3); } /* @@ -499,6 +609,23 @@ xfs_calc_iunlink_add_reservation(xfs_mount_t *mp) M_IGEO(mp)->inode_cluster_size; } +static inline unsigned int +xfs_remove_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_REMOVE_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to add one parent pointer. + */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrrm.tr_logcount; + + return ret; +} + /* * For removing a directory entry we can modify: * the parent directory inode: inode size @@ -515,14 +642,24 @@ STATIC uint xfs_calc_remove_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - xfs_calc_iunlink_add_reservation(mp) + - max((xfs_calc_inode_res(mp, 2) + - xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), - XFS_FSB_TO_B(mp, 1)))); + unsigned int overhead = XFS_DQUOT_LOGRES(mp); + struct xfs_trans_resv *resp = M_RES(mp); + unsigned int t1, t2, t3 = 0; + + overhead += xfs_calc_iunlink_add_reservation(mp); + + t1 = xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); + t2 = xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), + XFS_FSB_TO_B(mp, 1)); + + if (xfs_has_parent(mp)) { + t3 = resp->tr_attrrm.tr_logres; + overhead += xfs_calc_pptr_unlink_overhead(); + } + + return overhead + max3(t1, t2, t3); } /* @@ -571,12 +708,40 @@ xfs_calc_icreate_resv_alloc( xfs_calc_finobt_res(mp); } +static inline unsigned int +xfs_icreate_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_CREATE_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to add one parent pointer. 
+ */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrsetm.tr_logcount; + + return ret; +} + STATIC uint -xfs_calc_icreate_reservation(xfs_mount_t *mp) +xfs_calc_icreate_reservation( + struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max(xfs_calc_icreate_resv_alloc(mp), - xfs_calc_create_resv_modify(mp)); + struct xfs_trans_resv *resp = M_RES(mp); + unsigned int overhead = XFS_DQUOT_LOGRES(mp); + unsigned int t1, t2, t3 = 0; + + t1 = xfs_calc_icreate_resv_alloc(mp); + t2 = xfs_calc_create_resv_modify(mp); + + if (xfs_has_parent(mp)) { + t3 = resp->tr_attrsetm.tr_logres; + overhead += xfs_calc_pptr_link_overhead(); + } + + return overhead + max3(t1, t2, t3); } STATIC uint @@ -589,6 +754,23 @@ xfs_calc_create_tmpfile_reservation( return res + xfs_calc_iunlink_add_reservation(mp); } +static inline unsigned int +xfs_mkdir_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_MKDIR_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to add one parent pointer. + */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrsetm.tr_logcount; + + return ret; +} + /* * Making a new directory is the same as creating a new file. */ @@ -599,6 +781,22 @@ xfs_calc_mkdir_reservation( return xfs_calc_icreate_reservation(mp); } +static inline unsigned int +xfs_symlink_log_count( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + unsigned int ret = XFS_SYMLINK_LOG_COUNT; + + /* + * Pre-reserve enough log reservation to handle the transaction + * rolling needed to add one parent pointer. + */ + if (xfs_has_parent(mp)) + ret += resp->tr_attrsetm.tr_logcount; + + return ret; +} /* * Making a new symlink is the same as creating a new file, but @@ -911,54 +1109,76 @@ xfs_calc_sb_reservation( return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); } -void -xfs_trans_resv_calc( +/* + * Namespace reservations. + * + * These get tricky when parent pointers are enabled as we have attribute + * modifications occurring from within these transactions. Rather than confuse + * each of these reservation calculations with the conditional attribute + * reservations, add them here in a clear and concise manner. This requires that + * the attribute reservations have already been calculated. + * + * Note that we only include the static attribute reservation here; the runtime + * reservation will have to be modified by the size of the attributes being + * added/removed/modified. See the comments on the attribute reservation + * calculations for more details. + */ +STATIC void +xfs_calc_namespace_reservations( struct xfs_mount *mp, struct xfs_trans_resv *resp) { - int logcount_adj = 0; - - /* - * The following transactions are logged in physical format and - * require a permanent reservation on space. 
- */ - resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false); - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; - resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - - resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false); - resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; - resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + ASSERT(resp->tr_attrsetm.tr_logres > 0); resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); - resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT; + resp->tr_rename.tr_logcount = xfs_rename_log_count(mp, resp); resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_link.tr_logres = xfs_calc_link_reservation(mp); - resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT; + resp->tr_link.tr_logcount = xfs_link_log_count(mp, resp); resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp); - resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT; + resp->tr_remove.tr_logcount = xfs_remove_log_count(mp, resp); resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp); - resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT; + resp->tr_symlink.tr_logcount = xfs_symlink_log_count(mp, resp); resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_create.tr_logres = xfs_calc_icreate_reservation(mp); - resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT; + resp->tr_create.tr_logcount = xfs_icreate_log_count(mp, resp); resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp); + resp->tr_mkdir.tr_logcount = xfs_mkdir_log_count(mp, resp); + resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; +} + +void +xfs_trans_resv_calc( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + int logcount_adj = 0; + + /* + * The following transactions are logged in physical format and + * require a permanent reservation on space. + */ + resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false); + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false); + resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + resp->tr_create_tmpfile.tr_logres = xfs_calc_create_tmpfile_reservation(mp); resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT; resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp); - resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT; - resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp); resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT; resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES; @@ -988,6 +1208,8 @@ xfs_trans_resv_calc( resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + xfs_calc_namespace_reservations(mp, resp); + /* * The following transactions are logged in logical format with * a default log count. diff --git a/fs/xfs/libxfs/xfs_trans_space.c b/fs/xfs/libxfs/xfs_trans_space.c new file mode 100644 index 000000000000..b9dc3752f702 --- /dev/null +++ b/fs/xfs/libxfs/xfs_trans_space.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. 
+ */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_da_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" + +/* Calculate the disk space required to add a parent pointer. */ +unsigned int +xfs_parent_calc_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + /* + * Parent pointers are always the first attr in an attr tree, and never + * larger than a block + */ + return XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + + XFS_NEXTENTADD_SPACE_RES(mp, namelen, XFS_ATTR_FORK); +} + +unsigned int +xfs_create_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + unsigned int ret; + + ret = XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp, namelen); + if (xfs_has_parent(mp)) + ret += xfs_parent_calc_space_res(mp, namelen); + + return ret; +} + +unsigned int +xfs_mkdir_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + return xfs_create_space_res(mp, namelen); +} + +unsigned int +xfs_link_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + unsigned int ret; + + ret = XFS_DIRENTER_SPACE_RES(mp, namelen); + if (xfs_has_parent(mp)) + ret += xfs_parent_calc_space_res(mp, namelen); + + return ret; +} + +unsigned int +xfs_symlink_space_res( + struct xfs_mount *mp, + unsigned int namelen, + unsigned int fsblocks) +{ + unsigned int ret; + + ret = XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp, namelen) + + fsblocks; + + if (xfs_has_parent(mp)) + ret += xfs_parent_calc_space_res(mp, namelen); + + return ret; +} + +unsigned int +xfs_remove_space_res( + struct xfs_mount *mp, + unsigned int namelen) +{ + unsigned int ret = XFS_DIRREMOVE_SPACE_RES(mp); + + if (xfs_has_parent(mp)) + ret += xfs_parent_calc_space_res(mp, namelen); + + return ret; +} + +unsigned int +xfs_rename_space_res( + struct xfs_mount *mp, + unsigned int src_namelen, + bool target_exists, + unsigned int target_namelen, + bool has_whiteout) +{ + unsigned int ret; + + ret = XFS_DIRREMOVE_SPACE_RES(mp) + + XFS_DIRENTER_SPACE_RES(mp, target_namelen); + + if (xfs_has_parent(mp)) { + if (has_whiteout) + ret += xfs_parent_calc_space_res(mp, src_namelen); + ret += 2 * xfs_parent_calc_space_res(mp, target_namelen); + } + + if (target_exists) + ret += xfs_parent_calc_space_res(mp, target_namelen); + + return ret; +} diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 87b31c69a773..1155ff2d37e2 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -10,6 +10,10 @@ * Components of space reservations. */ +/* Worst case number of bmaps that can be held in a block. */ +#define XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp) \ + (((mp)->m_bmap_dmxr[0]) - ((mp)->m_bmap_dmnr[0])) + /* Worst case number of rmaps that can be held in a block. 
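Of the new helpers above, xfs_rename_space_res() has the most conditional arithmetic. Restated with the XFS_*_SPACE_RES macros abstracted into plain parameters (the parameter names are illustrative, not kernel symbols; the sketch mirrors the ordering of terms in the function above, including the target-exists term sitting outside the parent-pointer branch):

	#include <stdbool.h>

	static unsigned int rename_space_res(unsigned int dirremove,
			unsigned int direnter_tgt, unsigned int pptr_src,
			unsigned int pptr_tgt, bool has_parent,
			bool target_exists, bool has_whiteout)
	{
		unsigned int ret = dirremove + direnter_tgt;

		if (has_parent) {
			if (has_whiteout)
				ret += pptr_src;	/* whiteout gains a pptr */
			ret += 2 * pptr_tgt;		/* src and dst pptr updates */
		}
		if (target_exists)	/* unconditional, as in the function above */
			ret += pptr_tgt;

		return ret;
	}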
*/ #define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \ (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0])) @@ -76,31 +80,32 @@ /* This macro is not used - see inline code in xfs_attr_set */ #define XFS_ATTRSET_SPACE_RES(mp, v) \ (XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v)) -#define XFS_CREATE_SPACE_RES(mp,nl) \ - (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) #define XFS_DIOSTRAT_SPACE_RES(mp, v) \ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v)) #define XFS_GROWFS_SPACE_RES(mp) \ (2 * (mp)->m_alloc_maxlevels) #define XFS_GROWFSRT_SPACE_RES(mp,b) \ ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK)) -#define XFS_LINK_SPACE_RES(mp,nl) \ - XFS_DIRENTER_SPACE_RES(mp,nl) -#define XFS_MKDIR_SPACE_RES(mp,nl) \ - (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) #define XFS_QM_DQALLOC_SPACE_RES(mp) \ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \ XFS_DQUOT_CLUSTER_SIZE_FSB) #define XFS_QM_QINOCREATE_SPACE_RES(mp) \ XFS_IALLOC_SPACE_RES(mp) -#define XFS_REMOVE_SPACE_RES(mp) \ - XFS_DIRREMOVE_SPACE_RES(mp) -#define XFS_RENAME_SPACE_RES(mp,nl) \ - (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) -#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ - (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) #define XFS_IFREE_SPACE_RES(mp) \ (xfs_has_finobt(mp) ? M_IGEO(mp)->inobt_maxlevels : 0) +unsigned int xfs_parent_calc_space_res(struct xfs_mount *mp, + unsigned int namelen); + +unsigned int xfs_create_space_res(struct xfs_mount *mp, unsigned int namelen); +unsigned int xfs_mkdir_space_res(struct xfs_mount *mp, unsigned int namelen); +unsigned int xfs_link_space_res(struct xfs_mount *mp, unsigned int namelen); +unsigned int xfs_symlink_space_res(struct xfs_mount *mp, unsigned int namelen, + unsigned int fsblocks); +unsigned int xfs_remove_space_res(struct xfs_mount *mp, unsigned int namelen); + +unsigned int xfs_rename_space_res(struct xfs_mount *mp, + unsigned int src_namelen, bool target_exists, + unsigned int target_namelen, bool has_whiteout); #endif /* __XFS_TRANS_SPACE_H__ */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index e954f07679dd..f8e5b67128d2 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -15,6 +15,7 @@ #include "xfs_ialloc.h" #include "xfs_rmap.h" #include "xfs_ag.h" +#include "xfs_inode.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -165,8 +166,7 @@ xchk_superblock( xchk_block_set_corrupt(sc, bp); /* Check sb_versionnum bits that are set at mkfs time. */ - vernum_mask = cpu_to_be16(~XFS_SB_VERSION_OKBITS | - XFS_SB_VERSION_NUMBITS | + vernum_mask = cpu_to_be16(XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALIGNBIT | XFS_SB_VERSION_DALIGNBIT | XFS_SB_VERSION_SHAREDBIT | @@ -865,6 +865,43 @@ xchk_agi_xref( /* scrub teardown will take care of sc->sa for us */ } +/* + * Check the unlinked buckets for links to bad inodes. We hold the AGI, so + * there cannot be any threads updating unlinked list pointers in this AG. 
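The central invariant xchk_iunlink() tests is positional: the AGI carries a fixed array of unlinked-list heads, and an inode may only be chained from the bucket selected by its per-AG inode number modulo the bucket count. Standalone (64 buckets, per the on-disk XFS_AGI_UNLINKED_BUCKETS definition):

	#include <stdbool.h>
	#include <stdint.h>

	#define AGI_UNLINKED_BUCKETS	64	/* XFS_AGI_UNLINKED_BUCKETS */

	/* A chain rooted at bucket i may only contain inodes hashing to i. */
	static bool iunlink_bucket_ok(uint32_t agino, unsigned int bucket)
	{
		return agino % AGI_UNLINKED_BUCKETS == bucket;
	}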
+ */ +STATIC void +xchk_iunlink( + struct xfs_scrub *sc, + struct xfs_agi *agi) +{ + unsigned int i; + struct xfs_inode *ip; + + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { + xfs_agino_t agino = be32_to_cpu(agi->agi_unlinked[i]); + + while (agino != NULLAGINO) { + if (agino % XFS_AGI_UNLINKED_BUCKETS != i) { + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + return; + } + + ip = xfs_iunlink_lookup(sc->sa.pag, agino); + if (!ip) { + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + return; + } + + if (!xfs_inode_on_unlinked_list(ip)) { + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + return; + } + + agino = ip->i_next_unlinked; + } + } +} + /* Scrub the AGI. */ int xchk_agi( @@ -949,6 +986,8 @@ xchk_agi( if (pag->pagi_freecount != be32_to_cpu(agi->agi_freecount)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); + xchk_iunlink(sc, agi); + xchk_agi_xref(sc); out: return error; diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 427054b65b23..0dbc484b182f 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -21,13 +21,18 @@ #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" #include "xfs_ag.h" +#include "xfs_inode.h" +#include "xfs_iunlink_item.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" #include "scrub/agb_bitmap.h" +#include "scrub/agino_bitmap.h" #include "scrub/reap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" /* Superblock */ @@ -796,15 +801,57 @@ enum { XREP_AGI_MAX }; +#define XREP_AGI_LOOKUP_BATCH 32 + +struct xrep_agi { + struct xfs_scrub *sc; + + /* AGI buffer, tracked separately */ + struct xfs_buf *agi_bp; + + /* context for finding btree roots */ + struct xrep_find_ag_btree fab[XREP_AGI_MAX]; + + /* old AGI contents in case we have to revert */ + struct xfs_agi old_agi; + + /* bitmap of which inodes are unlinked */ + struct xagino_bitmap iunlink_bmp; + + /* heads of the unlinked inode bucket lists */ + xfs_agino_t iunlink_heads[XFS_AGI_UNLINKED_BUCKETS]; + + /* scratchpad for batched lookups of the radix tree */ + struct xfs_inode *lookup_batch[XREP_AGI_LOOKUP_BATCH]; + + /* Map of ino -> next_ino for unlinked inode processing. */ + struct xfarray *iunlink_next; + + /* Map of ino -> prev_ino for unlinked inode processing. */ + struct xfarray *iunlink_prev; +}; + +static void +xrep_agi_buf_cleanup( + void *buf) +{ + struct xrep_agi *ragi = buf; + + xfarray_destroy(ragi->iunlink_prev); + xfarray_destroy(ragi->iunlink_next); + xagino_bitmap_destroy(&ragi->iunlink_bmp); +} + /* * Given the inode btree roots described by *fab, find the roots, check them * for sanity, and pass the root data back out via *fab. 
*/ STATIC int xrep_agi_find_btrees( - struct xfs_scrub *sc, - struct xrep_find_ag_btree *fab) + struct xrep_agi *ragi) { + struct xfs_scrub *sc = ragi->sc; + struct xrep_find_ag_btree *fab = ragi->fab; struct xfs_buf *agf_bp; struct xfs_mount *mp = sc->mp; int error; @@ -837,10 +884,11 @@ xrep_agi_find_btrees( */ STATIC void xrep_agi_init_header( - struct xfs_scrub *sc, - struct xfs_buf *agi_bp, - struct xfs_agi *old_agi) + struct xrep_agi *ragi) { + struct xfs_scrub *sc = ragi->sc; + struct xfs_buf *agi_bp = ragi->agi_bp; + struct xfs_agi *old_agi = &ragi->old_agi; struct xfs_agi *agi = agi_bp->b_addr; struct xfs_perag *pag = sc->sa.pag; struct xfs_mount *mp = sc->mp; @@ -856,10 +904,6 @@ xrep_agi_init_header( if (xfs_has_crc(mp)) uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); - /* We don't know how to fix the unlinked list yet. */ - memcpy(&agi->agi_unlinked, &old_agi->agi_unlinked, - sizeof(agi->agi_unlinked)); - /* Mark the incore AGF data stale until we're done fixing things. */ ASSERT(xfs_perag_initialised_agi(pag)); clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); @@ -868,10 +912,12 @@ xrep_agi_init_header( /* Set btree root information in an AGI. */ STATIC void xrep_agi_set_roots( - struct xfs_scrub *sc, - struct xfs_agi *agi, - struct xrep_find_ag_btree *fab) + struct xrep_agi *ragi) { + struct xfs_scrub *sc = ragi->sc; + struct xfs_agi *agi = ragi->agi_bp->b_addr; + struct xrep_find_ag_btree *fab = ragi->fab; + agi->agi_root = cpu_to_be32(fab[XREP_AGI_INOBT].root); agi->agi_level = cpu_to_be32(fab[XREP_AGI_INOBT].height); @@ -884,9 +930,10 @@ xrep_agi_set_roots( /* Update the AGI counters. */ STATIC int xrep_agi_calc_from_btrees( - struct xfs_scrub *sc, - struct xfs_buf *agi_bp) + struct xrep_agi *ragi) { + struct xfs_scrub *sc = ragi->sc; + struct xfs_buf *agi_bp = ragi->agi_bp; struct xfs_btree_cur *cur; struct xfs_agi *agi = agi_bp->b_addr; struct xfs_mount *mp = sc->mp; @@ -928,12 +975,721 @@ err: return error; } +/* + * Record a forwards unlinked chain pointer from agino -> next_agino in our + * staging information. + */ +static inline int +xrep_iunlink_store_next( + struct xrep_agi *ragi, + xfs_agino_t agino, + xfs_agino_t next_agino) +{ + ASSERT(next_agino != 0); + + return xfarray_store(ragi->iunlink_next, agino, &next_agino); +} + +/* + * Record a backwards unlinked chain pointer from prev_ino <- agino in our + * staging information. + */ +static inline int +xrep_iunlink_store_prev( + struct xrep_agi *ragi, + xfs_agino_t agino, + xfs_agino_t prev_agino) +{ + ASSERT(prev_agino != 0); + + return xfarray_store(ragi->iunlink_prev, agino, &prev_agino); +} + +/* + * Given an @agino, look up the next inode in the iunlink bucket. Returns + * NULLAGINO if we're at the end of the chain, 0 if @agino is not in memory + * like it should be, or a per-AG inode number. + */ +static inline xfs_agino_t +xrep_iunlink_next( + struct xfs_scrub *sc, + xfs_agino_t agino) +{ + struct xfs_inode *ip; + + ip = xfs_iunlink_lookup(sc->sa.pag, agino); + if (!ip) + return 0; + + return ip->i_next_unlinked; +} + +/* + * Load the inode @agino into memory, set its i_prev_unlinked, and drop the + * inode so it can be inactivated. Returns NULLAGINO if we're at the end of + * the chain or if we should stop walking the chain due to corruption; or a + * per-AG inode number. 
+ */ +STATIC xfs_agino_t +xrep_iunlink_reload_next( + struct xrep_agi *ragi, + xfs_agino_t prev_agino, + xfs_agino_t agino) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_inode *ip; + xfs_ino_t ino; + xfs_agino_t ret = NULLAGINO; + int error; + + ino = XFS_AGINO_TO_INO(sc->mp, sc->sa.pag->pag_agno, agino); + error = xchk_iget(ragi->sc, ino, &ip); + if (error) + return ret; + + trace_xrep_iunlink_reload_next(ip, prev_agino); + + /* If this is a linked inode, stop processing the chain. */ + if (VFS_I(ip)->i_nlink != 0) { + xrep_iunlink_store_next(ragi, agino, NULLAGINO); + goto rele; + } + + ip->i_prev_unlinked = prev_agino; + ret = ip->i_next_unlinked; + + /* + * Drop the inode reference that we just took. We hold the AGI, so + * this inode cannot move off the unlinked list and hence cannot be + * reclaimed. + */ +rele: + xchk_irele(sc, ip); + return ret; +} + +/* + * Walk an AGI unlinked bucket's list to load incore any unlinked inodes that + * still existed at mount time. This can happen if iunlink processing fails + * during log recovery. + */ +STATIC int +xrep_iunlink_walk_ondisk_bucket( + struct xrep_agi *ragi, + unsigned int bucket) +{ + struct xfs_scrub *sc = ragi->sc; + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + xfs_agino_t prev_agino = NULLAGINO; + xfs_agino_t next_agino; + int error = 0; + + next_agino = be32_to_cpu(agi->agi_unlinked[bucket]); + while (next_agino != NULLAGINO) { + xfs_agino_t agino = next_agino; + + if (xchk_should_terminate(ragi->sc, &error)) + return error; + + trace_xrep_iunlink_walk_ondisk_bucket(sc->sa.pag, bucket, + prev_agino, agino); + + if (bucket != agino % XFS_AGI_UNLINKED_BUCKETS) + break; + + next_agino = xrep_iunlink_next(sc, agino); + if (!next_agino) + next_agino = xrep_iunlink_reload_next(ragi, prev_agino, + agino); + + prev_agino = agino; + } + + return 0; +} + +/* Decide if this is an unlinked inode in this AG. */ +STATIC bool +xrep_iunlink_igrab( + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = pag->pag_mount; + + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + return false; + + if (!xfs_inode_on_unlinked_list(ip)) + return false; + + return true; +} + +/* + * Mark the given inode in the lookup batch in our unlinked inode bitmap, and + * remember if this inode is the start of the unlinked chain. + */ +STATIC int +xrep_iunlink_visit( + struct xrep_agi *ragi, + unsigned int batch_idx) +{ + struct xfs_mount *mp = ragi->sc->mp; + struct xfs_inode *ip = ragi->lookup_batch[batch_idx]; + xfs_agino_t agino; + unsigned int bucket; + int error; + + ASSERT(XFS_INO_TO_AGNO(mp, ip->i_ino) == ragi->sc->sa.pag->pag_agno); + ASSERT(xfs_inode_on_unlinked_list(ip)); + + agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + bucket = agino % XFS_AGI_UNLINKED_BUCKETS; + + trace_xrep_iunlink_visit(ragi->sc->sa.pag, bucket, + ragi->iunlink_heads[bucket], ip); + + error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1); + if (error) + return error; + + if (ip->i_prev_unlinked == NULLAGINO) { + if (ragi->iunlink_heads[bucket] == NULLAGINO) + ragi->iunlink_heads[bucket] = agino; + } + + return 0; +} + +/* + * Find all incore unlinked inodes so that we can rebuild the unlinked buckets. + * We hold the AGI so there should not be any modifications to the unlinked + * list. 
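The iunlink_next/iunlink_prev xfarrays introduced above stage the repaired list as a pair of flat maps before anything is written back. The core of that staging is deriving back-pointers from a forward-only chain; a toy model over plain arrays (it assumes the chain is finite and acyclic, which the resolver is responsible for ensuring):

	#include <stdint.h>

	#define NIL	UINT32_MAX	/* stands in for NULLAGINO */

	/* Given forward links next[] and a bucket head, fill in prev[]. */
	static void build_prev_links(const uint32_t *next, uint32_t *prev,
				     uint32_t nrecs, uint32_t head)
	{
		uint32_t prev_ino = NIL;
		uint32_t ino;

		for (ino = head; ino != NIL && ino < nrecs; ino = next[ino]) {
			prev[ino] = prev_ino;
			prev_ino = ino;
		}
	}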
+ */ +STATIC int +xrep_iunlink_mark_incore( + struct xrep_agi *ragi) +{ + struct xfs_perag *pag = ragi->sc->sa.pag; + struct xfs_mount *mp = pag->pag_mount; + uint32_t first_index = 0; + bool done = false; + unsigned int nr_found = 0; + + do { + unsigned int i; + int error = 0; + + if (xchk_should_terminate(ragi->sc, &error)) + return error; + + rcu_read_lock(); + + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void **)&ragi->lookup_batch, first_index, + XREP_AGI_LOOKUP_BATCH); + if (!nr_found) { + rcu_read_unlock(); + return 0; + } + + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = ragi->lookup_batch[i]; + + if (done || !xrep_iunlink_igrab(pag, ip)) + ragi->lookup_batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can occur if + * we have inodes in the last block of the AG and we + * are currently pointing to the last inode. + * + * Because we may see inodes that are from the wrong AG + * due to RCU freeing and reallocation, only update the + * index if it lies in this AG. It was a race that led + * us to see this inode, so another lookup from the + * same index will not find it again. + */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = true; + } + + /* unlock now we've grabbed the inodes. */ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!ragi->lookup_batch[i]) + continue; + error = xrep_iunlink_visit(ragi, i); + if (error) + return error; + } + } while (!done); + + return 0; +} + +/* Mark all the unlinked ondisk inodes in this inobt record in iunlink_bmp. */ +STATIC int +xrep_iunlink_mark_ondisk_rec( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_inobt_rec_incore irec; + struct xrep_agi *ragi = priv; + struct xfs_scrub *sc = ragi->sc; + struct xfs_mount *mp = cur->bc_mp; + xfs_agino_t agino; + unsigned int i; + int error = 0; + + xfs_inobt_btrec_to_irec(mp, rec, &irec); + + for (i = 0, agino = irec.ir_startino; + i < XFS_INODES_PER_CHUNK; + i++, agino++) { + struct xfs_inode *ip; + unsigned int len = 1; + + /* Skip free inodes */ + if (XFS_INOBT_MASK(i) & irec.ir_free) + continue; + /* Skip inodes we've seen before */ + if (xagino_bitmap_test(&ragi->iunlink_bmp, agino, &len)) + continue; + + /* + * Skip incore inodes; these were already picked up by + * the _mark_incore step. + */ + rcu_read_lock(); + ip = radix_tree_lookup(&sc->sa.pag->pag_ici_root, agino); + rcu_read_unlock(); + if (ip) + continue; + + /* + * Try to look up this inode. If we can't get it, just move + * on because we haven't actually scrubbed the inobt or the + * inodes yet. + */ + error = xchk_iget(ragi->sc, + XFS_AGINO_TO_INO(mp, sc->sa.pag->pag_agno, + agino), + &ip); + if (error) + continue; + + trace_xrep_iunlink_reload_ondisk(ip); + + if (VFS_I(ip)->i_nlink == 0) + error = xagino_bitmap_set(&ragi->iunlink_bmp, agino, 1); + xchk_irele(sc, ip); + if (error) + break; + } + + return error; +} + +/* + * Find ondisk inodes that are unlinked and not in cache, and mark them in + * iunlink_bmp. We haven't checked the inobt yet, so we don't error out if + * the btree is corrupt. 
+ */
+STATIC void
+xrep_iunlink_mark_ondisk(
+	struct xrep_agi		*ragi)
+{
+	struct xfs_scrub	*sc = ragi->sc;
+	struct xfs_buf		*agi_bp = ragi->agi_bp;
+	struct xfs_btree_cur	*cur;
+	int			error;
+
+	cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
+	error = xfs_btree_query_all(cur, xrep_iunlink_mark_ondisk_rec, ragi);
+	xfs_btree_del_cursor(cur, error);
+}
+
+/*
+ * Walk an iunlink bucket's inode list.  For each inode that should be on this
+ * chain, clear its entry in iunlink_bmp because it's ok and we don't need
+ * to touch it further.
+ */
+STATIC int
+xrep_iunlink_resolve_bucket(
+	struct xrep_agi		*ragi,
+	unsigned int		bucket)
+{
+	struct xfs_scrub	*sc = ragi->sc;
+	struct xfs_inode	*ip;
+	xfs_agino_t		prev_agino = NULLAGINO;
+	xfs_agino_t		next_agino = ragi->iunlink_heads[bucket];
+	int			error = 0;
+
+	while (next_agino != NULLAGINO) {
+		if (xchk_should_terminate(ragi->sc, &error))
+			return error;
+
+		/* Find the next inode in the chain. */
+		ip = xfs_iunlink_lookup(sc->sa.pag, next_agino);
+		if (!ip) {
+			/* Inode not incore?  Terminate the chain. */
+			trace_xrep_iunlink_resolve_uncached(sc->sa.pag,
+					bucket, prev_agino, next_agino);
+
+			next_agino = NULLAGINO;
+			break;
+		}
+
+		if (next_agino % XFS_AGI_UNLINKED_BUCKETS != bucket) {
+			/*
+			 * Inode is in the wrong bucket.  Advance the list,
+			 * but pretend we didn't see this inode.
+			 */
+			trace_xrep_iunlink_resolve_wronglist(sc->sa.pag,
+					bucket, prev_agino, next_agino);
+
+			next_agino = ip->i_next_unlinked;
+			continue;
+		}
+
+		if (!xfs_inode_on_unlinked_list(ip)) {
+			/*
+			 * Incore inode doesn't think this inode is on an
+			 * unlinked list.  This is probably because we reloaded
+			 * it from disk.  Advance the list, but pretend we
+			 * didn't see this inode; we'll fix that later.
+			 */
+			trace_xrep_iunlink_resolve_nolist(sc->sa.pag,
+					bucket, prev_agino, next_agino);
+			next_agino = ip->i_next_unlinked;
+			continue;
+		}
+
+		trace_xrep_iunlink_resolve_ok(sc->sa.pag, bucket, prev_agino,
+				next_agino);
+
+		/*
+		 * Otherwise, this inode's unlinked pointers are ok.  Clear it
+		 * from the unlinked bitmap since we're done with it, and make
+		 * sure the chain is still correct.
+		 */
+		error = xagino_bitmap_clear(&ragi->iunlink_bmp, next_agino, 1);
+		if (error)
+			return error;
+
+		/* Remember the previous inode's next pointer. */
+		if (prev_agino != NULLAGINO) {
+			error = xrep_iunlink_store_next(ragi, prev_agino,
+					next_agino);
+			if (error)
+				return error;
+		}
+
+		/* Remember this inode's previous pointer. */
+		error = xrep_iunlink_store_prev(ragi, next_agino, prev_agino);
+		if (error)
+			return error;
+
+		/* Advance the list and remember this inode. */
+		prev_agino = next_agino;
+		next_agino = ip->i_next_unlinked;
+	}
+
+	/* Update the previous inode's next pointer. */
+	if (prev_agino != NULLAGINO) {
+		error = xrep_iunlink_store_next(ragi, prev_agino, next_agino);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
+
+/* Reinsert this unlinked inode into the head of the staged bucket list. */
+STATIC int
+xrep_iunlink_add_to_bucket(
+	struct xrep_agi		*ragi,
+	xfs_agino_t		agino)
+{
+	xfs_agino_t		current_head;
+	unsigned int		bucket;
+	int			error;
+
+	bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
+
+	/* Point this inode at the current head of the bucket list. */
+	current_head = ragi->iunlink_heads[bucket];
+
+	trace_xrep_iunlink_add_to_bucket(ragi->sc->sa.pag, bucket, agino,
+			current_head);
+
+	error = xrep_iunlink_store_next(ragi, agino, current_head);
+	if (error)
+		return error;
+
+	/* Remember the head inode's previous pointer. */
+	if (current_head != NULLAGINO) {
+		error = xrep_iunlink_store_prev(ragi, current_head, agino);
+		if (error)
+			return error;
+	}
+
+	ragi->iunlink_heads[bucket] = agino;
+	return 0;
+}
+
+/* Reinsert unlinked inodes into the staged iunlink buckets. */
+STATIC int
+xrep_iunlink_add_lost_inodes(
+	uint32_t		start,
+	uint32_t		len,
+	void			*priv)
+{
+	struct xrep_agi		*ragi = priv;
+	int			error;
+
+	for (; len > 0; start++, len--) {
+		error = xrep_iunlink_add_to_bucket(ragi, start);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
+
+/*
+ * Figure out the iunlink bucket values and find inodes that need to be
+ * reinserted into the list.
+ */
+STATIC int
+xrep_iunlink_rebuild_buckets(
+	struct xrep_agi		*ragi)
+{
+	unsigned int		i;
+	int			error;
+
+	/*
+	 * Walk the ondisk AGI unlinked list to find inodes that are on the
+	 * list but aren't in memory.  This can happen if a past log recovery
+	 * tried to clear the iunlinked list but failed.  Our scan rebuilds the
+	 * unlinked list using incore inodes, so we must load and link them
+	 * properly.
+	 */
+	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+		error = xrep_iunlink_walk_ondisk_bucket(ragi, i);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Record all the incore unlinked inodes in iunlink_bmp that we didn't
+	 * find by walking the ondisk iunlink buckets.  This shouldn't happen,
+	 * but we can't risk forgetting an inode somewhere.
+	 */
+	error = xrep_iunlink_mark_incore(ragi);
+	if (error)
+		return error;
+
+	/*
+	 * If there are ondisk inodes that are unlinked and have not been
+	 * loaded into cache, record them in iunlink_bmp.
+	 */
+	xrep_iunlink_mark_ondisk(ragi);
+
+	/*
+	 * Walk each iunlink bucket to (re)construct as much of the incore list
+	 * as would be correct.  For each inode that survives this step, mark
+	 * it clear in iunlink_bmp; we're done with those inodes.
+	 */
+	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+		error = xrep_iunlink_resolve_bucket(ragi, i);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Any unlinked inodes that we didn't find through the bucket list
+	 * walk (or were ignored by the walk) must be inserted into the bucket
+	 * list.  Stage this in memory for now.
+	 */
+	return xagino_bitmap_walk(&ragi->iunlink_bmp,
+			xrep_iunlink_add_lost_inodes, ragi);
+}
+
+/* Update i_next_unlinked for the inode @agino. */
+STATIC int
+xrep_iunlink_relink_next(
+	struct xrep_agi		*ragi,
+	xfarray_idx_t		idx,
+	xfs_agino_t		next_agino)
+{
+	struct xfs_scrub	*sc = ragi->sc;
+	struct xfs_perag	*pag = sc->sa.pag;
+	struct xfs_inode	*ip;
+	xfarray_idx_t		agino = idx - 1;
+	bool			want_rele = false;
+	int			error = 0;
+
+	ip = xfs_iunlink_lookup(pag, agino);
+	if (!ip) {
+		xfs_ino_t	ino;
+		xfs_agino_t	prev_agino;
+
+		/*
+		 * No inode exists in cache.  Load it off the disk so that we
+		 * can reinsert it into the incore unlinked list.
+		 */
+		ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
+		error = xchk_iget(sc, ino, &ip);
+		if (error)
+			return -EFSCORRUPTED;
+
+		want_rele = true;
+
+		/* Set the backward pointer since this just came off disk. */
+		error = xfarray_load(ragi->iunlink_prev, agino, &prev_agino);
+		if (error)
+			goto out_rele;
+
+		trace_xrep_iunlink_relink_prev(ip, prev_agino);
+		ip->i_prev_unlinked = prev_agino;
+	}
+
+	/* Update the forward pointer. */
+	if (ip->i_next_unlinked != next_agino) {
+		error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino);
+		if (error)
+			goto out_rele;
+
+		trace_xrep_iunlink_relink_next(ip, next_agino);
+		ip->i_next_unlinked = next_agino;
+	}
+
+out_rele:
+	/*
+	 * The iunlink lookup doesn't igrab because we hold the AGI buffer lock
+	 * and the inode cannot be reclaimed.  However, if we used iget to load
+	 * a missing inode, we must irele it here.
+	 */
+	if (want_rele)
+		xchk_irele(sc, ip);
+	return error;
+}
+
+/* Update i_prev_unlinked for the inode @agino. */
+STATIC int
+xrep_iunlink_relink_prev(
+	struct xrep_agi		*ragi,
+	xfarray_idx_t		idx,
+	xfs_agino_t		prev_agino)
+{
+	struct xfs_scrub	*sc = ragi->sc;
+	struct xfs_perag	*pag = sc->sa.pag;
+	struct xfs_inode	*ip;
+	xfarray_idx_t		agino = idx - 1;
+	bool			want_rele = false;
+	int			error = 0;
+
+	ASSERT(prev_agino != 0);
+
+	ip = xfs_iunlink_lookup(pag, agino);
+	if (!ip) {
+		xfs_ino_t	ino;
+		xfs_agino_t	next_agino;
+
+		/*
+		 * No inode exists in cache.  Load it off the disk so that we
+		 * can reinsert it into the incore unlinked list.
+		 */
+		ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
+		error = xchk_iget(sc, ino, &ip);
+		if (error)
+			return -EFSCORRUPTED;
+
+		want_rele = true;
+
+		/* Set the forward pointer since this just came off disk. */
+		error = xfarray_load(ragi->iunlink_next, agino, &next_agino);
+		if (error)
+			goto out_rele;
+
+		error = xfs_iunlink_log_inode(sc->tp, ip, pag, next_agino);
+		if (error)
+			goto out_rele;
+
+		trace_xrep_iunlink_relink_next(ip, next_agino);
+		ip->i_next_unlinked = next_agino;
+	}
+
+	/* Update the backward pointer. */
+	if (ip->i_prev_unlinked != prev_agino) {
+		trace_xrep_iunlink_relink_prev(ip, prev_agino);
+		ip->i_prev_unlinked = prev_agino;
+	}
+
+out_rele:
+	/*
+	 * The iunlink lookup doesn't igrab because we hold the AGI buffer lock
+	 * and the inode cannot be reclaimed.  However, if we used iget to load
+	 * a missing inode, we must irele it here.
+	 */
+	if (want_rele)
+		xchk_irele(sc, ip);
+	return error;
+}
+
+/* Log all the iunlink updates we need to finish regenerating the AGI. */
+STATIC int
+xrep_iunlink_commit(
+	struct xrep_agi		*ragi)
+{
+	struct xfs_agi		*agi = ragi->agi_bp->b_addr;
+	xfarray_idx_t		idx = XFARRAY_CURSOR_INIT;
+	xfs_agino_t		agino;
+	unsigned int		i;
+	int			error;
+
+	/* Fix all the forward links */
+	while ((error = xfarray_iter(ragi->iunlink_next, &idx, &agino)) == 1) {
+		error = xrep_iunlink_relink_next(ragi, idx, agino);
+		if (error)
+			return error;
+	}
+
+	/* Fix all the back links */
+	idx = XFARRAY_CURSOR_INIT;
+	while ((error = xfarray_iter(ragi->iunlink_prev, &idx, &agino)) == 1) {
+		error = xrep_iunlink_relink_prev(ragi, idx, agino);
+		if (error)
+			return error;
+	}
+
+	/* Copy the staged iunlink buckets to the new AGI. */
+	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
+		trace_xrep_iunlink_commit_bucket(ragi->sc->sa.pag, i,
+				be32_to_cpu(ragi->old_agi.agi_unlinked[i]),
+				ragi->iunlink_heads[i]);
+
+		agi->agi_unlinked[i] = cpu_to_be32(ragi->iunlink_heads[i]);
+	}
+
+	return 0;
+}
+
 /* Trigger reinitialization of the in-core data. */
 STATIC int
 xrep_agi_commit_new(
-	struct xfs_scrub	*sc,
-	struct xfs_buf		*agi_bp)
+	struct xrep_agi		*ragi)
 {
+	struct xfs_scrub	*sc = ragi->sc;
+	struct xfs_buf		*agi_bp = ragi->agi_bp;
 	struct xfs_perag	*pag;
 	struct xfs_agi		*agi = agi_bp->b_addr;
 
@@ -956,33 +1712,58 @@ xrep_agi_commit_new(
 /* Repair the AGI.
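  *
  * A rough outline of the rebuild sequence below, assuming the helpers
  * behave as their names suggest:
  *
  *	xrep_agi_find_btrees()         - find inobt/finobt roots via rmapbt
  *	xrep_iunlink_rebuild_buckets() - stage unlinked-list state in memory
  *	xrep_agi_init_header()         - reset the ondisk AGI header
  *	xrep_agi_set_roots()           - implant the btree roots we found
  *	xrep_agi_calc_from_btrees()    - recompute inode counters
  *	xrep_iunlink_commit()          - relink inodes, copy bucket heads
  *	xrep_agi_commit_new()          - reinitialize incore AGI state
  *
  * If anything fails after we start rewriting the header, the old AGI
  * contents are restored from old_agi.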
*/ int xrep_agi( - struct xfs_scrub *sc) + struct xfs_scrub *sc) { - struct xrep_find_ag_btree fab[XREP_AGI_MAX] = { - [XREP_AGI_INOBT] = { - .rmap_owner = XFS_RMAP_OWN_INOBT, - .buf_ops = &xfs_inobt_buf_ops, - .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, - }, - [XREP_AGI_FINOBT] = { - .rmap_owner = XFS_RMAP_OWN_INOBT, - .buf_ops = &xfs_finobt_buf_ops, - .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, - }, - [XREP_AGI_END] = { - .buf_ops = NULL - }, - }; - struct xfs_agi old_agi; - struct xfs_mount *mp = sc->mp; - struct xfs_buf *agi_bp; - struct xfs_agi *agi; - int error; + struct xrep_agi *ragi; + struct xfs_mount *mp = sc->mp; + char *descr; + unsigned int i; + int error; /* We require the rmapbt to rebuild anything. */ if (!xfs_has_rmapbt(mp)) return -EOPNOTSUPP; + sc->buf = kzalloc(sizeof(struct xrep_agi), XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + ragi = sc->buf; + ragi->sc = sc; + + ragi->fab[XREP_AGI_INOBT] = (struct xrep_find_ag_btree){ + .rmap_owner = XFS_RMAP_OWN_INOBT, + .buf_ops = &xfs_inobt_buf_ops, + .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, + }; + ragi->fab[XREP_AGI_FINOBT] = (struct xrep_find_ag_btree){ + .rmap_owner = XFS_RMAP_OWN_INOBT, + .buf_ops = &xfs_finobt_buf_ops, + .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, + }; + ragi->fab[XREP_AGI_END] = (struct xrep_find_ag_btree){ + .buf_ops = NULL, + }; + + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) + ragi->iunlink_heads[i] = NULLAGINO; + + xagino_bitmap_init(&ragi->iunlink_bmp); + sc->buf_cleanup = xrep_agi_buf_cleanup; + + descr = xchk_xfile_ag_descr(sc, "iunlinked next pointers"); + error = xfarray_create(descr, 0, sizeof(xfs_agino_t), + &ragi->iunlink_next); + kfree(descr); + if (error) + return error; + + descr = xchk_xfile_ag_descr(sc, "iunlinked prev pointers"); + error = xfarray_create(descr, 0, sizeof(xfs_agino_t), + &ragi->iunlink_prev); + kfree(descr); + if (error) + return error; + /* * Make sure we have the AGI buffer, as scrub might have decided it * was corrupt after xfs_ialloc_read_agi failed with -EFSCORRUPTED. @@ -990,14 +1771,17 @@ xrep_agi( error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &agi_bp, NULL); + XFS_FSS_TO_BB(mp, 1), 0, &ragi->agi_bp, NULL); if (error) return error; - agi_bp->b_ops = &xfs_agi_buf_ops; - agi = agi_bp->b_addr; + ragi->agi_bp->b_ops = &xfs_agi_buf_ops; /* Find the AGI btree roots. */ - error = xrep_agi_find_btrees(sc, fab); + error = xrep_agi_find_btrees(ragi); + if (error) + return error; + + error = xrep_iunlink_rebuild_buckets(ragi); if (error) return error; @@ -1006,18 +1790,21 @@ xrep_agi( return error; /* Start rewriting the header and implant the btrees we found. */ - xrep_agi_init_header(sc, agi_bp, &old_agi); - xrep_agi_set_roots(sc, agi, fab); - error = xrep_agi_calc_from_btrees(sc, agi_bp); + xrep_agi_init_header(ragi); + xrep_agi_set_roots(ragi); + error = xrep_agi_calc_from_btrees(ragi); + if (error) + goto out_revert; + error = xrep_iunlink_commit(ragi); if (error) goto out_revert; /* Reinitialize in-core state. */ - return xrep_agi_commit_new(sc, agi_bp); + return xrep_agi_commit_new(ragi); out_revert: /* Mark the incore AGI state stale and revert the AGI. 
 */
 	clear_bit(XFS_AGSTATE_AGI_INIT, &sc->sa.pag->pag_opstate);
-	memcpy(agi, &old_agi, sizeof(old_agi));
+	memcpy(ragi->agi_bp->b_addr, &ragi->old_agi, sizeof(struct xfs_agi));
 	return error;
 }
diff --git a/fs/xfs/scrub/agino_bitmap.h b/fs/xfs/scrub/agino_bitmap.h
new file mode 100644
index 000000000000..56d7db5f1699
--- /dev/null
+++ b/fs/xfs/scrub/agino_bitmap.h
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_AGINO_BITMAP_H__
+#define __XFS_SCRUB_AGINO_BITMAP_H__
+
+/* Bitmaps, but type-checked for xfs_agino_t */
+
+struct xagino_bitmap {
+	struct xbitmap32	aginobitmap;
+};
+
+static inline void xagino_bitmap_init(struct xagino_bitmap *bitmap)
+{
+	xbitmap32_init(&bitmap->aginobitmap);
+}
+
+static inline void xagino_bitmap_destroy(struct xagino_bitmap *bitmap)
+{
+	xbitmap32_destroy(&bitmap->aginobitmap);
+}
+
+static inline int xagino_bitmap_clear(struct xagino_bitmap *bitmap,
+		xfs_agino_t agino, unsigned int len)
+{
+	return xbitmap32_clear(&bitmap->aginobitmap, agino, len);
+}
+
+static inline int xagino_bitmap_set(struct xagino_bitmap *bitmap,
+		xfs_agino_t agino, unsigned int len)
+{
+	return xbitmap32_set(&bitmap->aginobitmap, agino, len);
+}
+
+static inline bool xagino_bitmap_test(struct xagino_bitmap *bitmap,
+		xfs_agino_t agino, unsigned int *len)
+{
+	return xbitmap32_test(&bitmap->aginobitmap, agino, len);
+}
+
+static inline int xagino_bitmap_walk(struct xagino_bitmap *bitmap,
+		xbitmap32_walk_fn fn, void *priv)
+{
+	return xbitmap32_walk(&bitmap->aginobitmap, fn, priv);
+}
+
+#endif /* __XFS_SCRUB_AGINO_BITMAP_H__ */
diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
index d421b253923e..30295898cc8a 100644
--- a/fs/xfs/scrub/alloc_repair.c
+++ b/fs/xfs/scrub/alloc_repair.c
@@ -778,7 +778,7 @@ xrep_abt_build_new_trees(
 
 	error = xrep_bnobt_sort_records(ra);
 	if (error)
-		return error;
+		goto err_levels;
 
 	/* Load the free space by block number tree. */
 	ra->array_cur = XFARRAY_CURSOR_INIT;
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 83c7feb38714..708334f9b2bd 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -10,16 +10,20 @@
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
 #include "xfs_log_format.h"
+#include "xfs_trans.h"
 #include "xfs_inode.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_attr_sf.h"
+#include "xfs_parent.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/dabtree.h"
 #include "scrub/attr.h"
+#include "scrub/listxattr.h"
+#include "scrub/repair.h"
 
 /* Free the buffers linked from the xattr buffer. */
 static void
@@ -35,6 +39,8 @@ xchk_xattr_buf_cleanup(
 	kvfree(ab->value);
 	ab->value = NULL;
 	ab->value_sz = 0;
+	kvfree(ab->name);
+	ab->name = NULL;
 }
 
 /*
@@ -65,7 +71,7 @@ xchk_xattr_want_freemap(
 * reallocating the buffer if necessary.  Buffer contents are not preserved
 * across a reallocation.
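 *
 * As a rough picture of what lives in struct xchk_xattr_buf (see
 * scrub/attr.h), with the name buffer only allocated for repairs:
 *
 *	usedmap  - bitmap of used space in xattr leaf blocks
 *	freemap  - bitmap of free space in xattr leaf blocks
 *	name     - XATTR_NAME_MAX + 1 bytes for salvaged names (repair only)
 *	value    - value_sz bytes for extracting attr values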
*/ -static int +int xchk_setup_xattr_buf( struct xfs_scrub *sc, size_t value_size) @@ -95,6 +101,12 @@ xchk_setup_xattr_buf( return -ENOMEM; } + if (xchk_could_repair(sc)) { + ab->name = kvmalloc(XATTR_NAME_MAX + 1, XCHK_GFP_FLAGS); + if (!ab->name) + return -ENOMEM; + } + resize_value: if (ab->value_sz >= value_size) return 0; @@ -121,6 +133,12 @@ xchk_setup_xattr( { int error; + if (xchk_could_repair(sc)) { + error = xrep_setup_xattr(sc); + if (error) + return error; + } + /* * We failed to get memory while checking attrs, so this time try to * get all the memory we're ever going to need. Allocate the buffer @@ -137,106 +155,105 @@ xchk_setup_xattr( /* Extended Attributes */ -struct xchk_xattr { - struct xfs_attr_list_context context; - struct xfs_scrub *sc; -}; - /* * Check that an extended attribute key can be looked up by hash. * - * We use the XFS attribute list iterator (i.e. xfs_attr_list_ilocked) - * to call this function for every attribute key in an inode. Once - * we're here, we load the attribute value to see if any errors happen, - * or if we get more or less data than we expected. + * We use the extended attribute walk helper to call this function for every + * attribute key in an inode. Once we're here, we load the attribute value to + * see if any errors happen, or if we get more or less data than we expected. */ -static void -xchk_xattr_listent( - struct xfs_attr_list_context *context, - int flags, - unsigned char *name, - int namelen, - int valuelen) +static int +xchk_xattr_actor( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) { struct xfs_da_args args = { - .op_flags = XFS_DA_OP_NOTIME, - .attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK, - .geo = context->dp->i_mount->m_attr_geo, + .attr_filter = attr_flags & XFS_ATTR_NSP_ONDISK_MASK, + .geo = sc->mp->m_attr_geo, .whichfork = XFS_ATTR_FORK, - .dp = context->dp, + .dp = ip, .name = name, .namelen = namelen, - .hashval = xfs_da_hashname(name, namelen), - .trans = context->tp, + .trans = sc->tp, .valuelen = valuelen, + .owner = ip->i_ino, }; struct xchk_xattr_buf *ab; - struct xchk_xattr *sx; int error = 0; - sx = container_of(context, struct xchk_xattr, context); - ab = sx->sc->buf; + ab = sc->buf; - if (xchk_should_terminate(sx->sc, &error)) { - context->seen_enough = error; - return; + if (xchk_should_terminate(sc, &error)) + return error; + + if (attr_flags & ~XFS_ATTR_ONDISK_MASK) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno); + return -ECANCELED; } - if (flags & XFS_ATTR_INCOMPLETE) { + if (attr_flags & XFS_ATTR_INCOMPLETE) { /* Incomplete attr key, just mark the inode for preening. */ - xchk_ino_set_preen(sx->sc, context->dp->i_ino); - return; + xchk_ino_set_preen(sc, ip->i_ino); + return 0; } - /* Only one namespace bit allowed. */ - if (hweight32(flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) { - xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); - goto fail_xref; + /* Does this name make sense? */ + if (!xfs_attr_namecheck(attr_flags, name, namelen)) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno); + return -ECANCELED; } - /* Does this name make sense? */ - if (!xfs_attr_namecheck(name, namelen)) { - xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); - goto fail_xref; + /* Check parent pointer record. 
*/ + if ((attr_flags & XFS_ATTR_PARENT) && + !xfs_parent_valuecheck(sc->mp, value, valuelen)) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno); + return -ECANCELED; } /* - * Local xattr values are stored in the attr leaf block, so we don't - * need to retrieve the value from a remote block to detect corruption - * problems. + * Try to allocate enough memory to extract the attr value. If that + * doesn't work, return -EDEADLOCK as a signal to try again with a + * maximally sized buffer. */ - if (flags & XFS_ATTR_LOCAL) - goto fail_xref; + error = xchk_setup_xattr_buf(sc, valuelen); + if (error == -ENOMEM) + error = -EDEADLOCK; + if (error) + return error; /* - * Try to allocate enough memory to extrat the attr value. If that - * doesn't work, we overload the seen_enough variable to convey - * the error message back to the main scrub function. + * Parent pointers are matched on attr name and value, so we must + * supply the xfs_parent_rec here when confirming that the dabtree + * indexing works correctly. */ - error = xchk_setup_xattr_buf(sx->sc, valuelen); - if (error == -ENOMEM) - error = -EDEADLOCK; - if (error) { - context->seen_enough = error; - return; - } + if (attr_flags & XFS_ATTR_PARENT) + memcpy(ab->value, value, valuelen); args.value = ab->value; + /* + * Get the attr value to ensure that lookup can find this attribute + * through the dabtree indexing and that remote value retrieval also + * works correctly. + */ + xfs_attr_sethash(&args); error = xfs_attr_get_ilocked(&args); /* ENODATA means the hash lookup failed and the attr is bad */ if (error == -ENODATA) error = -EFSCORRUPTED; - if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, + if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, args.blkno, &error)) - goto fail_xref; + return error; if (args.valuelen != valuelen) - xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, - args.blkno); -fail_xref: - if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - context->seen_enough = 1; - return; + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, args.blkno); + + return 0; } /* @@ -246,7 +263,7 @@ fail_xref: * Within a char, the lowest bit of the char represents the byte with * the smallest address */ -STATIC bool +bool xchk_xattr_set_map( struct xfs_scrub *sc, unsigned long *map, @@ -403,6 +420,17 @@ xchk_xattr_block( xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); hdrsize = xfs_attr3_leaf_hdr_size(leaf); + /* + * Empty xattr leaf blocks mapped at block 0 are probably a byproduct + * of a race between setxattr and a log shutdown. Anywhere else in the + * attr fork is a corruption. + */ + if (leafhdr.count == 0) { + if (blk->blkno == 0) + xchk_da_set_preen(ds, level); + else + xchk_da_set_corrupt(ds, level); + } if (leafhdr.usedbytes > mp->m_attr_geo->blksize) xchk_da_set_corrupt(ds, level); if (leafhdr.firstused > mp->m_attr_geo->blksize) @@ -411,6 +439,8 @@ xchk_xattr_block( xchk_da_set_corrupt(ds, level); if (!xchk_xattr_set_map(ds->sc, ab->usedmap, 0, hdrsize)) xchk_da_set_corrupt(ds, level); + if (leafhdr.holes) + xchk_da_set_preen(ds, level); if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) goto out; @@ -463,7 +493,6 @@ xchk_xattr_rec( xfs_dahash_t hash; int nameidx; int hdrsize; - unsigned int badflags; int error; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); @@ -493,10 +522,15 @@ xchk_xattr_rec( /* Retrieve the entry and check it. 
*/ hash = be32_to_cpu(ent->hashval); - badflags = ~(XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_SECURE | - XFS_ATTR_INCOMPLETE); - if ((ent->flags & badflags) != 0) + if (ent->flags & ~XFS_ATTR_ONDISK_MASK) { + xchk_da_set_corrupt(ds, level); + return 0; + } + if (!xfs_attr_check_namespace(ent->flags)) { xchk_da_set_corrupt(ds, level); + return 0; + } + if (ent->flags & XFS_ATTR_LOCAL) { lentry = (struct xfs_attr_leaf_name_local *) (((char *)bp->b_addr) + nameidx); @@ -504,7 +538,10 @@ xchk_xattr_rec( xchk_da_set_corrupt(ds, level); goto out; } - calc_hash = xfs_da_hashname(lentry->nameval, lentry->namelen); + calc_hash = xfs_attr_hashval(mp, ent->flags, lentry->nameval, + lentry->namelen, + lentry->nameval + lentry->namelen, + be16_to_cpu(lentry->valuelen)); } else { rentry = (struct xfs_attr_leaf_name_remote *) (((char *)bp->b_addr) + nameidx); @@ -512,7 +549,13 @@ xchk_xattr_rec( xchk_da_set_corrupt(ds, level); goto out; } - calc_hash = xfs_da_hashname(rentry->name, rentry->namelen); + if (ent->flags & XFS_ATTR_PARENT) { + xchk_da_set_corrupt(ds, level); + goto out; + } + calc_hash = xfs_attr_hashval(mp, ent->flags, rentry->name, + rentry->namelen, NULL, + be32_to_cpu(rentry->valuelen)); } if (calc_hash != hash) xchk_da_set_corrupt(ds, level); @@ -556,6 +599,15 @@ xchk_xattr_check_sf( break; } + /* + * Shortform entries do not set LOCAL or INCOMPLETE, so the + * only valid flag bits here are for namespaces. + */ + if (sfe->flags & ~XFS_ATTR_NSP_ONDISK_MASK) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + if (!xchk_xattr_set_map(sc, ab->usedmap, (char *)sfe - (char *)sf, sizeof(struct xfs_attr_sf_entry))) { @@ -588,16 +640,6 @@ int xchk_xattr( struct xfs_scrub *sc) { - struct xchk_xattr sx = { - .sc = sc, - .context = { - .dp = sc->ip, - .tp = sc->tp, - .resynch = 1, - .put_listent = xchk_xattr_listent, - .allow_incomplete = true, - }, - }; xfs_dablk_t last_checked = -1U; int error = 0; @@ -626,12 +668,6 @@ xchk_xattr( /* * Look up every xattr in this file by name and hash. * - * Use the backend implementation of xfs_attr_list to call - * xchk_xattr_listent on every attribute key in this inode. - * In other words, we use the same iterator/callback mechanism - * that listattr uses to scrub extended attributes, though in our - * _listent function, we check the value of the attribute. - * * The VFS only locks i_rwsem when modifying attrs, so keep all * three locks held because that's the only way to ensure we're * the only thread poking into the da btree. We traverse the da @@ -639,13 +675,9 @@ xchk_xattr( * iteration, which doesn't really follow the usual buffer * locking order. */ - error = xfs_attr_list_ilocked(&sx.context); + error = xchk_xattr_walk(sc, sc->ip, xchk_xattr_actor, NULL, NULL); if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error)) return error; - /* Did our listent function try to return any errors? */ - if (sx.context.seen_enough < 0) - return sx.context.seen_enough; - return 0; } diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h index 48fd9402c432..7db58af56646 100644 --- a/fs/xfs/scrub/attr.h +++ b/fs/xfs/scrub/attr.h @@ -16,9 +16,16 @@ struct xchk_xattr_buf { /* Bitmap of free space in xattr leaf blocks. */ unsigned long *freemap; + /* Memory buffer used to hold salvaged xattr names. */ + unsigned char *name; + /* Memory buffer used to extract xattr values. 
 */
 	void			*value;
 	size_t			value_sz;
 };
 
+bool xchk_xattr_set_map(struct xfs_scrub *sc, unsigned long *map,
+		unsigned int start, unsigned int len);
+int xchk_setup_xattr_buf(struct xfs_scrub *sc, size_t value_size);
+
 #endif /* __XFS_SCRUB_ATTR_H__ */
diff --git a/fs/xfs/scrub/attr_repair.c b/fs/xfs/scrub/attr_repair.c
new file mode 100644
index 000000000000..c7eb94069caf
--- /dev/null
+++ b/fs/xfs/scrub/attr_repair.c
@@ -0,0 +1,1663 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_exchmaps.h"
+#include "xfs_exchrange.h"
+#include "xfs_acl.h"
+#include "xfs_parent.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/attr.h"
+#include "scrub/reap.h"
+#include "scrub/attr_repair.h"
+
+/*
+ * Extended Attribute Repair
+ * =========================
+ *
+ * We repair extended attributes by reading the attr leaf blocks looking for
+ * attribute entries that look salvageable (name passes verifiers, value can
+ * be retrieved, etc).  Each extended attribute worth salvaging is stashed in
+ * memory, and the stashed entries are periodically replayed into a temporary
+ * file to constrain memory use.  Batching the construction of the temporary
+ * extended attribute structure in this fashion reduces lock cycling of the
+ * file being repaired and the temporary file.
+ *
+ * When salvaging completes, the remaining stashed attributes are replayed to
+ * the temporary file.  An atomic file contents exchange is used to commit the
+ * new xattr blocks to the file being repaired.  This will disrupt attrmulti
+ * cursors.
+ */
+
+struct xrep_xattr_key {
+	/* Cookie for retrieval of the xattr name. */
+	xfblob_cookie		name_cookie;
+
+	/* Cookie for retrieval of the xattr value. */
+	xfblob_cookie		value_cookie;
+
+	/* XFS_ATTR_* flags */
+	int			flags;
+
+	/* Length of the value and name. */
+	uint32_t		valuelen;
+	uint16_t		namelen;
+};
+
+/*
+ * Stash up to 8 pages of attrs in xattr_records/xattr_blobs before we write
+ * them to the temp file.
+ */
+#define XREP_XATTR_MAX_STASH_BYTES	(PAGE_SIZE * 8)
+
+struct xrep_xattr {
+	struct xfs_scrub	*sc;
+
+	/* Information for exchanging attr fork mappings at the end. */
+	struct xrep_tempexch	tx;
+
+	/* xattr keys */
+	struct xfarray		*xattr_records;
+
+	/* xattr values */
+	struct xfblob		*xattr_blobs;
+
+	/* Number of attributes that we are salvaging. */
+	unsigned long long	attrs_found;
+
+	/* Can we flush stashed attrs to the tempfile? */
+	bool			can_flush;
+
+	/* Did the live update fail, and hence the repair is now out of date? */
+	bool			live_update_aborted;
+
+	/* Lock protecting parent pointer updates */
+	struct mutex		lock;
+
+	/* Fixed-size array of xrep_xattr_pptr structures. */
+	struct xfarray		*pptr_recs;
+
+	/* Blobs containing parent pointer names. */
+	struct xfblob		*pptr_names;
+
+	/* Hook to capture parent pointer updates. */
+	struct xfs_dir_hook	dhook;
+
+	/* Scratch buffer for capturing parent pointers. */
+	struct xfs_da_args	pptr_args;
+
+	/* Name buffer */
+	struct xfs_name		xname;
+	char			namebuf[MAXNAMELEN];
+};
+
+/* Create a parent pointer in the tempfile. */
+#define XREP_XATTR_PPTR_ADD	(1)
+
+/* Remove a parent pointer from the tempfile. */
+#define XREP_XATTR_PPTR_REMOVE	(2)
+
+/* A stashed parent pointer update. */
+struct xrep_xattr_pptr {
+	/* Cookie for retrieval of the pptr name. */
+	xfblob_cookie		name_cookie;
+
+	/* Parent pointer record. */
+	struct xfs_parent_rec	pptr_rec;
+
+	/* Length of the pptr name. */
+	uint8_t			namelen;
+
+	/* XREP_XATTR_PPTR_{ADD,REMOVE} */
+	uint8_t			action;
+};
+
+/* Set up to recreate the extended attributes. */
+int
+xrep_setup_xattr(
+	struct xfs_scrub	*sc)
+{
+	if (xfs_has_parent(sc->mp))
+		xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+	return xrep_tempfile_create(sc, S_IFREG);
+}
+
+/*
+ * Decide if we want to salvage this attribute.  We don't bother with
+ * incomplete or oversized keys or values.  The @value parameter can be null
+ * for remote attrs.
+ */
+STATIC bool
+xrep_xattr_want_salvage(
+	struct xrep_xattr	*rx,
+	unsigned int		attr_flags,
+	const void		*name,
+	int			namelen,
+	const void		*value,
+	int			valuelen)
+{
+	if (attr_flags & XFS_ATTR_INCOMPLETE)
+		return false;
+	if (namelen > XATTR_NAME_MAX || namelen <= 0)
+		return false;
+	if (!xfs_attr_namecheck(attr_flags, name, namelen))
+		return false;
+	if (valuelen > XATTR_SIZE_MAX || valuelen < 0)
+		return false;
+	if (attr_flags & XFS_ATTR_PARENT)
+		return xfs_parent_valuecheck(rx->sc->mp, value, valuelen);
+
+	return true;
+}
+
+/* Allocate an in-core record to hold xattrs while we rebuild the xattr data. */
+STATIC int
+xrep_xattr_salvage_key(
+	struct xrep_xattr	*rx,
+	int			flags,
+	unsigned char		*name,
+	int			namelen,
+	unsigned char		*value,
+	int			valuelen)
+{
+	struct xrep_xattr_key	key = {
+		.valuelen	= valuelen,
+		.flags		= flags & XFS_ATTR_NSP_ONDISK_MASK,
+	};
+	unsigned int		i = 0;
+	int			error = 0;
+
+	if (xchk_should_terminate(rx->sc, &error))
+		return error;
+
+	/*
+	 * Truncate the name to the first character that would trip namecheck.
+	 * If we no longer have a name after that, ignore this attribute.
+	 */
+	if (flags & XFS_ATTR_PARENT) {
+		key.namelen = namelen;
+
+		trace_xrep_xattr_salvage_pptr(rx->sc->ip, flags, name,
+				key.namelen, value, valuelen);
+	} else {
+		while (i < namelen && name[i] != 0)
+			i++;
+		if (i == 0)
+			return 0;
+		key.namelen = i;
+
+		trace_xrep_xattr_salvage_rec(rx->sc->ip, flags, name,
+				key.namelen, valuelen);
+	}
+
+	error = xfblob_store(rx->xattr_blobs, &key.name_cookie, name,
+			key.namelen);
+	if (error)
+		return error;
+
+	error = xfblob_store(rx->xattr_blobs, &key.value_cookie, value,
+			key.valuelen);
+	if (error)
+		return error;
+
+	error = xfarray_append(rx->xattr_records, &key);
+	if (error)
+		return error;
+
+	rx->attrs_found++;
+	return 0;
+}
+
+/*
+ * Record a shortform extended attribute key & value for later reinsertion
+ * into the inode.
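+ *
+ * For reference, each shortform entry has approximately this ondisk layout
+ * (simplified from xfs_da_format.h):
+ *
+ *	struct xfs_attr_sf_entry {
+ *		__u8	namelen;	// length of the name, no NULL
+ *		__u8	valuelen;	// length of the value, no NULL
+ *		__u8	flags;		// XFS_ATTR_* namespace bits
+ *		__u8	nameval[];	// name bytes, then value bytes
+ *	};
+ *
+ * which is why the value below starts at &sfe->nameval[sfe->namelen].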
+ */ +STATIC int +xrep_xattr_salvage_sf_attr( + struct xrep_xattr *rx, + struct xfs_attr_sf_hdr *hdr, + struct xfs_attr_sf_entry *sfe) +{ + struct xfs_scrub *sc = rx->sc; + struct xchk_xattr_buf *ab = sc->buf; + unsigned char *name = sfe->nameval; + unsigned char *value = &sfe->nameval[sfe->namelen]; + + if (!xchk_xattr_set_map(sc, ab->usedmap, (char *)name - (char *)hdr, + sfe->namelen)) + return 0; + + if (!xchk_xattr_set_map(sc, ab->usedmap, (char *)value - (char *)hdr, + sfe->valuelen)) + return 0; + + if (!xrep_xattr_want_salvage(rx, sfe->flags, sfe->nameval, + sfe->namelen, value, sfe->valuelen)) + return 0; + + return xrep_xattr_salvage_key(rx, sfe->flags, sfe->nameval, + sfe->namelen, value, sfe->valuelen); +} + +/* + * Record a local format extended attribute key & value for later reinsertion + * into the inode. + */ +STATIC int +xrep_xattr_salvage_local_attr( + struct xrep_xattr *rx, + struct xfs_attr_leaf_entry *ent, + unsigned int nameidx, + const char *buf_end, + struct xfs_attr_leaf_name_local *lentry) +{ + struct xchk_xattr_buf *ab = rx->sc->buf; + unsigned char *value; + unsigned int valuelen; + unsigned int namesize; + + /* + * Decode the leaf local entry format. If something seems wrong, we + * junk the attribute. + */ + value = &lentry->nameval[lentry->namelen]; + valuelen = be16_to_cpu(lentry->valuelen); + namesize = xfs_attr_leaf_entsize_local(lentry->namelen, valuelen); + if ((char *)lentry + namesize > buf_end) + return 0; + if (!xrep_xattr_want_salvage(rx, ent->flags, lentry->nameval, + lentry->namelen, value, valuelen)) + return 0; + if (!xchk_xattr_set_map(rx->sc, ab->usedmap, nameidx, namesize)) + return 0; + + /* Try to save this attribute. */ + return xrep_xattr_salvage_key(rx, ent->flags, lentry->nameval, + lentry->namelen, value, valuelen); +} + +/* + * Record a remote format extended attribute key & value for later reinsertion + * into the inode. + */ +STATIC int +xrep_xattr_salvage_remote_attr( + struct xrep_xattr *rx, + struct xfs_attr_leaf_entry *ent, + unsigned int nameidx, + const char *buf_end, + struct xfs_attr_leaf_name_remote *rentry, + unsigned int ent_idx, + struct xfs_buf *leaf_bp) +{ + struct xchk_xattr_buf *ab = rx->sc->buf; + struct xfs_da_args args = { + .trans = rx->sc->tp, + .dp = rx->sc->ip, + .index = ent_idx, + .geo = rx->sc->mp->m_attr_geo, + .owner = rx->sc->ip->i_ino, + .attr_filter = ent->flags & XFS_ATTR_NSP_ONDISK_MASK, + .namelen = rentry->namelen, + .name = rentry->name, + .value = ab->value, + .valuelen = be32_to_cpu(rentry->valuelen), + }; + unsigned int namesize; + int error; + + /* + * Decode the leaf remote entry format. If something seems wrong, we + * junk the attribute. Note that we should never find a zero-length + * remote attribute value. + */ + namesize = xfs_attr_leaf_entsize_remote(rentry->namelen); + if ((char *)rentry + namesize > buf_end) + return 0; + if (args.valuelen == 0 || + !xrep_xattr_want_salvage(rx, ent->flags, rentry->name, + rentry->namelen, NULL, args.valuelen)) + return 0; + if (!xchk_xattr_set_map(rx->sc, ab->usedmap, nameidx, namesize)) + return 0; + + /* + * Enlarge the buffer (if needed) to hold the value that we're trying + * to salvage from the old extended attribute data. + */ + error = xchk_setup_xattr_buf(rx->sc, args.valuelen); + if (error == -ENOMEM) + error = -EDEADLOCK; + if (error) + return error; + + /* Look up the remote value and stash it for reconstruction. 
*/ + error = xfs_attr3_leaf_getvalue(leaf_bp, &args); + if (error || args.rmtblkno == 0) + goto err_free; + + error = xfs_attr_rmtval_get(&args); + if (error) + goto err_free; + + /* Try to save this attribute. */ + error = xrep_xattr_salvage_key(rx, ent->flags, rentry->name, + rentry->namelen, ab->value, args.valuelen); +err_free: + /* remote value was garbage, junk it */ + if (error == -EFSBADCRC || error == -EFSCORRUPTED) + error = 0; + return error; +} + +/* Extract every xattr key that we can from this attr fork block. */ +STATIC int +xrep_xattr_recover_leaf( + struct xrep_xattr *rx, + struct xfs_buf *bp) +{ + struct xfs_attr3_icleaf_hdr leafhdr; + struct xfs_scrub *sc = rx->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_name_local *lentry; + struct xfs_attr_leaf_name_remote *rentry; + struct xfs_attr_leaf_entry *ent; + struct xfs_attr_leaf_entry *entries; + struct xchk_xattr_buf *ab = rx->sc->buf; + char *buf_end; + size_t off; + unsigned int nameidx; + unsigned int hdrsize; + int i; + int error = 0; + + bitmap_zero(ab->usedmap, mp->m_attr_geo->blksize); + + /* Check the leaf header */ + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); + hdrsize = xfs_attr3_leaf_hdr_size(leaf); + xchk_xattr_set_map(sc, ab->usedmap, 0, hdrsize); + entries = xfs_attr3_leaf_entryp(leaf); + + buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize; + for (i = 0, ent = entries; i < leafhdr.count; ent++, i++) { + if (xchk_should_terminate(sc, &error)) + return error; + + /* Skip key if it conflicts with something else? */ + off = (char *)ent - (char *)leaf; + if (!xchk_xattr_set_map(sc, ab->usedmap, off, + sizeof(xfs_attr_leaf_entry_t))) + continue; + + /* Check the name information. */ + nameidx = be16_to_cpu(ent->nameidx); + if (nameidx < leafhdr.firstused || + nameidx >= mp->m_attr_geo->blksize) + continue; + + if (ent->flags & XFS_ATTR_LOCAL) { + lentry = xfs_attr3_leaf_name_local(leaf, i); + error = xrep_xattr_salvage_local_attr(rx, ent, nameidx, + buf_end, lentry); + } else { + rentry = xfs_attr3_leaf_name_remote(leaf, i); + error = xrep_xattr_salvage_remote_attr(rx, ent, nameidx, + buf_end, rentry, i, bp); + } + if (error) + return error; + } + + return 0; +} + +/* Try to recover shortform attrs. */ +STATIC int +xrep_xattr_recover_sf( + struct xrep_xattr *rx) +{ + struct xfs_scrub *sc = rx->sc; + struct xchk_xattr_buf *ab = sc->buf; + struct xfs_attr_sf_hdr *hdr; + struct xfs_attr_sf_entry *sfe; + struct xfs_attr_sf_entry *next; + struct xfs_ifork *ifp; + unsigned char *end; + int i; + int error = 0; + + ifp = xfs_ifork_ptr(rx->sc->ip, XFS_ATTR_FORK); + hdr = ifp->if_data; + + bitmap_zero(ab->usedmap, ifp->if_bytes); + end = (unsigned char *)ifp->if_data + ifp->if_bytes; + xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(*hdr)); + + sfe = xfs_attr_sf_firstentry(hdr); + if ((unsigned char *)sfe > end) + return 0; + + for (i = 0; i < hdr->count; i++) { + if (xchk_should_terminate(sc, &error)) + return error; + + next = xfs_attr_sf_nextentry(sfe); + if ((unsigned char *)next > end) + break; + + if (xchk_xattr_set_map(sc, ab->usedmap, + (char *)sfe - (char *)hdr, + sizeof(struct xfs_attr_sf_entry))) { + /* + * No conflicts with the sf entry; let's save this + * attribute. + */ + error = xrep_xattr_salvage_sf_attr(rx, hdr, sfe); + if (error) + return error; + } + + sfe = next; + } + + return 0; +} + +/* + * Try to return a buffer of xattr data for a given physical extent. 
+ * + * Because the buffer cache get function complains if it finds a buffer + * matching the block number but not matching the length, we must be careful to + * look for incore buffers (up to the maximum length of a remote value) that + * could be hiding anywhere in the physical range. If we find an incore + * buffer, we can pass that to the caller. Optionally, read a single block and + * pass that back. + * + * Note the subtlety that remote attr value blocks for which there is no incore + * buffer will be passed to the callback one block at a time. These buffers + * will not have any ops attached and must be staled to prevent aliasing with + * multiblock buffers once we drop the ILOCK. + */ +STATIC int +xrep_xattr_find_buf( + struct xfs_mount *mp, + xfs_fsblock_t fsbno, + xfs_extlen_t max_len, + bool can_read, + struct xfs_buf **bpp) +{ + struct xrep_bufscan scan = { + .daddr = XFS_FSB_TO_DADDR(mp, fsbno), + .max_sectors = xrep_bufscan_max_sectors(mp, max_len), + .daddr_step = XFS_FSB_TO_BB(mp, 1), + }; + struct xfs_buf *bp; + + while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) { + *bpp = bp; + return 0; + } + + if (!can_read) { + *bpp = NULL; + return 0; + } + + return xfs_buf_read(mp->m_ddev_targp, scan.daddr, XFS_FSB_TO_BB(mp, 1), + XBF_TRYLOCK, bpp, NULL); +} + +/* + * Deal with a buffer that we found during our walk of the attr fork. + * + * Attribute leaf and node blocks are simple -- they're a single block, so we + * can walk them one at a time and we never have to worry about discontiguous + * multiblock buffers like we do for directories. + * + * Unfortunately, remote attr blocks add a lot of complexity here. Each disk + * block is totally self contained, in the sense that the v5 header provides no + * indication that there could be more data in the next block. The incore + * buffers can span multiple blocks, though they never cross extent records. + * However, they don't necessarily start or end on an extent record boundary. + * Therefore, we need a special buffer find function to walk the buffer cache + * for us. + * + * The caller must hold the ILOCK on the file being repaired. We use + * XBF_TRYLOCK here to skip any locked buffer on the assumption that we don't + * own the block and don't want to hang the system on a potentially garbage + * buffer. + */ +STATIC int +xrep_xattr_recover_block( + struct xrep_xattr *rx, + xfs_dablk_t dabno, + xfs_fsblock_t fsbno, + xfs_extlen_t max_len, + xfs_extlen_t *actual_len) +{ + struct xfs_da_blkinfo *info; + struct xfs_buf *bp; + int error; + + error = xrep_xattr_find_buf(rx->sc->mp, fsbno, max_len, true, &bp); + if (error) + return error; + info = bp->b_addr; + *actual_len = XFS_BB_TO_FSB(rx->sc->mp, bp->b_length); + + trace_xrep_xattr_recover_leafblock(rx->sc->ip, dabno, + be16_to_cpu(info->magic)); + + /* + * If the buffer has the right magic number for an attr leaf block and + * passes a structure check (we don't care about checksums), salvage + * as much as we can from the block. */ + if (info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) && + xrep_buf_verify_struct(bp, &xfs_attr3_leaf_buf_ops) && + xfs_attr3_leaf_header_check(bp, rx->sc->ip->i_ino) == NULL) + error = xrep_xattr_recover_leaf(rx, bp); + + /* + * If the buffer didn't already have buffer ops set, it was read in by + * the _find_buf function and could very well be /part/ of a multiblock + * remote block. Mark it stale so that it doesn't hang around in + * memory to cause problems. 
+ */
+	if (bp->b_ops == NULL)
+		xfs_buf_stale(bp);
+
+	xfs_buf_relse(bp);
+	return error;
+}
+
+/* Insert one xattr key/value. */
+STATIC int
+xrep_xattr_insert_rec(
+	struct xrep_xattr	*rx,
+	const struct xrep_xattr_key	*key)
+{
+	struct xfs_da_args	args = {
+		.dp		= rx->sc->tempip,
+		.attr_filter	= key->flags,
+		.namelen	= key->namelen,
+		.valuelen	= key->valuelen,
+		.owner		= rx->sc->ip->i_ino,
+		.geo		= rx->sc->mp->m_attr_geo,
+		.whichfork	= XFS_ATTR_FORK,
+		.op_flags	= XFS_DA_OP_OKNOENT,
+	};
+	struct xchk_xattr_buf	*ab = rx->sc->buf;
+	int			error;
+
+	/*
+	 * Grab pointers to the scrub buffer so that we can use them to insert
+	 * attrs into the temp file.
+	 */
+	args.name = ab->name;
+	args.value = ab->value;
+
+	/*
+	 * The attribute name is stored near the end of the in-core buffer,
+	 * though we reserve one more byte to ensure null termination.
+	 */
+	ab->name[XATTR_NAME_MAX] = 0;
+
+	error = xfblob_load(rx->xattr_blobs, key->name_cookie, ab->name,
+			key->namelen);
+	if (error)
+		return error;
+
+	error = xfblob_free(rx->xattr_blobs, key->name_cookie);
+	if (error)
+		return error;
+
+	error = xfblob_load(rx->xattr_blobs, key->value_cookie, args.value,
+			key->valuelen);
+	if (error)
+		return error;
+
+	error = xfblob_free(rx->xattr_blobs, key->value_cookie);
+	if (error)
+		return error;
+
+	ab->name[key->namelen] = 0;
+
+	if (key->flags & XFS_ATTR_PARENT) {
+		trace_xrep_xattr_insert_pptr(rx->sc->tempip, key->flags,
+				ab->name, key->namelen, ab->value,
+				key->valuelen);
+		args.op_flags |= XFS_DA_OP_LOGGED;
+	} else {
+		trace_xrep_xattr_insert_rec(rx->sc->tempip, key->flags,
+				ab->name, key->namelen, key->valuelen);
+	}
+
+	/*
+	 * xfs_attr_set creates and commits its own transaction.  If the attr
+	 * already exists, we'll just drop it during the rebuild.
+	 */
+	xfs_attr_sethash(&args);
+	error = xfs_attr_set(&args, XFS_ATTRUPDATE_CREATE, false);
+	if (error == -EEXIST)
+		error = 0;
+
+	return error;
+}
+
+/*
+ * Periodically flush salvaged attributes to the temporary file.  This is done
+ * to reduce the memory requirements of the xattr rebuild because files can
+ * contain millions of attributes.
+ */
+STATIC int
+xrep_xattr_flush_stashed(
+	struct xrep_xattr	*rx)
+{
+	xfarray_idx_t		array_cur;
+	int			error;
+
+	/*
+	 * Entering this function, the scrub context has a reference to the
+	 * inode being repaired, the temporary file, and a scrub transaction
+	 * that we use during xattr salvaging to avoid livelocking if there
+	 * are cycles in the xattr structures.  We hold ILOCK_EXCL on both
+	 * the inode being repaired and the temporary file, though they are
+	 * not ijoined to the scrub transaction.
+	 *
+	 * To constrain kernel memory use, we occasionally flush salvaged
+	 * xattrs from the xfarray and xfblob structures into the temporary
+	 * file in preparation for exchanging the xattr structures at the end.
+	 * Updating the temporary file requires a transaction, so we commit the
+	 * scrub transaction and drop the two ILOCKs so that xfs_attr_set can
+	 * allocate whatever transaction it wants.
+	 *
+	 * We still hold IOLOCK_EXCL on the inode being repaired, which
+	 * prevents anyone from modifying the damaged xattr data while we
+	 * repair it.
+	 */
+	error = xrep_trans_commit(rx->sc);
+	if (error)
+		return error;
+	xchk_iunlock(rx->sc, XFS_ILOCK_EXCL);
+
+	/*
+	 * Take the IOLOCK of the temporary file while we modify xattrs.  This
+	 * isn't strictly required because the temporary file is never revealed
+	 * to userspace, but we follow the same locking rules.  We still hold
+	 * sc->ip's IOLOCK.
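+	 *
+	 * To summarize, the sequence in this function is roughly:
+	 *
+	 *	commit the scrub transaction and drop the ILOCKs
+	 *	take the tempfile's IOLOCK
+	 *	xfs_attr_set() each stashed attr into the tempfile
+	 *	truncate the xfarray/xfblob staging structures
+	 *	drop the tempfile's IOLOCK
+	 *	reallocate the scrub transaction, retake sc->ip's ILOCK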
+ */ + error = xrep_tempfile_iolock_polled(rx->sc); + if (error) + return error; + + /* Add all the salvaged attrs to the temporary file. */ + foreach_xfarray_idx(rx->xattr_records, array_cur) { + struct xrep_xattr_key key; + + error = xfarray_load(rx->xattr_records, array_cur, &key); + if (error) + return error; + + error = xrep_xattr_insert_rec(rx, &key); + if (error) + return error; + } + + /* Empty out both arrays now that we've added the entries. */ + xfarray_truncate(rx->xattr_records); + xfblob_truncate(rx->xattr_blobs); + + xrep_tempfile_iounlock(rx->sc); + + /* Recreate the salvage transaction and relock the inode. */ + error = xchk_trans_alloc(rx->sc, 0); + if (error) + return error; + xchk_ilock(rx->sc, XFS_ILOCK_EXCL); + return 0; +} + +/* Decide if we've stashed too much xattr data in memory. */ +static inline bool +xrep_xattr_want_flush_stashed( + struct xrep_xattr *rx) +{ + unsigned long long bytes; + + if (!rx->can_flush) + return false; + + bytes = xfarray_bytes(rx->xattr_records) + + xfblob_bytes(rx->xattr_blobs); + return bytes > XREP_XATTR_MAX_STASH_BYTES; +} + +/* + * Did we observe rename changing parent pointer xattrs while we were flushing + * salvaged attrs? + */ +static inline bool +xrep_xattr_saw_pptr_conflict( + struct xrep_xattr *rx) +{ + bool ret; + + ASSERT(rx->can_flush); + + if (!xfs_has_parent(rx->sc->mp)) + return false; + + xfs_assert_ilocked(rx->sc->ip, XFS_ILOCK_EXCL); + + mutex_lock(&rx->lock); + ret = xfarray_bytes(rx->pptr_recs) > 0; + mutex_unlock(&rx->lock); + + return ret; +} + +/* + * Reset the entire repair state back to initial conditions, now that we've + * detected a parent pointer update to the attr structure while we were + * flushing salvaged attrs. See the locking notes in dir_repair.c for more + * information on why this is all necessary. + */ +STATIC int +xrep_xattr_full_reset( + struct xrep_xattr *rx) +{ + struct xfs_scrub *sc = rx->sc; + struct xfs_attr_sf_hdr *hdr; + struct xfs_ifork *ifp = &sc->tempip->i_af; + int error; + + trace_xrep_xattr_full_reset(sc->ip, sc->tempip); + + /* The temporary file's data fork had better not be in btree format. */ + if (sc->tempip->i_df.if_format == XFS_DINODE_FMT_BTREE) { + ASSERT(0); + return -EIO; + } + + /* + * We begin in transaction context with sc->ip ILOCKed but not joined + * to the transaction. To reset to the initial state, we must hold + * sc->ip's ILOCK to prevent rename from updating parent pointer + * information and the tempfile's ILOCK to clear its contents. + */ + xchk_iunlock(rx->sc, XFS_ILOCK_EXCL); + xrep_tempfile_ilock_both(sc); + xfs_trans_ijoin(sc->tp, sc->ip, 0); + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + + /* + * Free all the blocks of the attr fork of the temp file, and reset + * it back to local format. + */ + if (xfs_ifork_has_extents(&sc->tempip->i_af)) { + error = xrep_reap_ifork(sc, sc->tempip, XFS_ATTR_FORK); + if (error) + return error; + + ASSERT(ifp->if_bytes == 0); + ifp->if_format = XFS_DINODE_FMT_LOCAL; + xfs_idata_realloc(sc->tempip, sizeof(*hdr), XFS_ATTR_FORK); + } + + /* Reinitialize the attr fork to an empty shortform structure. */ + hdr = ifp->if_data; + memset(hdr, 0, sizeof(*hdr)); + hdr->totsize = cpu_to_be16(sizeof(*hdr)); + xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE | XFS_ILOG_ADATA); + + /* + * Roll this transaction to commit our reset ondisk. The tempfile + * should no longer be joined to the transaction, so we drop its ILOCK. + * This should leave us in transaction context with sc->ip ILOCKed but + * not joined to the transaction. 
+ */ + error = xrep_roll_trans(sc); + if (error) + return error; + xrep_tempfile_iunlock(sc); + + /* + * Erase any accumulated parent pointer updates now that we've erased + * the tempfile's attr fork. We're resetting the entire repair state + * back to where we were initially, except now we won't flush salvaged + * xattrs until the very end. + */ + mutex_lock(&rx->lock); + xfarray_truncate(rx->pptr_recs); + xfblob_truncate(rx->pptr_names); + mutex_unlock(&rx->lock); + + rx->can_flush = false; + rx->attrs_found = 0; + + ASSERT(xfarray_bytes(rx->xattr_records) == 0); + ASSERT(xfblob_bytes(rx->xattr_blobs) == 0); + return 0; +} + +/* Extract as many attribute keys and values as we can. */ +STATIC int +xrep_xattr_recover( + struct xrep_xattr *rx) +{ + struct xfs_bmbt_irec got; + struct xfs_scrub *sc = rx->sc; + struct xfs_da_geometry *geo = sc->mp->m_attr_geo; + xfs_fileoff_t offset; + xfs_extlen_t len; + xfs_dablk_t dabno; + int nmap; + int error; + +restart: + /* + * Iterate each xattr leaf block in the attr fork to scan them for any + * attributes that we might salvage. + */ + for (offset = 0; + offset < XFS_MAX_FILEOFF; + offset = got.br_startoff + got.br_blockcount) { + nmap = 1; + error = xfs_bmapi_read(sc->ip, offset, XFS_MAX_FILEOFF - offset, + &got, &nmap, XFS_BMAPI_ATTRFORK); + if (error) + return error; + if (nmap != 1) + return -EFSCORRUPTED; + if (!xfs_bmap_is_written_extent(&got)) + continue; + + for (dabno = round_up(got.br_startoff, geo->fsbcount); + dabno < got.br_startoff + got.br_blockcount; + dabno += len) { + xfs_fileoff_t curr_offset = dabno - got.br_startoff; + xfs_extlen_t maxlen; + + if (xchk_should_terminate(rx->sc, &error)) + return error; + + maxlen = min_t(xfs_filblks_t, INT_MAX, + got.br_blockcount - curr_offset); + error = xrep_xattr_recover_block(rx, dabno, + curr_offset + got.br_startblock, + maxlen, &len); + if (error) + return error; + + if (xrep_xattr_want_flush_stashed(rx)) { + error = xrep_xattr_flush_stashed(rx); + if (error) + return error; + + if (xrep_xattr_saw_pptr_conflict(rx)) { + error = xrep_xattr_full_reset(rx); + if (error) + return error; + + goto restart; + } + } + } + } + + return 0; +} + +/* + * Reset the extended attribute fork to a state where we can start re-adding + * the salvaged attributes. + */ +STATIC int +xrep_xattr_fork_remove( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + struct xfs_attr_sf_hdr *hdr; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK); + + /* + * If the data fork is in btree format, we can't change di_forkoff + * because we could run afoul of the rule that the data fork isn't + * supposed to be in btree format if there's enough space in the fork + * that it could have used extents format. Instead, reinitialize the + * attr fork to have a shortform structure with zero attributes. + */ + if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE) { + ifp->if_format = XFS_DINODE_FMT_LOCAL; + hdr = xfs_idata_realloc(ip, (int)sizeof(*hdr) - ifp->if_bytes, + XFS_ATTR_FORK); + hdr->count = 0; + hdr->totsize = cpu_to_be16(sizeof(*hdr)); + xfs_trans_log_inode(sc->tp, ip, + XFS_ILOG_CORE | XFS_ILOG_ADATA); + return 0; + } + + /* If we still have attr fork extents, something's wrong. 
*/
+	if (ifp->if_nextents != 0) {
+		struct xfs_iext_cursor	icur;
+		struct xfs_bmbt_irec	irec;
+		unsigned int		i = 0;
+
+		xfs_emerg(sc->mp,
+	"inode 0x%llx attr fork still has %llu attr extents, format %d?!",
+				ip->i_ino, ifp->if_nextents, ifp->if_format);
+		for_each_xfs_iext(ifp, &icur, &irec) {
+			xfs_err(sc->mp,
+	"[%u]: startoff %llu startblock %llu blockcount %llu state %u",
+					i++, irec.br_startoff,
+					irec.br_startblock, irec.br_blockcount,
+					irec.br_state);
+		}
+		ASSERT(0);
+		return -EFSCORRUPTED;
+	}
+
+	xfs_attr_fork_remove(ip, sc->tp);
+	return 0;
+}
+
+/*
+ * Free all the attribute fork blocks of the file being repaired and delete the
+ * fork.  The caller must ILOCK the scrub file and join it to the transaction.
+ * This function returns with the inode joined to a clean transaction.
+ */
+int
+xrep_xattr_reset_fork(
+	struct xfs_scrub	*sc)
+{
+	int			error;
+
+	trace_xrep_xattr_reset_fork(sc->ip, sc->ip);
+
+	/* Unmap all the attr blocks. */
+	if (xfs_ifork_has_extents(&sc->ip->i_af)) {
+		error = xrep_reap_ifork(sc, sc->ip, XFS_ATTR_FORK);
+		if (error)
+			return error;
+	}
+
+	error = xrep_xattr_fork_remove(sc, sc->ip);
+	if (error)
+		return error;
+
+	return xfs_trans_roll_inode(&sc->tp, sc->ip);
+}
+
+/*
+ * Free all the attribute fork blocks of the temporary file and delete the attr
+ * fork.  The caller must ILOCK the tempfile and join it to the transaction.
+ * This function returns with the inode joined to a clean scrub transaction.
+ */
+int
+xrep_xattr_reset_tempfile_fork(
+	struct xfs_scrub	*sc)
+{
+	int			error;
+
+	trace_xrep_xattr_reset_fork(sc->ip, sc->tempip);
+
+	/*
+	 * Wipe out the attr fork of the temp file so that regular inode
+	 * inactivation won't trip over the corrupt attr fork.
+	 */
+	if (xfs_ifork_has_extents(&sc->tempip->i_af)) {
+		error = xrep_reap_ifork(sc, sc->tempip, XFS_ATTR_FORK);
+		if (error)
+			return error;
+	}
+
+	return xrep_xattr_fork_remove(sc, sc->tempip);
+}
+
+/*
+ * Find all the extended attributes for this inode by scraping them out of the
+ * attribute key blocks by hand, and flushing them into the temp file.
+ * When we're done, free the staging memory before exchanging the xattr
+ * structures to reduce memory usage.
+ */
+STATIC int
+xrep_xattr_salvage_attributes(
+	struct xrep_xattr	*rx)
+{
+	struct xfs_inode	*ip = rx->sc->ip;
+	int			error;
+
+	/* Short format xattrs are easy! */
+	if (rx->sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL) {
+		error = xrep_xattr_recover_sf(rx);
+		if (error)
+			return error;
+
+		return xrep_xattr_flush_stashed(rx);
+	}
+
+	/*
+	 * For non-inline xattr structures, the salvage function scans the
+	 * buffer cache looking for potential attr leaf blocks.  The scan
+	 * requires the ability to lock any buffer found and runs independently
+	 * of any transaction <-> buffer item <-> buffer linkage.  Therefore,
+	 * roll the transaction to ensure there are no buffers joined.  We hold
+	 * the ILOCK independently of the transaction.
+	 */
+	error = xfs_trans_roll(&rx->sc->tp);
+	if (error)
+		return error;
+
+	error = xfs_iread_extents(rx->sc->tp, ip, XFS_ATTR_FORK);
+	if (error)
+		return error;
+
+	error = xrep_xattr_recover(rx);
+	if (error)
+		return error;
+
+	return xrep_xattr_flush_stashed(rx);
+}
+
+/*
+ * Add this stashed incore parent pointer to the temporary file.  The caller
+ * must hold the tempfile's IOLOCK, must not hold any ILOCKs, and must not be
+ * in transaction context.
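+ *
+ * As a rough sketch of the replay, one stashed record maps to:
+ *
+ *	XREP_XATTR_PPTR_ADD    -> xfs_parent_set(sc->tempip, sc->ip->i_ino,
+ *					xname, &pptr->pptr_rec, ...)
+ *	XREP_XATTR_PPTR_REMOVE -> xfs_parent_unset(sc->tempip, sc->ip->i_ino,
+ *					xname, &pptr->pptr_rec, ...)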
+ */ +STATIC int +xrep_xattr_replay_pptr_update( + struct xrep_xattr *rx, + const struct xfs_name *xname, + struct xrep_xattr_pptr *pptr) +{ + struct xfs_scrub *sc = rx->sc; + int error; + + switch (pptr->action) { + case XREP_XATTR_PPTR_ADD: + /* Create parent pointer. */ + trace_xrep_xattr_replay_parentadd(sc->tempip, xname, + &pptr->pptr_rec); + + error = xfs_parent_set(sc->tempip, sc->ip->i_ino, xname, + &pptr->pptr_rec, &rx->pptr_args); + ASSERT(error != -EEXIST); + return error; + case XREP_XATTR_PPTR_REMOVE: + /* Remove parent pointer. */ + trace_xrep_xattr_replay_parentremove(sc->tempip, xname, + &pptr->pptr_rec); + + error = xfs_parent_unset(sc->tempip, sc->ip->i_ino, xname, + &pptr->pptr_rec, &rx->pptr_args); + ASSERT(error != -ENOATTR); + return error; + } + + ASSERT(0); + return -EIO; +} + +/* + * Flush stashed parent pointer updates that have been recorded by the scanner. + * This is done to reduce the memory requirements of the xattr rebuild, since + * files can have a lot of hardlinks and the fs can be busy. + * + * Caller must not hold transactions or ILOCKs. Caller must hold the tempfile + * IOLOCK. + */ +STATIC int +xrep_xattr_replay_pptr_updates( + struct xrep_xattr *rx) +{ + xfarray_idx_t array_cur; + int error; + + mutex_lock(&rx->lock); + foreach_xfarray_idx(rx->pptr_recs, array_cur) { + struct xrep_xattr_pptr pptr; + + error = xfarray_load(rx->pptr_recs, array_cur, &pptr); + if (error) + goto out_unlock; + + error = xfblob_loadname(rx->pptr_names, pptr.name_cookie, + &rx->xname, pptr.namelen); + if (error) + goto out_unlock; + mutex_unlock(&rx->lock); + + error = xrep_xattr_replay_pptr_update(rx, &rx->xname, &pptr); + if (error) + return error; + + mutex_lock(&rx->lock); + } + + /* Empty out both arrays now that we've added the entries. */ + xfarray_truncate(rx->pptr_recs); + xfblob_truncate(rx->pptr_names); + mutex_unlock(&rx->lock); + return 0; +out_unlock: + mutex_unlock(&rx->lock); + return error; +} + +/* + * Remember that we want to create a parent pointer in the tempfile. These + * stashed actions will be replayed later. + */ +STATIC int +xrep_xattr_stash_parentadd( + struct xrep_xattr *rx, + const struct xfs_name *name, + const struct xfs_inode *dp) +{ + struct xrep_xattr_pptr pptr = { + .action = XREP_XATTR_PPTR_ADD, + .namelen = name->len, + }; + int error; + + trace_xrep_xattr_stash_parentadd(rx->sc->tempip, dp, name); + + xfs_inode_to_parent_rec(&pptr.pptr_rec, dp); + error = xfblob_storename(rx->pptr_names, &pptr.name_cookie, name); + if (error) + return error; + + return xfarray_append(rx->pptr_recs, &pptr); +} + +/* + * Remember that we want to remove a parent pointer from the tempfile. These + * stashed actions will be replayed later. + */ +STATIC int +xrep_xattr_stash_parentremove( + struct xrep_xattr *rx, + const struct xfs_name *name, + const struct xfs_inode *dp) +{ + struct xrep_xattr_pptr pptr = { + .action = XREP_XATTR_PPTR_REMOVE, + .namelen = name->len, + }; + int error; + + trace_xrep_xattr_stash_parentremove(rx->sc->tempip, dp, name); + + xfs_inode_to_parent_rec(&pptr.pptr_rec, dp); + error = xfblob_storename(rx->pptr_names, &pptr.name_cookie, name); + if (error) + return error; + + return xfarray_append(rx->pptr_recs, &pptr); +} + +/* + * Capture dirent updates being made by other threads. We will have to replay + * the parent pointer updates before exchanging attr forks. 
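
For illustration, here is a userspace analogue of the hook pattern used by xrep_xattr_live_dirent_update(): the thread performing the update cannot receive an error from the repair thread, so failures are latched in the shared context under a lock and checked before the repair commits anything. All names are invented; pthreads stands in for the kernel's notifier machinery.

#include <pthread.h>
#include <stdbool.h>

struct repair_ctx {
	pthread_mutex_t	lock;
	unsigned int	nr_stashed;
	bool		live_update_aborted;
};

/* Pretend to stash one captured update; may fail when out of memory. */
static int stash_update(struct repair_ctx *rx)
{
	rx->nr_stashed++;
	return 0;
}

/* Hook body, called from the thread that performs the dirent update. */
static void dirent_update_hook(struct repair_ctx *rx)
{
	pthread_mutex_lock(&rx->lock);
	if (stash_update(rx))
		rx->live_update_aborted = true;
	pthread_mutex_unlock(&rx->lock);
}

static void *updater(void *arg)
{
	dirent_update_hook(arg);
	return NULL;
}

int main(void)
{
	struct repair_ctx rx = { .lock = PTHREAD_MUTEX_INITIALIZER };
	pthread_t t;

	pthread_create(&t, NULL, updater, &rx);
	pthread_join(t, NULL);
	return rx.live_update_aborted ? 1 : 0;
}
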
+ */
+STATIC int
+xrep_xattr_live_dirent_update(
+	struct notifier_block	*nb,
+	unsigned long		action,
+	void			*data)
+{
+	struct xfs_dir_update_params	*p = data;
+	struct xrep_xattr	*rx;
+	struct xfs_scrub	*sc;
+	int			error;
+
+	rx = container_of(nb, struct xrep_xattr, dhook.dirent_hook.nb);
+	sc = rx->sc;
+
+	/*
+	 * This thread updated a dirent that points to the file that we're
+	 * repairing, so stash the update for replay against the temporary
+	 * file.
+	 */
+	if (p->ip->i_ino != sc->ip->i_ino)
+		return NOTIFY_DONE;
+
+	mutex_lock(&rx->lock);
+	if (p->delta > 0)
+		error = xrep_xattr_stash_parentadd(rx, p->name, p->dp);
+	else
+		error = xrep_xattr_stash_parentremove(rx, p->name, p->dp);
+	if (error)
+		rx->live_update_aborted = true;
+	mutex_unlock(&rx->lock);
+	return NOTIFY_DONE;
+}
+
+/*
+ * Prepare both inodes' attribute forks for an exchange.  Promote the tempfile
+ * from short format to leaf format, and if the file being repaired has a short
+ * format attr fork, turn it into an empty extent list.
+ */
+STATIC int
+xrep_xattr_swap_prep(
+	struct xfs_scrub	*sc,
+	bool			temp_local,
+	bool			ip_local)
+{
+	int			error;
+
+	/*
+	 * If the tempfile's attributes are in shortform format, convert that
+	 * to a single leaf extent so that we can use the atomic mapping
+	 * exchange.
+	 */
+	if (temp_local) {
+		struct xfs_da_args	args = {
+			.dp		= sc->tempip,
+			.geo		= sc->mp->m_attr_geo,
+			.whichfork	= XFS_ATTR_FORK,
+			.trans		= sc->tp,
+			.total		= 1,
+			.owner		= sc->ip->i_ino,
+		};
+
+		error = xfs_attr_shortform_to_leaf(&args);
+		if (error)
+			return error;
+
+		/*
+		 * Roll the deferred log items to get us back to a clean
+		 * transaction.
+		 */
+		error = xfs_defer_finish(&sc->tp);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * If the file being repaired had a shortform attribute fork, convert
+	 * that to an empty extent list in preparation for the atomic mapping
+	 * exchange.
+	 */
+	if (ip_local) {
+		struct xfs_ifork	*ifp;
+
+		ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
+
+		xfs_idestroy_fork(ifp);
+		ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+		ifp->if_nextents = 0;
+		ifp->if_bytes = 0;
+		ifp->if_data = NULL;
+		ifp->if_height = 0;
+
+		xfs_trans_log_inode(sc->tp, sc->ip,
+				XFS_ILOG_CORE | XFS_ILOG_ADATA);
+	}
+
+	return 0;
+}
+
+/* Exchange the temporary file's attribute fork with the one being repaired. */
+int
+xrep_xattr_swap(
+	struct xfs_scrub	*sc,
+	struct xrep_tempexch	*tx)
+{
+	bool			ip_local, temp_local;
+	int			error = 0;
+
+	ip_local = sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL;
+	temp_local = sc->tempip->i_af.if_format == XFS_DINODE_FMT_LOCAL;
+
+	/*
+	 * If both files have a local format attr fork and the rebuilt
+	 * xattr data would fit in the repaired file's attr fork, just copy
+	 * the contents from the tempfile and declare ourselves done.
+	 */
+	if (ip_local && temp_local) {
+		int	forkoff;
+		int	newsize;
+
+		newsize = xfs_attr_sf_totsize(sc->tempip);
+		forkoff = xfs_attr_shortform_bytesfit(sc->ip, newsize);
+		if (forkoff > 0) {
+			sc->ip->i_forkoff = forkoff;
+			xrep_tempfile_copyout_local(sc, XFS_ATTR_FORK);
+			return 0;
+		}
+	}
+
+	/* Otherwise, make sure both attr forks are in block-mapping mode. */
+	error = xrep_xattr_swap_prep(sc, temp_local, ip_local);
+	if (error)
+		return error;
+
+	return xrep_tempexch_contents(sc, tx);
+}
+
+/*
+ * Finish replaying stashed parent pointer updates, allocate a transaction for
+ * exchanging extent mappings, and take the ILOCKs of both files before we
+ * commit the new extended attribute structure.
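
The swap decision in xrep_xattr_swap() above, copy the inline data if it still fits, otherwise exchange mappings, can be modeled in a few lines of standalone C. The inline size limit and fork layout below are fabricated for the sketch.

#include <stdbool.h>
#include <string.h>

#define INLINE_MAX	192	/* pretend space available inside the inode */

struct fork {
	bool		inline_fmt;	/* data stored inside the inode? */
	unsigned int	bytes;		/* size of the inline data */
	unsigned char	data[INLINE_MAX];
};

/* Pretend mapping exchange; the real one swaps extent maps atomically. */
static int exchange_forks(struct fork *a, struct fork *b)
{
	struct fork tmp = *a;

	*a = *b;
	*b = tmp;
	return 0;
}

/*
 * If both forks are inline and the rebuilt data still fits inline, a
 * plain copy finishes the job; otherwise drop both forks out of inline
 * format and exchange their mappings.
 */
static int commit_rebuilt_fork(struct fork *victim, struct fork *temp)
{
	if (victim->inline_fmt && temp->inline_fmt &&
	    temp->bytes <= INLINE_MAX) {
		memcpy(victim->data, temp->data, temp->bytes);
		victim->bytes = temp->bytes;
		return 0;
	}

	victim->inline_fmt = false;
	temp->inline_fmt = false;
	return exchange_forks(victim, temp);
}

int main(void)
{
	struct fork victim = { .inline_fmt = true };
	struct fork temp = { .inline_fmt = true, .bytes = 16 };

	return commit_rebuilt_fork(&victim, &temp);
}
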
+ */ +STATIC int +xrep_xattr_finalize_tempfile( + struct xrep_xattr *rx) +{ + struct xfs_scrub *sc = rx->sc; + int error; + + if (!xfs_has_parent(sc->mp)) + return xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rx->tx); + + /* + * Repair relies on the ILOCK to quiesce all possible xattr updates. + * Replay all queued parent pointer updates into the tempfile before + * exchanging the contents, even if that means dropping the ILOCKs and + * the transaction. + */ + do { + error = xrep_xattr_replay_pptr_updates(rx); + if (error) + return error; + + error = xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rx->tx); + if (error) + return error; + + if (xfarray_length(rx->pptr_recs) == 0) + break; + + xchk_trans_cancel(sc); + xrep_tempfile_iunlock_both(sc); + } while (!xchk_should_terminate(sc, &error)); + return error; +} + +/* + * Exchange the new extended attribute data (which we created in the tempfile) + * with the file being repaired. + */ +STATIC int +xrep_xattr_rebuild_tree( + struct xrep_xattr *rx) +{ + struct xfs_scrub *sc = rx->sc; + int error; + + /* + * If we didn't find any attributes to salvage, repair the file by + * zapping its attr fork. + */ + if (rx->attrs_found == 0) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + error = xrep_xattr_reset_fork(sc); + if (error) + return error; + + goto forget_acls; + } + + trace_xrep_xattr_rebuild_tree(sc->ip, sc->tempip); + + /* + * Commit the repair transaction and drop the ILOCKs so that we can use + * the atomic file content exchange helper functions to compute the + * correct resource reservations. + * + * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent xattr + * modifications, but there's nothing to prevent userspace from reading + * the attributes until we're ready for the exchange operation. Reads + * will return -EIO without shutting down the fs, so we're ok with + * that. + */ + error = xrep_trans_commit(sc); + if (error) + return error; + + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* + * Take the IOLOCK on the temporary file so that we can run xattr + * operations with the same locks held as we would for a normal file. + * We still hold sc->ip's IOLOCK. + */ + error = xrep_tempfile_iolock_polled(rx->sc); + if (error) + return error; + + /* + * Allocate transaction, lock inodes, and make sure that we've replayed + * all the stashed parent pointer updates to the temp file. After this + * point, we're ready to exchange attr fork mappings. + */ + error = xrep_xattr_finalize_tempfile(rx); + if (error) + return error; + + /* + * Exchange the blocks mapped by the tempfile's attr fork with the file + * being repaired. The old attr blocks will then be attached to the + * tempfile, so reap its attr fork. + */ + error = xrep_xattr_swap(sc, &rx->tx); + if (error) + return error; + + error = xrep_xattr_reset_tempfile_fork(sc); + if (error) + return error; + + /* + * Roll to get a transaction without any inodes joined to it. Then we + * can drop the tempfile's ILOCK and IOLOCK before doing more work on + * the scrub target file. + */ + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + + xrep_tempfile_iunlock(sc); + xrep_tempfile_iounlock(sc); + +forget_acls: + /* Invalidate cached ACLs now that we've reloaded all the xattrs. */ + xfs_forget_acl(VFS_I(sc->ip), SGI_ACL_FILE); + xfs_forget_acl(VFS_I(sc->ip), SGI_ACL_DEFAULT); + return 0; +} + +/* Tear down all the incore scan stuff we created. 
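
One way to picture the drain loop in xrep_xattr_finalize_tempfile(): replay the queue unlocked, take the lock, and only proceed if the queue is still empty, since producers need the same lock to queue more work. A pthread-based sketch with invented names; it is a model of the shape, not of the kernel locking rules.

#include <pthread.h>

static pthread_mutex_t	ilock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int	nr_queued;	/* written by producers under ilock */

/* Pretend to apply every queued update; runs without the lock held. */
static int replay_queued(void)
{
	nr_queued = 0;
	return 0;
}

/* Returns with ilock held once the queue has been drained completely. */
static int finalize(void)
{
	int error;

	for (;;) {
		error = replay_queued();
		if (error)
			return error;

		pthread_mutex_lock(&ilock);
		if (nr_queued == 0)
			return 0;	/* no producer can sneak in now */
		pthread_mutex_unlock(&ilock);
	}
}

int main(void)
{
	int error = finalize();

	if (!error)
		pthread_mutex_unlock(&ilock);
	return error;
}
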
*/ +STATIC void +xrep_xattr_teardown( + struct xrep_xattr *rx) +{ + if (xfs_has_parent(rx->sc->mp)) + xfs_dir_hook_del(rx->sc->mp, &rx->dhook); + if (rx->pptr_names) + xfblob_destroy(rx->pptr_names); + if (rx->pptr_recs) + xfarray_destroy(rx->pptr_recs); + xfblob_destroy(rx->xattr_blobs); + xfarray_destroy(rx->xattr_records); + mutex_destroy(&rx->lock); + kfree(rx); +} + +/* Set up the filesystem scan so we can regenerate extended attributes. */ +STATIC int +xrep_xattr_setup_scan( + struct xfs_scrub *sc, + struct xrep_xattr **rxp) +{ + struct xrep_xattr *rx; + char *descr; + int max_len; + int error; + + rx = kzalloc(sizeof(struct xrep_xattr), XCHK_GFP_FLAGS); + if (!rx) + return -ENOMEM; + rx->sc = sc; + rx->can_flush = true; + rx->xname.name = rx->namebuf; + + mutex_init(&rx->lock); + + /* + * Allocate enough memory to handle loading local attr values from the + * xfblob data while flushing stashed attrs to the temporary file. + * We only realloc the buffer when salvaging remote attr values. + */ + max_len = xfs_attr_leaf_entsize_local_max(sc->mp->m_attr_geo->blksize); + error = xchk_setup_xattr_buf(rx->sc, max_len); + if (error == -ENOMEM) + error = -EDEADLOCK; + if (error) + goto out_rx; + + /* Set up some staging for salvaged attribute keys and values */ + descr = xchk_xfile_ino_descr(sc, "xattr keys"); + error = xfarray_create(descr, 0, sizeof(struct xrep_xattr_key), + &rx->xattr_records); + kfree(descr); + if (error) + goto out_rx; + + descr = xchk_xfile_ino_descr(sc, "xattr names"); + error = xfblob_create(descr, &rx->xattr_blobs); + kfree(descr); + if (error) + goto out_keys; + + if (xfs_has_parent(sc->mp)) { + ASSERT(sc->flags & XCHK_FSGATES_DIRENTS); + + descr = xchk_xfile_ino_descr(sc, + "xattr retained parent pointer entries"); + error = xfarray_create(descr, 0, + sizeof(struct xrep_xattr_pptr), + &rx->pptr_recs); + kfree(descr); + if (error) + goto out_values; + + descr = xchk_xfile_ino_descr(sc, + "xattr retained parent pointer names"); + error = xfblob_create(descr, &rx->pptr_names); + kfree(descr); + if (error) + goto out_pprecs; + + xfs_dir_hook_setup(&rx->dhook, xrep_xattr_live_dirent_update); + error = xfs_dir_hook_add(sc->mp, &rx->dhook); + if (error) + goto out_ppnames; + } + + *rxp = rx; + return 0; +out_ppnames: + xfblob_destroy(rx->pptr_names); +out_pprecs: + xfarray_destroy(rx->pptr_recs); +out_values: + xfblob_destroy(rx->xattr_blobs); +out_keys: + xfarray_destroy(rx->xattr_records); +out_rx: + mutex_destroy(&rx->lock); + kfree(rx); + return error; +} + +/* + * Repair the extended attribute metadata. + * + * XXX: Remote attribute value buffers encompass the entire (up to 64k) buffer. + * The buffer cache in XFS can't handle aliased multiblock buffers, so this + * might misbehave if the attr fork is crosslinked with other filesystem + * metadata. + */ +int +xrep_xattr( + struct xfs_scrub *sc) +{ + struct xrep_xattr *rx = NULL; + int error; + + if (!xfs_inode_hasattr(sc->ip)) + return -ENOENT; + + /* The rmapbt is required to reap the old attr fork. */ + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + /* We require atomic file exchange range to rebuild anything. */ + if (!xfs_has_exchange_range(sc->mp)) + return -EOPNOTSUPP; + + error = xrep_xattr_setup_scan(sc, &rx); + if (error) + return error; + + ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); + + error = xrep_xattr_salvage_attributes(rx); + if (error) + goto out_scan; + + if (rx->live_update_aborted) { + error = -EIO; + goto out_scan; + } + + /* Last chance to abort before we start committing fixes. 
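
xrep_xattr_setup_scan() below tears down partially constructed state with a chain of goto labels, one per resource, unwound in reverse order of construction. A minimal userspace rendition of that pattern, with malloc standing in for the xfarray/xfblob/hook setup (names invented):

#include <stdlib.h>

struct scan {
	void	*records;	/* stand-ins for xfarray, xfblob, hook */
	void	*names;
	void	*hook;
};

/* Build every resource in order; unwind in reverse on any failure. */
static int scan_setup(struct scan **out)
{
	struct scan *s;

	s = calloc(1, sizeof(*s));
	if (!s)
		return -1;

	s->records = malloc(64);
	if (!s->records)
		goto out_scan;
	s->names = malloc(64);
	if (!s->names)
		goto out_records;
	s->hook = malloc(64);
	if (!s->hook)
		goto out_names;

	*out = s;
	return 0;

out_names:
	free(s->names);
out_records:
	free(s->records);
out_scan:
	free(s);
	return -1;
}

int main(void)
{
	struct scan *s;

	if (scan_setup(&s))
		return 1;
	free(s->hook);
	free(s->names);
	free(s->records);
	free(s);
	return 0;
}
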
*/ + if (xchk_should_terminate(sc, &error)) + goto out_scan; + + error = xrep_xattr_rebuild_tree(rx); + if (error) + goto out_scan; + +out_scan: + xrep_xattr_teardown(rx); + return error; +} diff --git a/fs/xfs/scrub/attr_repair.h b/fs/xfs/scrub/attr_repair.h new file mode 100644 index 000000000000..979729bd4a5f --- /dev/null +++ b/fs/xfs/scrub/attr_repair.h @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_ATTR_REPAIR_H__ +#define __XFS_SCRUB_ATTR_REPAIR_H__ + +struct xrep_tempexch; + +int xrep_xattr_swap(struct xfs_scrub *sc, struct xrep_tempexch *tx); +int xrep_xattr_reset_fork(struct xfs_scrub *sc); +int xrep_xattr_reset_tempfile_fork(struct xfs_scrub *sc); + +#endif /* __XFS_SCRUB_ATTR_REPAIR_H__ */ diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index 0cb8d43912a8..7ba35a7a7920 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -40,22 +40,23 @@ struct xbitmap64_node { * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll * forward-declare them anyway for clarity. */ -static inline void +static inline __maybe_unused void xbitmap64_tree_insert(struct xbitmap64_node *node, struct rb_root_cached *root); -static inline void +static inline __maybe_unused void xbitmap64_tree_remove(struct xbitmap64_node *node, struct rb_root_cached *root); -static inline struct xbitmap64_node * +static inline __maybe_unused struct xbitmap64_node * xbitmap64_tree_iter_first(struct rb_root_cached *root, uint64_t start, uint64_t last); -static inline struct xbitmap64_node * +static inline __maybe_unused struct xbitmap64_node * xbitmap64_tree_iter_next(struct xbitmap64_node *node, uint64_t start, uint64_t last); INTERVAL_TREE_DEFINE(struct xbitmap64_node, bn_rbnode, uint64_t, - __bn_subtree_last, START, LAST, static inline, xbitmap64_tree) + __bn_subtree_last, START, LAST, static inline __maybe_unused, + xbitmap64_tree) /* Iterate each interval of a bitmap. Do not change the bitmap. */ #define for_each_xbitmap64_extent(bn, bitmap) \ @@ -314,22 +315,23 @@ struct xbitmap32_node { * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll * forward-declare them anyway for clarity. */ -static inline void +static inline __maybe_unused void xbitmap32_tree_insert(struct xbitmap32_node *node, struct rb_root_cached *root); -static inline void +static inline __maybe_unused void xbitmap32_tree_remove(struct xbitmap32_node *node, struct rb_root_cached *root); -static inline struct xbitmap32_node * +static inline __maybe_unused struct xbitmap32_node * xbitmap32_tree_iter_first(struct rb_root_cached *root, uint32_t start, uint32_t last); -static inline struct xbitmap32_node * +static inline __maybe_unused struct xbitmap32_node * xbitmap32_tree_iter_next(struct xbitmap32_node *node, uint32_t start, uint32_t last); INTERVAL_TREE_DEFINE(struct xbitmap32_node, bn_rbnode, uint32_t, - __bn_subtree_last, START, LAST, static inline, xbitmap32_tree) + __bn_subtree_last, START, LAST, static inline __maybe_unused, + xbitmap32_tree) /* Iterate each interval of a bitmap. Do not change the bitmap. 
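
The macro defined just below wraps the interval tree's iter_first/iter_next helpers in a plain for loop. The same shape in miniature, iterating read-only over a sorted array instead of a tree (all names invented for the sketch):

#include <stdio.h>

struct extent {
	unsigned int	start;
	unsigned int	len;
};

/* Iterate each interval of a sorted extent table.  Do not change it. */
#define for_each_extent(ext, tbl, nr) \
	for ((ext) = (tbl); (ext) < (tbl) + (nr); (ext)++)

int main(void)
{
	struct extent map[] = { { 0, 8 }, { 16, 4 }, { 64, 32 } };
	struct extent *ext;

	for_each_extent(ext, map, sizeof(map) / sizeof(map[0]))
		printf("[%u, %u)\n", ext->start, ext->start + ext->len);
	return 0;
}
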
*/ #define for_each_xbitmap32_extent(bn, bitmap) \ diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 47a20cf5205f..1ad8ec63a7f4 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -31,6 +31,8 @@ #include "xfs_ag.h" #include "xfs_error.h" #include "xfs_quota.h" +#include "xfs_exchmaps.h" +#include "xfs_rtbitmap.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -445,7 +447,7 @@ xchk_perag_read_headers( { int error; - error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp); + error = xfs_ialloc_read_agi(sa->pag, sc->tp, 0, &sa->agi_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI)) return error; @@ -781,7 +783,7 @@ xchk_iget( { ASSERT(sc->tp != NULL); - return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp); + return xfs_iget(sc->mp, sc->tp, inum, XCHK_IGET_FLAGS, 0, ipp); } /* @@ -827,13 +829,13 @@ again: * in the iget cache miss path. */ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); - error = xfs_ialloc_read_agi(pag, tp, agi_bpp); + error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp); xfs_perag_put(pag); if (error) return error; - error = xfs_iget(mp, tp, inum, - XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp); + error = xfs_iget(mp, tp, inum, XFS_IGET_NORETRY | XCHK_IGET_FLAGS, 0, + ipp); if (error == -EAGAIN) { /* * The inode may be in core but temporarily unavailable and may @@ -1060,12 +1062,6 @@ xchk_irele( spin_lock(&VFS_I(ip)->i_lock); VFS_I(ip)->i_state &= ~I_DONTCACHE; spin_unlock(&VFS_I(ip)->i_lock); - } else if (atomic_read(&VFS_I(ip)->i_count) == 1) { - /* - * If this is the last reference to the inode and the caller - * permits it, set DONTCACHE to avoid thrashing. - */ - d_mark_dontcache(VFS_I(ip)); } xfs_irele(ip); @@ -1202,27 +1198,12 @@ xchk_metadata_inode_subtype( struct xfs_scrub *sc, unsigned int scrub_type) { - __u32 smtype = sc->sm->sm_type; - unsigned int sick_mask = sc->sick_mask; + struct xfs_scrub_subord *sub; int error; - sc->sm->sm_type = scrub_type; - - switch (scrub_type) { - case XFS_SCRUB_TYPE_INODE: - error = xchk_inode(sc); - break; - case XFS_SCRUB_TYPE_BMBTD: - error = xchk_bmap_data(sc); - break; - default: - ASSERT(0); - error = -EFSCORRUPTED; - break; - } - - sc->sick_mask = sick_mask; - sc->sm->sm_type = smtype; + sub = xchk_scrub_create_subord(sc, scrub_type); + error = sub->sc.ops->scrub(&sub->sc); + xchk_scrub_free_subord(sub); return error; } diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 89f7bbec887e..3d5f1f6b4b7b 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -6,31 +6,6 @@ #ifndef __XFS_SCRUB_COMMON_H__ #define __XFS_SCRUB_COMMON_H__ -/* - * We /could/ terminate a scrub/repair operation early. If we're not - * in a good place to continue (fatal signal, etc.) then bail out. - * Note that we're careful not to make any judgements about *error. - */ -static inline bool -xchk_should_terminate( - struct xfs_scrub *sc, - int *error) -{ - /* - * If preemption is disabled, we need to yield to the scheduler every - * few seconds so that we don't run afoul of the soft lockup watchdog - * or RCU stall detector. 
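
xchk_should_terminate(), whose definition is being removed from this header, bails out of long-running scans when a fatal signal is pending and yields the CPU so the watchdogs stay quiet. A rough userspace analogue, with sched_yield() and a signal flag standing in for cond_resched() and fatal_signal_pending():

#include <sched.h>
#include <signal.h>
#include <stdbool.h>

static volatile sig_atomic_t fatal_signal;

static void on_sigint(int sig)
{
	(void)sig;
	fatal_signal = 1;
}

/* Yield regularly and convert a pending signal into a clean error. */
static bool should_terminate(int *error)
{
	sched_yield();		/* userspace stand-in for cond_resched() */

	if (fatal_signal) {
		if (*error == 0)
			*error = -4;	/* EINTR, hardcoded for the sketch */
		return true;
	}
	return false;
}

int main(void)
{
	long i;
	int error = 0;

	signal(SIGINT, on_sigint);
	for (i = 0; i < 100000000L; i++)
		if (should_terminate(&error))
			break;
	return -error;
}
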
- */ - cond_resched(); - - if (fatal_signal_pending(current)) { - if (*error == 0) - *error = -EINTR; - return true; - } - return false; -} - int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks); int xchk_trans_alloc_empty(struct xfs_scrub *sc); void xchk_trans_cancel(struct xfs_scrub *sc); @@ -92,6 +67,7 @@ int xchk_setup_directory(struct xfs_scrub *sc); int xchk_setup_xattr(struct xfs_scrub *sc); int xchk_setup_symlink(struct xfs_scrub *sc); int xchk_setup_parent(struct xfs_scrub *sc); +int xchk_setup_dirtree(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xchk_setup_rtbitmap(struct xfs_scrub *sc); int xchk_setup_rtsummary(struct xfs_scrub *sc); @@ -212,6 +188,7 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) } bool xchk_dir_looks_zapped(struct xfs_inode *dp); +bool xchk_pptr_looks_zapped(struct xfs_inode *ip); #ifdef CONFIG_XFS_ONLINE_REPAIR /* Decide if a repair is required. */ diff --git a/fs/xfs/scrub/dab_bitmap.h b/fs/xfs/scrub/dab_bitmap.h new file mode 100644 index 000000000000..0c6e3aad4395 --- /dev/null +++ b/fs/xfs/scrub/dab_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_DAB_BITMAP_H__ +#define __XFS_SCRUB_DAB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_dablk_t */ + +struct xdab_bitmap { + struct xbitmap32 dabitmap; +}; + +static inline void xdab_bitmap_init(struct xdab_bitmap *bitmap) +{ + xbitmap32_init(&bitmap->dabitmap); +} + +static inline void xdab_bitmap_destroy(struct xdab_bitmap *bitmap) +{ + xbitmap32_destroy(&bitmap->dabitmap); +} + +static inline int xdab_bitmap_set(struct xdab_bitmap *bitmap, + xfs_dablk_t dabno, xfs_extlen_t len) +{ + return xbitmap32_set(&bitmap->dabitmap, dabno, len); +} + +static inline bool xdab_bitmap_test(struct xdab_bitmap *bitmap, + xfs_dablk_t dabno, xfs_extlen_t *len) +{ + return xbitmap32_test(&bitmap->dabitmap, dabno, len); +} + +#endif /* __XFS_SCRUB_DAB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 82b150d3b8b7..056de4819f86 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -78,6 +78,22 @@ xchk_da_set_corrupt( __return_address); } +/* Flag a da btree node in need of optimization. */ +void +xchk_da_set_preen( + struct xchk_da_btree *ds, + int level) +{ + struct xfs_scrub *sc = ds->sc; + + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN; + trace_xchk_fblock_preen(sc, ds->dargs.whichfork, + xfs_dir2_da_to_db(ds->dargs.geo, + ds->state->path.blk[level].blkno), + __return_address); +} + +/* Find an entry at a certain level in a da btree. */ static struct xfs_da_node_entry * xchk_da_btree_node_entry( struct xchk_da_btree *ds, @@ -320,6 +336,7 @@ xchk_da_btree_block( struct xfs_da3_blkinfo *hdr3; struct xfs_da_args *dargs = &ds->dargs; struct xfs_inode *ip = ds->dargs.dp; + xfs_failaddr_t fa; xfs_ino_t owner; int *pmaxrecs; struct xfs_da3_icnode_hdr nodehdr; @@ -442,6 +459,12 @@ xchk_da_btree_block( goto out_freebp; } + fa = xfs_da3_header_check(blk->bp, dargs->owner); + if (fa) { + xchk_da_set_corrupt(ds, level); + goto out_freebp; + } + /* * If we've been handed a block that is below the dabtree root, does * its hashval match what the parent block expected to see? 
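
dab_bitmap.h above gets its type safety purely from wrapping: both wrappers contain the same core bitmap, but the compiler rejects passing one where the other is expected. A compact illustration of the idiom, with all names invented:

struct bitmap { unsigned long bits; };		/* shared core bitmap */

static void bitmap_set(struct bitmap *b, unsigned int bit)
{
	b->bits |= 1UL << bit;
}

/* Distinct wrapper types for two different block-number spaces. */
struct dab_bitmap { struct bitmap dabitmap; };
struct agb_bitmap { struct bitmap agbitmap; };

static void dab_bitmap_set(struct dab_bitmap *b, unsigned int dabno)
{
	bitmap_set(&b->dabitmap, dabno);
}

static void agb_bitmap_set(struct agb_bitmap *b, unsigned int agbno)
{
	bitmap_set(&b->agbitmap, agbno);
}

int main(void)
{
	struct dab_bitmap da = { { 0 } };
	struct agb_bitmap ag = { { 0 } };

	dab_bitmap_set(&da, 3);
	agb_bitmap_set(&ag, 3);
	/* dab_bitmap_set(&ag, 3) would be a compile error, by design. */
	return 0;
}
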
@@ -494,6 +517,7 @@ xchk_da_btree(
 	ds->dargs.whichfork = whichfork;
 	ds->dargs.trans = sc->tp;
 	ds->dargs.op_flags = XFS_DA_OP_OKNOENT;
+	ds->dargs.owner = sc->ip->i_ino;
 	ds->state = xfs_da_state_alloc(&ds->dargs);
 	ds->sc = sc;
 	ds->private = private;
diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h
index 4f8c2138a1ec..de291e3b77dd 100644
--- a/fs/xfs/scrub/dabtree.h
+++ b/fs/xfs/scrub/dabtree.h
@@ -35,6 +35,7 @@ bool xchk_da_process_error(struct xchk_da_btree *ds, int level, int *error);
 
 /* Check for da btree corruption. */
 void xchk_da_set_corrupt(struct xchk_da_btree *ds, int level);
+void xchk_da_set_preen(struct xchk_da_btree *ds, int level);
 
 int xchk_da_btree_hash(struct xchk_da_btree *ds, int level, __be32 *hashp);
 int xchk_da_btree(struct xfs_scrub *sc, int whichfork,
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 076a310b8eb0..bf9199e8df63 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -16,22 +16,70 @@
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_health.h"
+#include "xfs_attr.h"
+#include "xfs_parent.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/dabtree.h"
 #include "scrub/readdir.h"
 #include "scrub/health.h"
+#include "scrub/repair.h"
+#include "scrub/trace.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
 
 /* Set us up to scrub directories. */
 int
 xchk_setup_directory(
 	struct xfs_scrub	*sc)
 {
+	int			error;
+
+	if (xchk_could_repair(sc)) {
+		error = xrep_setup_directory(sc);
+		if (error)
+			return error;
+	}
+
 	return xchk_setup_inode_contents(sc, 0);
 }
 
 /* Directories */
 
+/* Deferred directory entry that we saved for later. */
+struct xchk_dirent {
+	/* Cookie for retrieval of the dirent name. */
+	xfblob_cookie		name_cookie;
+
+	/* Child inode number. */
+	xfs_ino_t		ino;
+
+	/* Length of the dirent name. */
+	uint8_t			namelen;
+};
+
+struct xchk_dir {
+	struct xfs_scrub	*sc;
+
+	/* Information for parent pointer validation. */
+	struct xfs_parent_rec	pptr_rec;
+	struct xfs_da_args	pptr_args;
+
+	/* Fixed-size array of xchk_dirent structures. */
+	struct xfarray		*dir_entries;
+
+	/* Blobs containing dirent names. */
+	struct xfblob		*dir_names;
+
+	/* If we've cycled the ILOCK, we must revalidate deferred dirents. */
+	bool			need_revalidate;
+
+	/* Name buffer for dirent revalidation. */
+	struct xfs_name		xname;
+	uint8_t			namebuf[MAXNAMELEN];
+};
+
 /* Scrub a directory entry. */
 
 /* Check that an inode's mode matches a given XFS_DIR3_FT_* type. */
@@ -55,6 +103,108 @@ xchk_dir_check_ftype(
 }
 
 /*
+ * Try to lock a child file for checking parent pointers.  Returns the inode
+ * flags for the locks we now hold, or zero if we failed.
+ */
+STATIC unsigned int
+xchk_dir_lock_child(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*ip)
+{
+	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
+		return 0;
+
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+		return 0;
+	}
+
+	if (!xfs_inode_has_attr_fork(ip) || !xfs_need_iread_extents(&ip->i_af))
+		return XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED;
+
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+		return 0;
+	}
+
+	return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL;
+}
+
+/* Check the backwards link (parent pointer) associated with this dirent.
*/ +STATIC int +xchk_dir_parent_pointer( + struct xchk_dir *sd, + const struct xfs_name *name, + struct xfs_inode *ip) +{ + struct xfs_scrub *sc = sd->sc; + int error; + + xfs_inode_to_parent_rec(&sd->pptr_rec, sc->ip); + error = xfs_parent_lookup(sc->tp, ip, name, &sd->pptr_rec, + &sd->pptr_args); + if (error == -ENOATTR) + xchk_fblock_xref_set_corrupt(sc, XFS_DATA_FORK, 0); + + return 0; +} + +/* Look for a parent pointer matching this dirent, if the child isn't busy. */ +STATIC int +xchk_dir_check_pptr_fast( + struct xchk_dir *sd, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + struct xfs_inode *ip) +{ + struct xfs_scrub *sc = sd->sc; + unsigned int lockmode; + int error; + + /* dot and dotdot entries do not have parent pointers */ + if (xfs_dir2_samename(name, &xfs_name_dot) || + xfs_dir2_samename(name, &xfs_name_dotdot)) + return 0; + + /* No self-referential non-dot or dotdot dirents. */ + if (ip == sc->ip) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return -ECANCELED; + } + + /* Try to lock the inode. */ + lockmode = xchk_dir_lock_child(sc, ip); + if (!lockmode) { + struct xchk_dirent save_de = { + .namelen = name->len, + .ino = ip->i_ino, + }; + + /* Couldn't lock the inode, so save the dirent for later. */ + trace_xchk_dir_defer(sc->ip, name, ip->i_ino); + + error = xfblob_storename(sd->dir_names, &save_de.name_cookie, + name); + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, + &error)) + return error; + + error = xfarray_append(sd->dir_entries, &save_de); + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, + &error)) + return error; + + return 0; + } + + error = xchk_dir_parent_pointer(sd, name, ip); + xfs_iunlock(ip, lockmode); + return error; +} + +/* * Scrub a single directory entry. * * Check the inode number to make sure it's sane, then we check that we can @@ -71,6 +221,7 @@ xchk_dir_actor( { struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip; + struct xchk_dir *sd = priv; xfs_ino_t lookup_ino; xfs_dablk_t offset; int error = 0; @@ -137,6 +288,14 @@ xchk_dir_actor( goto out; xchk_dir_check_ftype(sc, offset, ip, name->type); + + if (xfs_has_parent(mp)) { + error = xchk_dir_check_pptr_fast(sd, dapos, name, ip); + if (error) + goto out_rele; + } + +out_rele: xchk_irele(sc, ip); out: if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) @@ -196,8 +355,8 @@ xchk_dir_rec( xchk_da_set_corrupt(ds, level); goto out; } - error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno, - XFS_DABUF_MAP_HOLE_OK, &bp); + error = xfs_dir3_data_read(ds->dargs.trans, dp, ds->dargs.owner, + rec_bno, XFS_DABUF_MAP_HOLE_OK, &bp); if (!xchk_fblock_process_error(ds->sc, XFS_DATA_FORK, rec_bno, &error)) goto out; @@ -315,10 +474,11 @@ xchk_directory_data_bestfree( /* dir block format */ if (lblk != XFS_B_TO_FSBT(mp, XFS_DIR2_DATA_OFFSET)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); - error = xfs_dir3_block_read(sc->tp, sc->ip, &bp); + error = xfs_dir3_block_read(sc->tp, sc->ip, sc->ip->i_ino, &bp); } else { /* dir data format */ - error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, 0, &bp); + error = xfs_dir3_data_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, + 0, &bp); } if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; @@ -470,7 +630,7 @@ xchk_directory_leaf1_bestfree( int error; /* Read the free space block. 
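
xchk_dir_check_pptr_fast() above takes the child's locks opportunistically and defers contended children to a second pass rather than blocking the scan. The shape of that strategy, reduced to pthreads and a fixed-size deferral queue (illustrative only; the real code stages deferred entries in an xfarray):

#include <pthread.h>
#include <stdio.h>

#define MAX_DEFERRED	16

struct child {
	pthread_mutex_t	lock;
	unsigned long	ino;
};

static unsigned long deferred[MAX_DEFERRED];
static unsigned int nr_deferred;

/* Check a child now if its lock is free; otherwise queue it for later. */
static int check_child(struct child *c)
{
	if (pthread_mutex_trylock(&c->lock) != 0) {
		if (nr_deferred >= MAX_DEFERRED)
			return -1;	/* the real code grows its array */
		deferred[nr_deferred++] = c->ino;
		return 0;
	}

	printf("checked inode %lu on the fast path\n", c->ino);
	pthread_mutex_unlock(&c->lock);
	return 0;
}

int main(void)
{
	struct child c = { PTHREAD_MUTEX_INITIALIZER, 133 };

	return check_child(&c);
}
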
*/ - error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, &bp); + error = xfs_dir3_leaf_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, &bp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) return error; xchk_buffer_recheck(sc, bp); @@ -531,10 +691,9 @@ xchk_directory_leaf1_bestfree( /* Check all the bestfree entries. */ for (i = 0; i < bestcount; i++, bestp++) { best = be16_to_cpu(*bestp); - error = xfs_dir3_data_read(sc->tp, sc->ip, + error = xfs_dir3_data_read(sc->tp, sc->ip, args->owner, xfs_dir2_db_to_da(args->geo, i), - XFS_DABUF_MAP_HOLE_OK, - &dbp); + XFS_DABUF_MAP_HOLE_OK, &dbp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) break; @@ -577,7 +736,7 @@ xchk_directory_free_bestfree( int error; /* Read the free space block */ - error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp); + error = xfs_dir2_free_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, &bp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) return error; xchk_buffer_recheck(sc, bp); @@ -597,7 +756,7 @@ xchk_directory_free_bestfree( stale++; continue; } - error = xfs_dir3_data_read(sc->tp, sc->ip, + error = xfs_dir3_data_read(sc->tp, sc->ip, args->owner, (freehdr.firstdb + i) * args->geo->fsbcount, 0, &dbp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, @@ -621,10 +780,11 @@ xchk_directory_blocks( { struct xfs_bmbt_irec got; struct xfs_da_args args = { - .dp = sc ->ip, + .dp = sc->ip, .whichfork = XFS_DATA_FORK, .geo = sc->mp->m_dir_geo, .trans = sc->tp, + .owner = sc->ip->i_ino, }; struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); struct xfs_mount *mp = sc->mp; @@ -648,7 +808,8 @@ xchk_directory_blocks( free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET); /* Is this a block dir? */ - error = xfs_dir2_isblock(&args, &is_block); + if (xfs_dir2_format(&args, &error) == XFS_DIR2_FMT_BLOCK) + is_block = true; if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; @@ -752,11 +913,148 @@ out: return error; } +/* + * Revalidate a dirent that we collected in the past but couldn't check because + * of lock contention. Returns 0 if the dirent is still valid, -ENOENT if it + * has gone away on us, or a negative errno. + */ +STATIC int +xchk_dir_revalidate_dirent( + struct xchk_dir *sd, + const struct xfs_name *xname, + xfs_ino_t ino) +{ + struct xfs_scrub *sc = sd->sc; + xfs_ino_t child_ino; + int error; + + /* + * Look up the directory entry. If we get -ENOENT, the directory entry + * went away and there's nothing to revalidate. Return any other + * error. + */ + error = xchk_dir_lookup(sc, sc->ip, xname, &child_ino); + if (error) + return error; + + /* The inode number changed, nothing to revalidate. */ + if (ino != child_ino) + return -ENOENT; + + return 0; +} + +/* + * Check a directory entry's parent pointers the slow way, which means we cycle + * locks a bunch and put up with revalidation until we get it done. + */ +STATIC int +xchk_dir_slow_dirent( + struct xchk_dir *sd, + struct xchk_dirent *dirent, + const struct xfs_name *xname) +{ + struct xfs_scrub *sc = sd->sc; + struct xfs_inode *ip; + unsigned int lockmode; + int error; + + /* Check that the deferred dirent still exists. 
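
Revalidation in xchk_dir_revalidate_dirent() means repeating the original lookup and comparing results before trusting state that was collected under locks since dropped. A tiny standalone version of that check, with a stub lookup table standing in for the directory:

#include <stdbool.h>
#include <string.h>

struct dirent_rec {
	unsigned long	ino;
	const char	*name;
};

/* Stub directory: one live entry; returns 0 when the name is absent. */
static unsigned long dir_lookup(const char *name)
{
	return strcmp(name, "foo") == 0 ? 133 : 0;
}

/*
 * After cycling locks, repeat the lookup and compare inode numbers; a
 * removed or replaced entry must not be acted on from stale state.
 */
static bool still_valid(const struct dirent_rec *rec)
{
	unsigned long now = dir_lookup(rec->name);

	return now != 0 && now == rec->ino;
}

int main(void)
{
	struct dirent_rec rec = { 133, "foo" };

	return still_valid(&rec) ? 0 : 1;
}
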
*/ + if (sd->need_revalidate) { + error = xchk_dir_revalidate_dirent(sd, xname, dirent->ino); + if (error == -ENOENT) + return 0; + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, + &error)) + return error; + } + + error = xchk_iget(sc, dirent->ino, &ip); + if (error == -EINVAL || error == -ENOENT) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) + return error; + + /* + * If we can grab both IOLOCK and ILOCK of the alleged child, we can + * proceed with the validation. + */ + lockmode = xchk_dir_lock_child(sc, ip); + if (lockmode) { + trace_xchk_dir_slowpath(sc->ip, xname, ip->i_ino); + goto check_pptr; + } + + /* + * We couldn't lock the child file. Drop all the locks and try to + * get them again, one at a time. + */ + xchk_iunlock(sc, sc->ilock_flags); + sd->need_revalidate = true; + + trace_xchk_dir_ultraslowpath(sc->ip, xname, ip->i_ino); + + error = xchk_dir_trylock_for_pptrs(sc, ip, &lockmode); + if (error) + goto out_rele; + + /* Revalidate, since we just cycled the locks. */ + error = xchk_dir_revalidate_dirent(sd, xname, dirent->ino); + if (error == -ENOENT) { + error = 0; + goto out_unlock; + } + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out_unlock; + +check_pptr: + error = xchk_dir_parent_pointer(sd, xname, ip); +out_unlock: + xfs_iunlock(ip, lockmode); +out_rele: + xchk_irele(sc, ip); + return error; +} + +/* Check all the dirents that we deferred the first time around. */ +STATIC int +xchk_dir_finish_slow_dirents( + struct xchk_dir *sd) +{ + xfarray_idx_t array_cur; + int error; + + foreach_xfarray_idx(sd->dir_entries, array_cur) { + struct xchk_dirent dirent; + + if (sd->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + error = xfarray_load(sd->dir_entries, array_cur, &dirent); + if (error) + return error; + + error = xfblob_loadname(sd->dir_names, dirent.name_cookie, + &sd->xname, dirent.namelen); + if (error) + return error; + + error = xchk_dir_slow_dirent(sd, &dirent, &sd->xname); + if (error) + return error; + } + + return 0; +} + /* Scrub a whole directory. */ int xchk_directory( struct xfs_scrub *sc) { + struct xchk_dir *sd; int error; if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) @@ -789,9 +1087,60 @@ xchk_directory( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return 0; + sd = kvzalloc(sizeof(struct xchk_dir), XCHK_GFP_FLAGS); + if (!sd) + return -ENOMEM; + sd->sc = sc; + sd->xname.name = sd->namebuf; + + if (xfs_has_parent(sc->mp)) { + char *descr; + + /* + * Set up some staging memory for dirents that we can't check + * due to locking contention. + */ + descr = xchk_xfile_ino_descr(sc, "slow directory entries"); + error = xfarray_create(descr, 0, sizeof(struct xchk_dirent), + &sd->dir_entries); + kfree(descr); + if (error) + goto out_sd; + + descr = xchk_xfile_ino_descr(sc, "slow directory entry names"); + error = xfblob_create(descr, &sd->dir_names); + kfree(descr); + if (error) + goto out_entries; + } + /* Look up every name in this directory by hash. 
*/ - error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL); - if (error && error != -ECANCELED) + error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, sd); + if (error == -ECANCELED) + error = 0; + if (error) + goto out_names; + + if (xfs_has_parent(sc->mp)) { + error = xchk_dir_finish_slow_dirents(sd); + if (error == -ETIMEDOUT) { + /* Couldn't grab a lock, scrub was marked incomplete */ + error = 0; + goto out_names; + } + if (error) + goto out_names; + } + +out_names: + if (sd->dir_names) + xfblob_destroy(sd->dir_names); +out_entries: + if (sd->dir_entries) + xfarray_destroy(sd->dir_entries); +out_sd: + kvfree(sd); + if (error) return error; /* If the dir is clean, it is clearly not zapped. */ diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c new file mode 100644 index 000000000000..64679fe08446 --- /dev/null +++ b/fs/xfs/scrub/dir_repair.c @@ -0,0 +1,1958 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_bmap_util.h" +#include "xfs_exchmaps.h" +#include "xfs_exchrange.h" +#include "xfs_ag.h" +#include "xfs_parent.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" +#include "scrub/iscan.h" +#include "scrub/readdir.h" +#include "scrub/reap.h" +#include "scrub/findparent.h" +#include "scrub/orphanage.h" +#include "scrub/listxattr.h" + +/* + * Directory Repair + * ================ + * + * We repair directories by reading the directory data blocks looking for + * directory entries that look salvageable (name passes verifiers, entry points + * to a valid allocated inode, etc). Each entry worth salvaging is stashed in + * memory, and the stashed entries are periodically replayed into a temporary + * directory to constrain memory use. Batching the construction of the + * temporary directory in this fashion reduces lock cycling of the directory + * being repaired and the temporary directory, and will later become important + * for parent pointer scanning. + * + * If parent pointers are enabled on this filesystem, we instead reconstruct + * the directory by visiting each parent pointer of each file in the filesystem + * and translating the relevant parent pointer records into dirents. In this + * case, it is advantageous to stash all directory entries created from parent + * pointers for a single child file before replaying them into the temporary + * directory. To save memory, the live filesystem scan reuses the findparent + * fields. Directory repair chooses either parent pointer scanning or + * directory entry salvaging, but not both. + * + * Directory entries added to the temporary directory do not elevate the link + * counts of the inodes found. 
When salvaging completes, the remaining stashed + * entries are replayed to the temporary directory. An atomic mapping exchange + * is used to commit the new directory blocks to the directory being repaired. + * This will disrupt readdir cursors. + * + * Locking Issues + * -------------- + * + * If /a, /a/b, and /c are all directories, the VFS does not take i_rwsem on + * /a/b for a "mv /a/b /c/" operation. This means that only b's ILOCK protects + * b's dotdot update. This is in contrast to every other dotdot update (link, + * remove, mkdir). If the repair code drops the ILOCK, it must either + * revalidate the dotdot entry or use dirent hooks to capture updates from + * other threads. + */ + +/* Create a dirent in the tempdir. */ +#define XREP_DIRENT_ADD (1) + +/* Remove a dirent from the tempdir. */ +#define XREP_DIRENT_REMOVE (2) + +/* Directory entry to be restored in the new directory. */ +struct xrep_dirent { + /* Cookie for retrieval of the dirent name. */ + xfblob_cookie name_cookie; + + /* Target inode number. */ + xfs_ino_t ino; + + /* Length of the dirent name. */ + uint8_t namelen; + + /* File type of the dirent. */ + uint8_t ftype; + + /* XREP_DIRENT_{ADD,REMOVE} */ + uint8_t action; +}; + +/* + * Stash up to 8 pages of recovered dirent data in dir_entries and dir_names + * before we write them to the temp dir. + */ +#define XREP_DIR_MAX_STASH_BYTES (PAGE_SIZE * 8) + +struct xrep_dir { + struct xfs_scrub *sc; + + /* Fixed-size array of xrep_dirent structures. */ + struct xfarray *dir_entries; + + /* Blobs containing directory entry names. */ + struct xfblob *dir_names; + + /* Information for exchanging data forks at the end. */ + struct xrep_tempexch tx; + + /* Preallocated args struct for performing dir operations */ + struct xfs_da_args args; + + /* + * Information used to scan the filesystem to find the inumber of the + * dotdot entry for this directory. For directory salvaging when + * parent pointers are not enabled, we use the findparent_* functions + * on this object and access only the parent_ino field directly. + * + * When parent pointers are enabled, however, the pptr scanner uses the + * iscan, hooks, lock, and parent_ino fields of this object directly. + * @pscan.lock coordinates access to dir_entries, dir_names, + * parent_ino, subdirs, dirents, and args. This reduces the memory + * requirements of this structure. + */ + struct xrep_parent_scan_info pscan; + + /* + * Context information for attaching this directory to the lost+found + * if this directory does not have a parent. + */ + struct xrep_adoption adoption; + + /* How many subdirectories did we find? */ + uint64_t subdirs; + + /* How many dirents did we find? */ + unsigned int dirents; + + /* Should we move this directory to the orphanage? */ + bool needs_adoption; + + /* Directory entry name, plus the trailing null. */ + struct xfs_name xname; + unsigned char namebuf[MAXNAMELEN]; +}; + +/* Tear down all the incore stuff we created. */ +static void +xrep_dir_teardown( + struct xfs_scrub *sc) +{ + struct xrep_dir *rd = sc->buf; + + xrep_findparent_scan_teardown(&rd->pscan); + xfblob_destroy(rd->dir_names); + xfarray_destroy(rd->dir_entries); +} + +/* Set up for a directory repair. 
 */
+int
+xrep_setup_directory(
+	struct xfs_scrub	*sc)
+{
+	struct xrep_dir		*rd;
+	int			error;
+
+	xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+	error = xrep_orphanage_try_create(sc);
+	if (error)
+		return error;
+
+	error = xrep_tempfile_create(sc, S_IFDIR);
+	if (error)
+		return error;
+
+	rd = kvzalloc(sizeof(struct xrep_dir), XCHK_GFP_FLAGS);
+	if (!rd)
+		return -ENOMEM;
+	rd->sc = sc;
+	rd->xname.name = rd->namebuf;
+	sc->buf = rd;
+
+	return 0;
+}
+
+/*
+ * Look up the dotdot entry and confirm that it's really the parent.
+ * Returns NULLFSINO if we don't know what to do.
+ */
+static inline xfs_ino_t
+xrep_dir_lookup_parent(
+	struct xrep_dir		*rd)
+{
+	struct xfs_scrub	*sc = rd->sc;
+	xfs_ino_t		ino;
+	int			error;
+
+	error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &ino, NULL);
+	if (error)
+		return NULLFSINO;
+	if (!xfs_verify_dir_ino(sc->mp, ino))
+		return NULLFSINO;
+
+	error = xrep_findparent_confirm(sc, &ino);
+	if (error)
+		return NULLFSINO;
+
+	return ino;
+}
+
+/*
+ * Look up '..' in the dentry cache and confirm that it's really the parent.
+ * Returns NULLFSINO if the dcache misses or if the hit is implausible.
+ */
+static inline xfs_ino_t
+xrep_dir_dcache_parent(
+	struct xrep_dir		*rd)
+{
+	struct xfs_scrub	*sc = rd->sc;
+	xfs_ino_t		parent_ino;
+	int			error;
+
+	parent_ino = xrep_findparent_from_dcache(sc);
+	if (parent_ino == NULLFSINO)
+		return parent_ino;
+
+	error = xrep_findparent_confirm(sc, &parent_ino);
+	if (error)
+		return NULLFSINO;
+
+	return parent_ino;
+}
+
+/* Try to find the parent of the directory being repaired. */
+STATIC int
+xrep_dir_find_parent(
+	struct xrep_dir		*rd)
+{
+	xfs_ino_t		ino;
+
+	ino = xrep_findparent_self_reference(rd->sc);
+	if (ino != NULLFSINO) {
+		xrep_findparent_scan_finish_early(&rd->pscan, ino);
+		return 0;
+	}
+
+	ino = xrep_dir_dcache_parent(rd);
+	if (ino != NULLFSINO) {
+		xrep_findparent_scan_finish_early(&rd->pscan, ino);
+		return 0;
+	}
+
+	ino = xrep_dir_lookup_parent(rd);
+	if (ino != NULLFSINO) {
+		xrep_findparent_scan_finish_early(&rd->pscan, ino);
+		return 0;
+	}
+
+	/*
+	 * A full filesystem scan is the last resort.  On a busy filesystem,
+	 * the scan can fail with -EBUSY if we cannot grab IOLOCKs.  That means
+	 * that we don't know who the parent is, so we should return to
+	 * userspace.
+	 */
+	return xrep_findparent_scan(&rd->pscan);
+}
+
+/*
+ * Decide if we want to salvage this entry.  We don't bother with oversized
+ * names or the dot entry.
+ */
+STATIC int
+xrep_dir_want_salvage(
+	struct xrep_dir		*rd,
+	const char		*name,
+	int			namelen,
+	xfs_ino_t		ino)
+{
+	struct xfs_mount	*mp = rd->sc->mp;
+
+	/* No pointers to ourselves or to garbage. */
+	if (ino == rd->sc->ip->i_ino)
+		return false;
+	if (!xfs_verify_dir_ino(mp, ino))
+		return false;
+
+	/* No weird looking names or dot entries. */
+	if (namelen >= MAXNAMELEN || namelen <= 0)
+		return false;
+	if (namelen == 1 && name[0] == '.')
+		return false;
+	if (!xfs_dir2_namecheck(name, namelen))
+		return false;
+
+	return true;
+}
+
+/*
+ * Remember that we want to create a dirent in the tempdir.  These stashed
+ * actions will be replayed later.
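
The name_cookie scheme used by the stash functions below separates fixed-size records from variable-length names: names go into an append-only blob store and the record keeps only a cookie. A userspace sketch of such a store, where the cookie is simply a byte offset (invented API, not xfblob):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Append-only name blob; a cookie is just the byte offset of the copy. */
struct blob {
	char		*buf;
	size_t		used;
	size_t		cap;
};

static int blob_store(struct blob *b, const char *name, size_t len,
		size_t *cookie)
{
	if (b->used + len > b->cap) {
		size_t ncap = (b->cap + len) * 2;
		char *nbuf = realloc(b->buf, ncap);

		if (!nbuf)
			return -1;
		b->buf = nbuf;
		b->cap = ncap;
	}

	memcpy(b->buf + b->used, name, len);
	*cookie = b->used;
	b->used += len;
	return 0;
}

static void blob_load(struct blob *b, size_t cookie, char *out, size_t len)
{
	memcpy(out, b->buf + cookie, len);
}

int main(void)
{
	struct blob b = { 0 };
	char out[8];
	size_t cookie;

	if (blob_store(&b, "foo", 3, &cookie))
		return 1;
	blob_load(&b, cookie, out, 3);
	out[3] = 0;
	puts(out);
	free(b.buf);
	return 0;
}
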
+ */ +STATIC int +xrep_dir_stash_createname( + struct xrep_dir *rd, + const struct xfs_name *name, + xfs_ino_t ino) +{ + struct xrep_dirent dirent = { + .action = XREP_DIRENT_ADD, + .ino = ino, + .namelen = name->len, + .ftype = name->type, + }; + int error; + + trace_xrep_dir_stash_createname(rd->sc->tempip, name, ino); + + error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name); + if (error) + return error; + + return xfarray_append(rd->dir_entries, &dirent); +} + +/* + * Remember that we want to remove a dirent from the tempdir. These stashed + * actions will be replayed later. + */ +STATIC int +xrep_dir_stash_removename( + struct xrep_dir *rd, + const struct xfs_name *name, + xfs_ino_t ino) +{ + struct xrep_dirent dirent = { + .action = XREP_DIRENT_REMOVE, + .ino = ino, + .namelen = name->len, + .ftype = name->type, + }; + int error; + + trace_xrep_dir_stash_removename(rd->sc->tempip, name, ino); + + error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name); + if (error) + return error; + + return xfarray_append(rd->dir_entries, &dirent); +} + +/* Allocate an in-core record to hold entries while we rebuild the dir data. */ +STATIC int +xrep_dir_salvage_entry( + struct xrep_dir *rd, + unsigned char *name, + unsigned int namelen, + xfs_ino_t ino) +{ + struct xfs_name xname = { + .name = name, + }; + struct xfs_scrub *sc = rd->sc; + struct xfs_inode *ip; + unsigned int i = 0; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * Truncate the name to the first character that would trip namecheck. + * If we no longer have a name after that, ignore this entry. + */ + while (i < namelen && name[i] != 0 && name[i] != '/') + i++; + if (i == 0) + return 0; + xname.len = i; + + /* Ignore '..' entries; we already picked the new parent. */ + if (xname.len == 2 && name[0] == '.' && name[1] == '.') { + trace_xrep_dir_salvaged_parent(sc->ip, ino); + return 0; + } + + trace_xrep_dir_salvage_entry(sc->ip, &xname, ino); + + /* + * Compute the ftype or dump the entry if we can't. We don't lock the + * inode because inodes can't change type while we have a reference. + */ + error = xchk_iget(sc, ino, &ip); + if (error) + return 0; + + xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode); + xchk_irele(sc, ip); + + return xrep_dir_stash_createname(rd, &xname, ino); +} + +/* Record a shortform directory entry for later reinsertion. */ +STATIC int +xrep_dir_salvage_sf_entry( + struct xrep_dir *rd, + struct xfs_dir2_sf_hdr *sfp, + struct xfs_dir2_sf_entry *sfep) +{ + xfs_ino_t ino; + + ino = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep); + if (!xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino)) + return 0; + + return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen, ino); +} + +/* Record a regular directory entry for later reinsertion. */ +STATIC int +xrep_dir_salvage_data_entry( + struct xrep_dir *rd, + struct xfs_dir2_data_entry *dep) +{ + xfs_ino_t ino; + + ino = be64_to_cpu(dep->inumber); + if (!xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino)) + return 0; + + return xrep_dir_salvage_entry(rd, dep->name, dep->namelen, ino); +} + +/* Try to recover block/data format directory entries. */ +STATIC int +xrep_dir_recover_data( + struct xrep_dir *rd, + struct xfs_buf *bp) +{ + struct xfs_da_geometry *geo = rd->sc->mp->m_dir_geo; + unsigned int offset; + unsigned int end; + int error = 0; + + /* + * Loop over the data portion of the block. + * Each object is a real entry (dep) or an unused one (dup). 
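
The loop described above must never trust on-disk lengths. A standalone walker over a toy record format shows the two guards that matter: skip free space by its recorded length, and stop rather than read past the buffer end. The wire format here is invented for the sketch, not XFS's.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FREE_TAG	0xffffu

/*
 * Toy record format: a 2-byte tag and a 2-byte length.  FREE_TAG marks
 * unused space of the given total length; any other tag is an entry
 * whose length counts the name bytes after the 4-byte header.
 */
static void walk_block(const uint8_t *blk, unsigned int size)
{
	unsigned int off = 0;

	while (off + 4 <= size) {
		uint16_t tag, len;

		memcpy(&tag, blk + off, 2);
		memcpy(&len, blk + off + 2, 2);

		if (tag == FREE_TAG) {
			if (len < 4 || off + len > size)
				break;		/* corrupt free entry */
			off += len;		/* skip unused space */
			continue;
		}

		if (off + 4 + len > size)
			break;			/* don't walk off the end */
		printf("entry tag %u at %u, namelen %u\n", tag, off, len);
		off += 4 + len;
	}
}

int main(void)
{
	uint8_t blk[32] = { 0 };
	uint16_t v;

	v = 1;        memcpy(blk + 0, &v, 2);	/* entry, namelen 3 */
	v = 3;        memcpy(blk + 2, &v, 2);
	memcpy(blk + 4, "foo", 3);
	v = FREE_TAG; memcpy(blk + 7, &v, 2);	/* free space, 25 bytes */
	v = 25;       memcpy(blk + 9, &v, 2);

	walk_block(blk, sizeof(blk));
	return 0;
}
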
+ */ + offset = geo->data_entry_offset; + end = min_t(unsigned int, BBTOB(bp->b_length), + xfs_dir3_data_end_offset(geo, bp->b_addr)); + + while (offset < end) { + struct xfs_dir2_data_unused *dup = bp->b_addr + offset; + struct xfs_dir2_data_entry *dep = bp->b_addr + offset; + + if (xchk_should_terminate(rd->sc, &error)) + return error; + + /* Skip unused entries. */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + offset += be16_to_cpu(dup->length); + continue; + } + + /* Don't walk off the end of the block. */ + offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen); + if (offset > end) + break; + + /* Ok, let's save this entry. */ + error = xrep_dir_salvage_data_entry(rd, dep); + if (error) + return error; + + } + + return 0; +} + +/* Try to recover shortform directory entries. */ +STATIC int +xrep_dir_recover_sf( + struct xrep_dir *rd) +{ + struct xfs_dir2_sf_hdr *hdr; + struct xfs_dir2_sf_entry *sfep; + struct xfs_dir2_sf_entry *next; + struct xfs_ifork *ifp; + xfs_ino_t ino; + unsigned char *end; + int error = 0; + + ifp = xfs_ifork_ptr(rd->sc->ip, XFS_DATA_FORK); + hdr = ifp->if_data; + end = (unsigned char *)ifp->if_data + ifp->if_bytes; + + ino = xfs_dir2_sf_get_parent_ino(hdr); + trace_xrep_dir_salvaged_parent(rd->sc->ip, ino); + + sfep = xfs_dir2_sf_firstentry(hdr); + while ((unsigned char *)sfep < end) { + if (xchk_should_terminate(rd->sc, &error)) + return error; + + next = xfs_dir2_sf_nextentry(rd->sc->mp, hdr, sfep); + if ((unsigned char *)next > end) + break; + + /* Ok, let's save this entry. */ + error = xrep_dir_salvage_sf_entry(rd, hdr, sfep); + if (error) + return error; + + sfep = next; + } + + return 0; +} + +/* + * Try to figure out the format of this directory from the data fork mappings + * and the directory size. If we can be reasonably sure of format, we can be + * more aggressive in salvaging directory entries. On return, @magic_guess + * will be set to DIR3_BLOCK_MAGIC if we think this is a "block format" + * directory; DIR3_DATA_MAGIC if we think this is a "data format" directory, + * and 0 if we can't tell. + */ +STATIC void +xrep_dir_guess_format( + struct xrep_dir *rd, + __be32 *magic_guess) +{ + struct xfs_inode *dp = rd->sc->ip; + struct xfs_mount *mp = rd->sc->mp; + struct xfs_da_geometry *geo = mp->m_dir_geo; + xfs_fileoff_t last; + int error; + + ASSERT(xfs_has_crc(mp)); + + *magic_guess = 0; + + /* + * If there's a single directory block and the directory size is + * exactly one block, this has to be a single block format directory. + */ + error = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK); + if (!error && XFS_FSB_TO_B(mp, last) == geo->blksize && + dp->i_disk_size == geo->blksize) { + *magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); + return; + } + + /* + * If the last extent before the leaf offset matches the directory + * size and the directory size is larger than 1 block, this is a + * data format directory. + */ + last = geo->leafblk; + error = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK); + if (!error && + XFS_FSB_TO_B(mp, last) > geo->blksize && + XFS_FSB_TO_B(mp, last) == dp->i_disk_size) { + *magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC); + return; + } +} + +/* Recover directory entries from a specific directory block. */ +STATIC int +xrep_dir_recover_dirblock( + struct xrep_dir *rd, + __be32 magic_guess, + xfs_dablk_t dabno) +{ + struct xfs_dir2_data_hdr *hdr; + struct xfs_buf *bp; + __be32 oldmagic; + int error; + + /* + * Try to read buffer. 
We invalidate them in the next step so we don't + * bother to set a buffer type or ops. + */ + error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno, + XFS_DABUF_MAP_HOLE_OK, &bp, XFS_DATA_FORK, NULL); + if (error || !bp) + return error; + + hdr = bp->b_addr; + oldmagic = hdr->magic; + + trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno, + be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess)); + + /* + * If we're sure of the block's format, proceed with the salvage + * operation using the specified magic number. + */ + if (magic_guess) { + hdr->magic = magic_guess; + goto recover; + } + + /* + * If we couldn't guess what type of directory this is, then we will + * only salvage entries from directory blocks that match the magic + * number and pass verifiers. + */ + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): + case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): + if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops)) + goto out; + if (xfs_dir3_block_header_check(bp, rd->sc->ip->i_ino) != NULL) + goto out; + break; + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): + case cpu_to_be32(XFS_DIR3_DATA_MAGIC): + if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops)) + goto out; + if (xfs_dir3_data_header_check(bp, rd->sc->ip->i_ino) != NULL) + goto out; + break; + default: + goto out; + } + +recover: + error = xrep_dir_recover_data(rd, bp); + +out: + hdr->magic = oldmagic; + xfs_trans_brelse(rd->sc->tp, bp); + return error; +} + +static inline void +xrep_dir_init_args( + struct xrep_dir *rd, + struct xfs_inode *dp, + const struct xfs_name *name) +{ + memset(&rd->args, 0, sizeof(struct xfs_da_args)); + rd->args.geo = rd->sc->mp->m_dir_geo; + rd->args.whichfork = XFS_DATA_FORK; + rd->args.owner = rd->sc->ip->i_ino; + rd->args.trans = rd->sc->tp; + rd->args.dp = dp; + if (!name) + return; + rd->args.name = name->name; + rd->args.namelen = name->len; + rd->args.filetype = name->type; + rd->args.hashval = xfs_dir2_hashname(rd->sc->mp, name); +} + +/* Replay a stashed createname into the temporary directory. */ +STATIC int +xrep_dir_replay_createname( + struct xrep_dir *rd, + const struct xfs_name *name, + xfs_ino_t inum, + xfs_extlen_t total) +{ + struct xfs_scrub *sc = rd->sc; + struct xfs_inode *dp = rd->sc->tempip; + int error; + + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + + error = xfs_dir_ino_validate(sc->mp, inum); + if (error) + return error; + + trace_xrep_dir_replay_createname(dp, name, inum); + + xrep_dir_init_args(rd, dp, name); + rd->args.inumber = inum; + rd->args.total = total; + rd->args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + return xfs_dir_createname_args(&rd->args); +} + +/* Replay a stashed removename onto the temporary directory. */ +STATIC int +xrep_dir_replay_removename( + struct xrep_dir *rd, + const struct xfs_name *name, + xfs_extlen_t total) +{ + struct xfs_inode *dp = rd->args.dp; + + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + + xrep_dir_init_args(rd, dp, name); + rd->args.op_flags = 0; + rd->args.total = total; + + trace_xrep_dir_replay_removename(dp, name, 0); + return xfs_dir_removename_args(&rd->args); +} + +/* + * Add this stashed incore directory entry to the temporary directory. + * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and + * must not be in transaction context. 
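
The magic-number policy in xrep_dir_recover_dirblock() above can be stated compactly: a confident format guess overrides the (possibly damaged) header, otherwise only blocks whose magic matches a known value and which pass a verifier are salvaged. A sketch with fabricated magic values and a stub verifier:

#include <stdbool.h>
#include <stdint.h>

#define MAGIC_BLOCK	0x58444233u	/* fabricated values for the sketch */
#define MAGIC_DATA	0x58444433u

/* Pretend verifier; the real one checks CRCs, owners, and structure. */
static bool verify_block(const void *blk)
{
	return blk != 0;
}

static bool want_salvage(uint32_t magic, uint32_t magic_guess,
		const void *blk)
{
	/* A confident guess means we salvage no matter what the header says. */
	if (magic_guess != 0)
		return true;

	switch (magic) {
	case MAGIC_BLOCK:
	case MAGIC_DATA:
		return verify_block(blk);
	default:
		return false;
	}
}

int main(void)
{
	char blk[16] = { 0 };

	return want_salvage(MAGIC_DATA, 0, blk) ? 0 : 1;
}
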
+ */
+STATIC int
+xrep_dir_replay_update(
+	struct xrep_dir		*rd,
+	const struct xfs_name	*xname,
+	const struct xrep_dirent *dirent)
+{
+	struct xfs_mount	*mp = rd->sc->mp;
+#ifdef DEBUG
+	xfs_ino_t		ino;
+#endif
+	uint			resblks;
+	int			error;
+
+	resblks = xfs_link_space_res(mp, xname->len);
+	error = xchk_trans_alloc(rd->sc, resblks);
+	if (error)
+		return error;
+
+	/* Lock the temporary directory and join it to the transaction */
+	xrep_tempfile_ilock(rd->sc);
+	xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0);
+
+	switch (dirent->action) {
+	case XREP_DIRENT_ADD:
+		/*
+		 * Create a replacement dirent in the temporary directory.
+		 * Note that _createname doesn't check for existing entries.
+		 * There shouldn't be any in the temporary dir, but we'll
+		 * verify this in debug mode.
+		 */
+#ifdef DEBUG
+		error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
+		if (error != -ENOENT) {
+			ASSERT(error == -ENOENT);
+			goto out_cancel;
+		}
+#endif
+
+		error = xrep_dir_replay_createname(rd, xname, dirent->ino,
+				resblks);
+		if (error)
+			goto out_cancel;
+
+		if (xname->type == XFS_DIR3_FT_DIR)
+			rd->subdirs++;
+		rd->dirents++;
+		break;
+	case XREP_DIRENT_REMOVE:
+		/*
+		 * Remove a dirent from the temporary directory.  Note that
+		 * _removename doesn't check the inode target of the existing
+		 * entry.  There should be a perfect match in the temporary
+		 * dir, but we'll verify this in debug mode.
+		 */
+#ifdef DEBUG
+		error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
+		if (error) {
+			ASSERT(error == 0);
+			goto out_cancel;
+		}
+		if (ino != dirent->ino) {
+			ASSERT(ino == dirent->ino);
+			error = -EIO;
+			goto out_cancel;
+		}
+#endif
+
+		error = xrep_dir_replay_removename(rd, xname, resblks);
+		if (error)
+			goto out_cancel;
+
+		if (xname->type == XFS_DIR3_FT_DIR)
+			rd->subdirs--;
+		rd->dirents--;
+		break;
+	default:
+		ASSERT(0);
+		error = -EIO;
+		goto out_cancel;
+	}
+
+	/* Commit and unlock. */
+	error = xrep_trans_commit(rd->sc);
+	if (error)
+		return error;
+
+	xrep_tempfile_iunlock(rd->sc);
+	return 0;
+out_cancel:
+	xchk_trans_cancel(rd->sc);
+	xrep_tempfile_iunlock(rd->sc);
+	return error;
+}
+
+/*
+ * Flush stashed incore dirent updates that have been recorded by the scanner.
+ * This is done to reduce the memory requirements of the directory rebuild,
+ * since directories can contain up to 32GB of directory data.
+ *
+ * Caller must not hold transactions or ILOCKs.  Caller must hold the tempdir
+ * IOLOCK.
+ */
+STATIC int
+xrep_dir_replay_updates(
+	struct xrep_dir		*rd)
+{
+	xfarray_idx_t		array_cur;
+	int			error;
+
+	/* Add all the salvaged dirents to the temporary directory. */
+	mutex_lock(&rd->pscan.lock);
+	foreach_xfarray_idx(rd->dir_entries, array_cur) {
+		struct xrep_dirent	dirent;
+
+		error = xfarray_load(rd->dir_entries, array_cur, &dirent);
+		if (error)
+			goto out_unlock;
+
+		error = xfblob_loadname(rd->dir_names, dirent.name_cookie,
+				&rd->xname, dirent.namelen);
+		if (error)
+			goto out_unlock;
+		rd->xname.type = dirent.ftype;
+		mutex_unlock(&rd->pscan.lock);
+
+		error = xrep_dir_replay_update(rd, &rd->xname, &dirent);
+		if (error)
+			return error;
+		mutex_lock(&rd->pscan.lock);
+	}
+
+	/* Empty out both arrays now that we've added the entries. */
+	xfarray_truncate(rd->dir_entries);
+	xfblob_truncate(rd->dir_names);
+	mutex_unlock(&rd->pscan.lock);
+	return 0;
+out_unlock:
+	mutex_unlock(&rd->pscan.lock);
+	return error;
+}
+
+/*
+ * Periodically flush stashed directory entries to the temporary dir.
This + * is done to reduce the memory requirements of the directory rebuild, since + * directories can contain up to 32GB of directory data. + */ +STATIC int +xrep_dir_flush_stashed( + struct xrep_dir *rd) +{ + int error; + + /* + * Entering this function, the scrub context has a reference to the + * inode being repaired, the temporary file, and a scrub transaction + * that we use during dirent salvaging to avoid livelocking if there + * are cycles in the directory structures. We hold ILOCK_EXCL on both + * the inode being repaired and the temporary file, though they are + * not ijoined to the scrub transaction. + * + * To constrain kernel memory use, we occasionally write salvaged + * dirents from the xfarray and xfblob structures into the temporary + * directory in preparation for exchanging the directory structures at + * the end. Updating the temporary file requires a transaction, so we + * commit the scrub transaction and drop the two ILOCKs so that + * we can allocate whatever transaction we want. + * + * We still hold IOLOCK_EXCL on the inode being repaired, which + * prevents anyone from accessing the damaged directory data while we + * repair it. + */ + error = xrep_trans_commit(rd->sc); + if (error) + return error; + xchk_iunlock(rd->sc, XFS_ILOCK_EXCL); + + /* + * Take the IOLOCK of the temporary file while we modify dirents. This + * isn't strictly required because the temporary file is never revealed + * to userspace, but we follow the same locking rules. We still hold + * sc->ip's IOLOCK. + */ + error = xrep_tempfile_iolock_polled(rd->sc); + if (error) + return error; + + /* Write to the tempdir all the updates that we've stashed. */ + error = xrep_dir_replay_updates(rd); + xrep_tempfile_iounlock(rd->sc); + if (error) + return error; + + /* + * Recreate the salvage transaction and relock the dir we're salvaging. + */ + error = xchk_trans_alloc(rd->sc, 0); + if (error) + return error; + xchk_ilock(rd->sc, XFS_ILOCK_EXCL); + return 0; +} + +/* Decide if we've stashed too much dirent data in memory. */ +static inline bool +xrep_dir_want_flush_stashed( + struct xrep_dir *rd) +{ + unsigned long long bytes; + + bytes = xfarray_bytes(rd->dir_entries) + xfblob_bytes(rd->dir_names); + return bytes > XREP_DIR_MAX_STASH_BYTES; +} + +/* Extract as many directory entries as we can. */ +STATIC int +xrep_dir_recover( + struct xrep_dir *rd) +{ + struct xfs_bmbt_irec got; + struct xfs_scrub *sc = rd->sc; + struct xfs_da_geometry *geo = sc->mp->m_dir_geo; + xfs_fileoff_t offset; + xfs_dablk_t dabno; + __be32 magic_guess; + int nmap; + int error; + + xrep_dir_guess_format(rd, &magic_guess); + + /* Iterate each directory data block in the data fork. */ + for (offset = 0; + offset < geo->leafblk; + offset = got.br_startoff + got.br_blockcount) { + nmap = 1; + error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset, + &got, &nmap, 0); + if (error) + return error; + if (nmap != 1) + return -EFSCORRUPTED; + if (!xfs_bmap_is_written_extent(&got)) + continue; + + for (dabno = round_up(got.br_startoff, geo->fsbcount); + dabno < got.br_startoff + got.br_blockcount; + dabno += geo->fsbcount) { + if (xchk_should_terminate(rd->sc, &error)) + return error; + + error = xrep_dir_recover_dirblock(rd, + magic_guess, dabno); + if (error) + return error; + + /* Flush dirents to constrain memory usage. 
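+			 * The trigger condition, per
+			 * xrep_dir_want_flush_stashed() above, is simply:
+			 *
+			 *	xfarray_bytes(rd->dir_entries) +
+			 *		xfblob_bytes(rd->dir_names) >
+			 *			XREP_DIR_MAX_STASH_BYTES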
*/ + if (xrep_dir_want_flush_stashed(rd)) { + error = xrep_dir_flush_stashed(rd); + if (error) + return error; + } + } + } + + return 0; +} + +/* + * Find all the directory entries for this inode by scraping them out of the + * directory leaf blocks by hand, and flushing them into the temp dir. + */ +STATIC int +xrep_dir_find_entries( + struct xrep_dir *rd) +{ + struct xfs_inode *dp = rd->sc->ip; + int error; + + /* + * Salvage directory entries from the old directory, and write them to + * the temporary directory. + */ + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { + error = xrep_dir_recover_sf(rd); + } else { + error = xfs_iread_extents(rd->sc->tp, dp, XFS_DATA_FORK); + if (error) + return error; + + error = xrep_dir_recover(rd); + } + if (error) + return error; + + return xrep_dir_flush_stashed(rd); +} + +/* Salvage as many dirents as we can from the directory being repaired. */ +STATIC int +xrep_dir_salvage_entries( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + int error; + + /* + * Drop the ILOCK on this directory so that we can scan for this + * directory's parent. Figure out who is going to be the parent of + * this directory, then retake the ILOCK so that we can salvage + * directory entries. + */ + xchk_iunlock(sc, XFS_ILOCK_EXCL); + error = xrep_dir_find_parent(rd); + xchk_ilock(sc, XFS_ILOCK_EXCL); + if (error) + return error; + + /* + * Collect directory entries by parsing raw leaf blocks to salvage + * whatever we can. When we're done, free the staging memory before + * exchanging the directories to reduce memory usage. + */ + error = xrep_dir_find_entries(rd); + if (error) + return error; + + /* + * Cancel the repair transaction and drop the ILOCK so that we can + * (later) use the atomic mapping exchange functions to compute the + * correct block reservations and re-lock the inodes. + * + * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent directory + * modifications, but there's nothing to prevent userspace from reading + * the directory until we're ready for the exchange operation. Reads + * will return -EIO without shutting down the fs, so we're ok with + * that. + * + * The VFS can change dotdot on us, but the findparent scan will keep + * our incore parent inode up to date. See the note on locking issues + * for more details. + */ + error = xrep_trans_commit(sc); + if (error) + return error; + + xchk_iunlock(sc, XFS_ILOCK_EXCL); + return 0; +} + + +/* + * Examine a parent pointer of a file. If it leads us back to the directory + * that we're rebuilding, create an incore dirent from the parent pointer and + * stash it. + */ +STATIC int +xrep_dir_scan_pptr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xfs_name xname = { + .name = name, + .len = namelen, + .type = xfs_mode_to_ftype(VFS_I(ip)->i_mode), + }; + xfs_ino_t parent_ino; + uint32_t parent_gen; + struct xrep_dir *rd = priv; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + /* + * Ignore parent pointers that point back to a different dir, carry the + * wrong generation number, or are invalid.
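+ *
+ * For reference, a parent pointer is an extended attribute in the
+ * XFS_ATTR_PARENT namespace whose attr name is the dirent name in the
+ * parent and whose value is, roughly, this record (decoded for us by
+ * xfs_parent_from_attr below):
+ *
+ *	struct xfs_parent_rec {
+ *		__be64	p_ino;	(parent inode number)
+ *		__be32	p_gen;	(parent generation number)
+ *	};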
+ */ + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, &parent_ino, &parent_gen); + if (error) + return error; + + if (parent_ino != sc->ip->i_ino || + parent_gen != VFS_I(sc->ip)->i_generation) + return 0; + + mutex_lock(&rd->pscan.lock); + error = xrep_dir_stash_createname(rd, &xname, ip->i_ino); + mutex_unlock(&rd->pscan.lock); + return error; +} + +/* + * If this child dirent points to the directory being repaired, remember that + * fact so that we can reset the dotdot entry if necessary. + */ +STATIC int +xrep_dir_scan_dirent( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xrep_dir *rd = priv; + + /* Dirent doesn't point to this directory. */ + if (ino != rd->sc->ip->i_ino) + return 0; + + /* Ignore garbage inum. */ + if (!xfs_verify_dir_ino(rd->sc->mp, ino)) + return 0; + + /* No weird-looking names. */ + if (name->len >= MAXNAMELEN || name->len <= 0) + return 0; + + /* Don't pick up dot or dotdot entries; we only want child dirents. */ + if (xfs_dir2_samename(name, &xfs_name_dotdot) || + xfs_dir2_samename(name, &xfs_name_dot)) + return 0; + + trace_xrep_dir_stash_createname(sc->tempip, &xfs_name_dotdot, + dp->i_ino); + + xrep_findparent_scan_found(&rd->pscan, dp->i_ino); + return 0; +} + +/* + * Decide if we want to look for child dirents or parent pointers in this file. + * Skip the dir being repaired and any files being used to stage repairs. + */ +static inline bool +xrep_dir_want_scan( + struct xrep_dir *rd, + const struct xfs_inode *ip) +{ + return ip != rd->sc->ip && !xrep_is_tempfile(ip); +} + +/* + * Take ILOCK on a file that we want to scan. + * + * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt or + * has an unloaded attr bmbt. Otherwise, take ILOCK_SHARED. + */ +static inline unsigned int +xrep_dir_scan_ilock( + struct xrep_dir *rd, + struct xfs_inode *ip) +{ + uint lock_mode = XFS_ILOCK_SHARED; + + /* Need to take the shared ILOCK to advance the iscan cursor. */ + if (!xrep_dir_want_scan(rd, ip)) + goto lock; + + if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) { + lock_mode = XFS_ILOCK_EXCL; + goto lock; + } + + if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af)) + lock_mode = XFS_ILOCK_EXCL; + +lock: + xfs_ilock(ip, lock_mode); + return lock_mode; +} + +/* + * Scan this file for relevant child dirents or parent pointers that point to + * the directory we're rebuilding. + */ +STATIC int +xrep_dir_scan_file( + struct xrep_dir *rd, + struct xfs_inode *ip) +{ + unsigned int lock_mode; + int error = 0; + + lock_mode = xrep_dir_scan_ilock(rd, ip); + + if (!xrep_dir_want_scan(rd, ip)) + goto scan_done; + + /* + * If the extended attributes look as though they have been zapped by + * the inode record repair code, we cannot scan for parent pointers. + */ + if (xchk_pptr_looks_zapped(ip)) { + error = -EBUSY; + goto scan_done; + } + + error = xchk_xattr_walk(rd->sc, ip, xrep_dir_scan_pptr, NULL, rd); + if (error) + goto scan_done; + + if (S_ISDIR(VFS_I(ip)->i_mode)) { + /* + * If the directory looks as though it has been zapped by the + * inode record repair code, we cannot scan for child dirents.
+ */ + if (xchk_dir_looks_zapped(ip)) { + error = -EBUSY; + goto scan_done; + } + + error = xchk_dir_walk(rd->sc, ip, xrep_dir_scan_dirent, rd); + if (error) + goto scan_done; + } + +scan_done: + xchk_iscan_mark_visited(&rd->pscan.iscan, ip); + xfs_iunlock(ip, lock_mode); + return error; +} + +/* + * Scan all files in the filesystem for parent pointers that we can turn into + * replacement dirents, and a dirent that we can use to set the dotdot pointer. + */ +STATIC int +xrep_dir_scan_dirtree( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + struct xfs_inode *ip; + int error; + + /* Roots of directory trees are their own parents. */ + if (sc->ip == sc->mp->m_rootip) + xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino); + + /* + * Filesystem scans are time consuming. Drop the directory ILOCK and + * all other resources for the duration of the scan and hope for the + * best. The live update hooks will keep our scan information up to + * date even though we've dropped the locks. + */ + xchk_trans_cancel(sc); + if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) + xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED | + XFS_ILOCK_EXCL)); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) { + bool flush; + + error = xrep_dir_scan_file(rd, ip); + xchk_irele(sc, ip); + if (error) + break; + + /* Flush stashed dirent updates to constrain memory usage. */ + mutex_lock(&rd->pscan.lock); + flush = xrep_dir_want_flush_stashed(rd); + mutex_unlock(&rd->pscan.lock); + if (flush) { + xchk_trans_cancel(sc); + + error = xrep_tempfile_iolock_polled(sc); + if (error) + break; + + error = xrep_dir_replay_updates(rd); + xrep_tempfile_iounlock(sc); + if (error) + break; + + error = xchk_trans_alloc_empty(sc); + if (error) + break; + } + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&rd->pscan.iscan); + if (error) { + /* + * If we couldn't grab an inode that was busy with a state + * change, change the error code so that we exit to userspace + * as quickly as possible. + */ + if (error == -EBUSY) + return -ECANCELED; + return error; + } + + /* + * Cancel the empty transaction so that we can (later) use the atomic + * file mapping exchange functions to lock files and commit the new + * directory. + */ + xchk_trans_cancel(rd->sc); + return 0; +} + +/* + * Capture dirent updates being made by other threads which are relevant to the + * directory being repaired. + */ +STATIC int +xrep_dir_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xrep_dir *rd; + struct xfs_scrub *sc; + int error = 0; + + rd = container_of(nb, struct xrep_dir, pscan.dhook.dirent_hook.nb); + sc = rd->sc; + + /* + * This thread updated a child dirent in the directory that we're + * rebuilding. Stash the update for replay against the temporary + * directory. + */ + if (p->dp->i_ino == sc->ip->i_ino && + xchk_iscan_want_live_update(&rd->pscan.iscan, p->ip->i_ino)) { + mutex_lock(&rd->pscan.lock); + if (p->delta > 0) + error = xrep_dir_stash_createname(rd, p->name, + p->ip->i_ino); + else + error = xrep_dir_stash_removename(rd, p->name, + p->ip->i_ino); + mutex_unlock(&rd->pscan.lock); + if (error) + goto out_abort; + } + + /* + * This thread updated another directory's child dirent that points to + * the directory that we're rebuilding, so remember the new dotdot + * target. 
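+	 * For example, if another thread renames /a/victim to /b/victim
+	 * while we are rebuilding victim, this hook fires for both dirent
+	 * updates: once with (dp == a, ip == victim, delta < 0) and once
+	 * with (dp == b, ip == victim, delta > 0), so the stashed dotdot
+	 * target ends up moving from a to b.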
+ */ + if (p->ip->i_ino == sc->ip->i_ino && + xchk_iscan_want_live_update(&rd->pscan.iscan, p->dp->i_ino)) { + if (p->delta > 0) { + trace_xrep_dir_stash_createname(sc->tempip, + &xfs_name_dotdot, + p->dp->i_ino); + + xrep_findparent_scan_found(&rd->pscan, p->dp->i_ino); + } else { + trace_xrep_dir_stash_removename(sc->tempip, + &xfs_name_dotdot, + rd->pscan.parent_ino); + + xrep_findparent_scan_found(&rd->pscan, NULLFSINO); + } + } + + return NOTIFY_DONE; +out_abort: + xchk_iscan_abort(&rd->pscan.iscan); + return NOTIFY_DONE; +} + +/* + * Free all the directory blocks and reset the data fork. The caller must + * join the inode to the transaction. This function returns with the inode + * joined to a clean scrub transaction. + */ +STATIC int +xrep_dir_reset_fork( + struct xrep_dir *rd, + xfs_ino_t parent_ino) +{ + struct xfs_scrub *sc = rd->sc; + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK); + int error; + + /* Unmap all the directory buffers. */ + if (xfs_ifork_has_extents(ifp)) { + error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); + if (error) + return error; + } + + trace_xrep_dir_reset_fork(sc->tempip, parent_ino); + + /* Reset the data fork to an empty data fork. */ + xfs_idestroy_fork(ifp); + ifp->if_bytes = 0; + sc->tempip->i_disk_size = 0; + + /* Reinitialize the short form directory. */ + xrep_dir_init_args(rd, sc->tempip, NULL); + return xfs_dir2_sf_create(&rd->args, parent_ino); +} + +/* + * Prepare both inodes' directory forks for exchanging mappings. Promote the + * tempfile from short format to block format, and if the file being repaired + * has a short format data fork, turn it into an empty extent list. + */ +STATIC int +xrep_dir_swap_prep( + struct xfs_scrub *sc, + bool temp_local, + bool ip_local) +{ + int error; + + /* + * If the tempfile's directory is in shortform format, convert that to + * a single directory block so that we can use the atomic mapping + * exchange. + */ + if (temp_local) { + struct xfs_da_args args = { + .dp = sc->tempip, + .geo = sc->mp->m_dir_geo, + .whichfork = XFS_DATA_FORK, + .trans = sc->tp, + .total = 1, + .owner = sc->ip->i_ino, + }; + + error = xfs_dir2_sf_to_block(&args); + if (error) + return error; + + /* + * Roll the deferred log items to get us back to a clean + * transaction. + */ + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + } + + /* + * If the file being repaired had a shortform data fork, convert that + * to an empty extent list in preparation for the atomic mapping + * exchange. + */ + if (ip_local) { + struct xfs_ifork *ifp; + + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + xfs_idestroy_fork(ifp); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; + ifp->if_nextents = 0; + ifp->if_bytes = 0; + ifp->if_data = NULL; + ifp->if_height = 0; + + xfs_trans_log_inode(sc->tp, sc->ip, + XFS_ILOG_CORE | XFS_ILOG_DDATA); + } + + return 0; +} + +/* + * Replace the inode number of a directory entry. + */ +static int +xrep_dir_replace( + struct xrep_dir *rd, + struct xfs_inode *dp, + const struct xfs_name *name, + xfs_ino_t inum, + xfs_extlen_t total) +{ + struct xfs_scrub *sc = rd->sc; + int error; + + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + + error = xfs_dir_ino_validate(sc->mp, inum); + if (error) + return error; + + xrep_dir_init_args(rd, dp, name); + rd->args.inumber = inum; + rd->args.total = total; + return xfs_dir_replace_args(&rd->args); +} + +/* + * Reset the link count of this directory and adjust the unlinked list pointers + * as needed.
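+ *
+ * The observed link count that we settle on is, in effect:
+ *
+ *	new_nlink = min(rd->subdirs + 2, XFS_NLINK_PINNED);
+ *
+ * i.e. one link from the parent's entry pointing at this directory, one
+ * for this directory's own "." entry, and one ".." backlink from each
+ * child subdirectory.  A directory with three subdirectories therefore
+ * ends up with nlink == 5.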
+ */ +STATIC int +xrep_dir_set_nlink( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + struct xfs_inode *dp = sc->ip; + struct xfs_perag *pag; + unsigned int new_nlink = min_t(unsigned long long, + rd->subdirs + 2, + XFS_NLINK_PINNED); + int error; + + /* + * The directory is not on the incore unlinked list, which means that + * it needs to be reachable via the directory tree. Update the nlink + * with our observed link count. If the directory has no parent, it + * will be moved to the orphanage. + */ + if (!xfs_inode_on_unlinked_list(dp)) + goto reset_nlink; + + /* + * The directory is on the unlinked list and we did not find any + * dirents. Set the link count to zero and let the directory + * inactivate when the last reference drops. + */ + if (rd->dirents == 0) { + rd->needs_adoption = false; + new_nlink = 0; + goto reset_nlink; + } + + /* + * The directory is on the unlinked list and we found dirents. This + * directory needs to be reachable via the directory tree. Remove the + * dir from the unlinked list and update nlink with the observed link + * count. If the directory has no parent, it will be moved to the + * orphanage. + */ + pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, dp->i_ino)); + if (!pag) { + ASSERT(0); + return -EFSCORRUPTED; + } + + error = xfs_iunlink_remove(sc->tp, pag, dp); + xfs_perag_put(pag); + if (error) + return error; + +reset_nlink: + if (VFS_I(dp)->i_nlink != new_nlink) + set_nlink(VFS_I(dp), new_nlink); + return 0; +} + +/* + * Finish replaying stashed dirent updates, allocate a transaction for + * exchanging data fork mappings, and take the ILOCKs of both directories + * before we commit the new directory structure. + */ +STATIC int +xrep_dir_finalize_tempdir( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + int error; + + if (!xfs_has_parent(sc->mp)) + return xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx); + + /* + * Repair relies on the ILOCK to quiesce all possible dirent updates. + * Replay all queued dirent updates into the tempdir before exchanging + * the contents, even if that means dropping the ILOCKs and the + * transaction. + */ + do { + error = xrep_dir_replay_updates(rd); + if (error) + return error; + + error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx); + if (error) + return error; + + if (xfarray_length(rd->dir_entries) == 0) + break; + + xchk_trans_cancel(sc); + xrep_tempfile_iunlock_both(sc); + } while (!xchk_should_terminate(sc, &error)); + return error; +} + +/* Exchange the temporary directory's data fork with the one being repaired. */ +STATIC int +xrep_dir_swap( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + bool ip_local, temp_local; + int error = 0; + + /* + * If we never found the parent for this directory, temporarily assign + * the root dir as the parent; we'll move this to the orphanage after + * exchanging the dir contents. We hold the ILOCK of the dir being + * repaired, so we're not worried about racy updates of dotdot. + */ + ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); + if (rd->pscan.parent_ino == NULLFSINO) { + rd->needs_adoption = true; + rd->pscan.parent_ino = rd->sc->mp->m_sb.sb_rootino; + } + + /* + * Reset the temporary directory's '..' entry to point to the parent + * that we found. The temporary directory was created with the root + * directory as the parent, so we can skip this if repairing a + * subdirectory of the root. + * + * It's also possible that this replacement could also expand a sf + * tempdir into block format. 
+ */ + if (rd->pscan.parent_ino != sc->mp->m_rootip->i_ino) { + error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot, + rd->pscan.parent_ino, rd->tx.req.resblks); + if (error) + return error; + } + + /* + * Changing the dot and dotdot entries could have changed the shape of + * the directory, so we recompute these. + */ + ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL; + temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL; + + /* + * If both files have a local format data fork and the rebuilt + * directory data would fit in the repaired file's data fork, copy + * the contents from the tempfile and update the directory link count. + * We're done now. + */ + if (ip_local && temp_local && + sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) { + xrep_tempfile_copyout_local(sc, XFS_DATA_FORK); + return xrep_dir_set_nlink(rd); + } + + /* + * Clean the transaction before we start working on exchanging + * directory contents. + */ + error = xrep_tempfile_roll_trans(rd->sc); + if (error) + return error; + + /* Otherwise, make sure both data forks are in block-mapping mode. */ + error = xrep_dir_swap_prep(sc, temp_local, ip_local); + if (error) + return error; + + /* + * Set nlink of the directory in the same transaction sequence that + * (atomically) commits the new directory data. + */ + error = xrep_dir_set_nlink(rd); + if (error) + return error; + + return xrep_tempexch_contents(sc, &rd->tx); +} + +/* + * Exchange the new directory contents (which we created in the tempfile) with + * the directory being repaired. + */ +STATIC int +xrep_dir_rebuild_tree( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + int error; + + trace_xrep_dir_rebuild_tree(sc->ip, rd->pscan.parent_ino); + + /* + * Take the IOLOCK on the temporary file so that we can run dir + * operations with the same locks held as we would for a normal file. + * We still hold sc->ip's IOLOCK. + */ + error = xrep_tempfile_iolock_polled(rd->sc); + if (error) + return error; + + /* + * Allocate transaction, lock inodes, and make sure that we've replayed + * all the stashed dirent updates to the tempdir. After this point, + * we're ready to exchange data fork mappings. + */ + error = xrep_dir_finalize_tempdir(rd); + if (error) + return error; + + if (xchk_iscan_aborted(&rd->pscan.iscan)) + return -ECANCELED; + + /* + * Exchange the tempdir's data fork with the file being repaired. This + * recreates the transaction and re-takes the ILOCK in the scrub + * context. + */ + error = xrep_dir_swap(rd); + if (error) + return error; + + /* + * Release the old directory blocks and reset the data fork of the temp + * directory to an empty shortform directory because inactivation does + * nothing for directories. + */ + error = xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino); + if (error) + return error; + + /* + * Roll to get a transaction without any inodes joined to it. Then we + * can drop the tempfile's ILOCK and IOLOCK before doing more work on + * the scrub target directory. + */ + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + + xrep_tempfile_iunlock(sc); + xrep_tempfile_iounlock(sc); + return 0; +} + +/* Set up the filesystem scan so we can regenerate directory entries. */ +STATIC int +xrep_dir_setup_scan( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + char *descr; + int error; + + /* Set up some staging memory for salvaging dirents.
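+	 * Each salvaged dirent is split across the two structures: the
+	 * fixed-size fields become a struct xrep_dirent record in the
+	 * xfarray, while the variable-length name bytes go into the
+	 * xfblob, which returns a cookie that is kept in the record.
+	 * Roughly:
+	 *
+	 *	xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
+	 *	xfarray_append(rd->dir_entries, &dirent);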
*/ + descr = xchk_xfile_ino_descr(sc, "directory entries"); + error = xfarray_create(descr, 0, sizeof(struct xrep_dirent), + &rd->dir_entries); + kfree(descr); + if (error) + return error; + + descr = xchk_xfile_ino_descr(sc, "directory entry names"); + error = xfblob_create(descr, &rd->dir_names); + kfree(descr); + if (error) + goto out_xfarray; + + if (xfs_has_parent(sc->mp)) + error = __xrep_findparent_scan_start(sc, &rd->pscan, + xrep_dir_live_update); + else + error = xrep_findparent_scan_start(sc, &rd->pscan); + if (error) + goto out_xfblob; + + return 0; + +out_xfblob: + xfblob_destroy(rd->dir_names); + rd->dir_names = NULL; +out_xfarray: + xfarray_destroy(rd->dir_entries); + rd->dir_entries = NULL; + return error; +} + +/* + * Move the current file to the orphanage. + * + * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks. Upon + * successful return, the scrub transaction will have enough extra reservation + * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the + * orphanage; and both inodes will be ijoined. + */ +STATIC int +xrep_dir_move_to_orphanage( + struct xrep_dir *rd) +{ + struct xfs_scrub *sc = rd->sc; + xfs_ino_t orig_parent, new_parent; + int error; + + /* + * We are about to drop the ILOCK on sc->ip to lock the orphanage and + * prepare for the adoption. Therefore, look up the old dotdot entry + * for sc->ip so that we can compare it after we re-lock sc->ip. + */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &orig_parent); + if (error) + return error; + + /* + * Drop the ILOCK on the scrub target and commit the transaction. + * Adoption computes its own resource requirements and gathers the + * necessary components. + */ + error = xrep_trans_commit(sc); + if (error) + return error; + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* If we can take the orphanage's iolock then we're ready to move. */ + if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) { + xchk_iunlock(sc, sc->ilock_flags); + error = xrep_orphanage_iolock_two(sc); + if (error) + return error; + } + + /* Grab transaction and ILOCK the two files. */ + error = xrep_adoption_trans_alloc(sc, &rd->adoption); + if (error) + return error; + + error = xrep_adoption_compute_name(&rd->adoption, &rd->xname); + if (error) + return error; + + /* + * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot + * entry again. If the parent changed or the child was unlinked while + * the child directory was unlocked, we don't need to move the child to + * the orphanage after all. + */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &new_parent); + if (error) + return error; + + /* + * Attach to the orphanage if we still have a linked directory and it + * hasn't been moved. + */ + if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) { + error = xrep_adoption_move(&rd->adoption); + if (error) + return error; + } + + /* + * Launder the scrub transaction so we can drop the orphanage ILOCK + * and IOLOCK. Return holding the scrub target's ILOCK and IOLOCK. + */ + error = xrep_adoption_trans_roll(&rd->adoption); + if (error) + return error; + + xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + return 0; +} + +/* + * Repair the directory metadata. + * + * XXX: Directory entry buffers can be multiple fsblocks in size. The buffer + * cache in XFS can't handle aliased multiblock buffers, so this might + * misbehave if the directory blocks are crosslinked with other filesystem + * metadata. 
+ * + * XXX: Is it necessary to check the dcache for this directory to make sure + * that we always recreate every cached entry? + */ +int +xrep_directory( + struct xfs_scrub *sc) +{ + struct xrep_dir *rd = sc->buf; + int error; + + /* The rmapbt is required to reap the old data fork. */ + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + /* We require atomic file exchange range to rebuild anything. */ + if (!xfs_has_exchange_range(sc->mp)) + return -EOPNOTSUPP; + + error = xrep_dir_setup_scan(rd); + if (error) + return error; + + if (xfs_has_parent(sc->mp)) + error = xrep_dir_scan_dirtree(rd); + else + error = xrep_dir_salvage_entries(rd); + if (error) + goto out_teardown; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto out_teardown; + + error = xrep_dir_rebuild_tree(rd); + if (error) + goto out_teardown; + + if (rd->needs_adoption) { + if (!xrep_orphanage_can_adopt(rd->sc)) + error = -EFSCORRUPTED; + else + error = xrep_dir_move_to_orphanage(rd); + if (error) + goto out_teardown; + } + +out_teardown: + xrep_dir_teardown(sc); + return error; +} diff --git a/fs/xfs/scrub/dirtree.c b/fs/xfs/scrub/dirtree.c new file mode 100644 index 000000000000..bde58fb561ea --- /dev/null +++ b/fs/xfs/scrub/dirtree.c @@ -0,0 +1,985 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_attr.h" +#include "xfs_parent.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/bitmap.h" +#include "scrub/ino_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" +#include "scrub/listxattr.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/orphanage.h" +#include "scrub/dirtree.h" + +/* + * Directory Tree Structure Validation + * =================================== + * + * Validating the tree qualities of the directory tree structure can be + * difficult. If the tree is frozen, running a depth (or breadth) first search + * and marking a bitmap suffices to determine if there is a cycle. XORing the + * mark bitmap with the inode bitmap afterwards tells us if there are + * disconnected cycles. If the tree is not frozen, directory updates can move + * subtrees across the scanner wavefront, which complicates the design greatly. + * + * Directory parent pointers change that by enabling an incremental approach to + * validation of the tree structure. Instead of using one thread to scan the + * entire filesystem, we instead can have multiple threads walking individual + * subdirectories upwards to the root. In a perfect world, the IOLOCK would + * suffice to stabilize two directories in a parent -> child relationship. + * Unfortunately, the VFS does not take the IOLOCK when moving a child + * subdirectory, so we instead synchronize on ILOCK and use dirent update hooks + * to detect a race. If a race occurs in a path, we restart the scan. + * + * If the walk terminates without reaching the root, we know the path is + * disconnected and ought to be attached to the lost and found. 
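+ *
+ * For example, when scanning subdirectory C, an upward walk that visits
+ * C -> B -> A and then finds C again as A's parent has proven that C is
+ * its own ancestor, so one of the directory entries pointing down into
+ * C has to be removed to break the cycle.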
If on the walk + * we find the same subdir that we're scanning, we know this is a cycle and + * should delete an incoming edge. If we find multiple paths to the root, we + * know to delete an incoming edge. + * + * There are two big hitches with this approach: first, all file link counts + * must be correct to prevent other writers from doing the wrong thing with the + * directory tree structure. Second, because we're walking upwards in a tree + * of arbitrary depth, we cannot hold all the ILOCKs. Instead, we will use a + * directory update hook to invalidate the scan results if one of the paths + * we've scanned has changed. + */ + +/* Clean up the dirtree checking resources. */ +STATIC void +xchk_dirtree_buf_cleanup( + void *buf) +{ + struct xchk_dirtree *dl = buf; + struct xchk_dirpath *path, *n; + + if (dl->scan_ino != NULLFSINO) + xfs_dir_hook_del(dl->sc->mp, &dl->dhook); + + xchk_dirtree_for_each_path_safe(dl, path, n) { + list_del_init(&path->list); + xino_bitmap_destroy(&path->seen_inodes); + kfree(path); + } + + xfblob_destroy(dl->path_names); + xfarray_destroy(dl->path_steps); + mutex_destroy(&dl->lock); +} + +/* Set us up to look for directory loops. */ +int +xchk_setup_dirtree( + struct xfs_scrub *sc) +{ + struct xchk_dirtree *dl; + char *descr; + int error; + + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); + + if (xchk_could_repair(sc)) { + error = xrep_setup_dirtree(sc); + if (error) + return error; + } + + dl = kvzalloc(sizeof(struct xchk_dirtree), XCHK_GFP_FLAGS); + if (!dl) + return -ENOMEM; + dl->sc = sc; + dl->xname.name = dl->namebuf; + dl->hook_xname.name = dl->hook_namebuf; + INIT_LIST_HEAD(&dl->path_list); + dl->root_ino = NULLFSINO; + dl->scan_ino = NULLFSINO; + dl->parent_ino = NULLFSINO; + + mutex_init(&dl->lock); + + descr = xchk_xfile_ino_descr(sc, "dirtree path steps"); + error = xfarray_create(descr, 0, sizeof(struct xchk_dirpath_step), + &dl->path_steps); + kfree(descr); + if (error) + goto out_dl; + + descr = xchk_xfile_ino_descr(sc, "dirtree path names"); + error = xfblob_create(descr, &dl->path_names); + kfree(descr); + if (error) + goto out_steps; + + error = xchk_setup_inode_contents(sc, 0); + if (error) + goto out_names; + + sc->buf = dl; + sc->buf_cleanup = xchk_dirtree_buf_cleanup; + return 0; + +out_names: + xfblob_destroy(dl->path_names); +out_steps: + xfarray_destroy(dl->path_steps); +out_dl: + mutex_destroy(&dl->lock); + kvfree(dl); + return error; +} + +/* + * Add the parent pointer described by @dl->pptr to the given path as a new + * step. Returns -ELNRNG if the path is too deep. + */ +int +xchk_dirpath_append( + struct xchk_dirtree *dl, + struct xfs_inode *ip, + struct xchk_dirpath *path, + const struct xfs_name *name, + const struct xfs_parent_rec *pptr) +{ + struct xchk_dirpath_step step = { + .pptr_rec = *pptr, /* struct copy */ + .name_len = name->len, + }; + int error; + + /* + * If this path is more than 2 billion steps long, this directory tree + * is too far gone to fix. + */ + if (path->nr_steps >= XFS_MAXLINK) + return -ELNRNG; + + error = xfblob_storename(dl->path_names, &step.name_cookie, name); + if (error) + return error; + + error = xino_bitmap_set(&path->seen_inodes, ip->i_ino); + if (error) + return error; + + error = xfarray_append(dl->path_steps, &step); + if (error) + return error; + + path->nr_steps++; + return 0; +} + +/* + * Create an xchk_path for each parent pointer of the directory that we're + * scanning. 
For each path created, we will eventually try to walk towards the + * root with the goal of deleting all parents except for one that leads to the + * root. + * + * Returns -EFSCORRUPTED to signal that the inode being scanned has a corrupt + * parent pointer and hence there's no point in continuing; or -ENOSR if there + * are too many parent pointers for this directory. + */ +STATIC int +xchk_dirtree_create_path( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xfs_name xname = { + .name = name, + .len = namelen, + }; + struct xchk_dirtree *dl = priv; + struct xchk_dirpath *path; + const struct xfs_parent_rec *rec = value; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, NULL, NULL); + if (error) + return error; + + /* + * If there are more than 2 billion actual parent pointers for this + * subdirectory, this fs is too far gone to fix. + */ + if (dl->nr_paths >= XFS_MAXLINK) + return -ENOSR; + + trace_xchk_dirtree_create_path(sc, ip, dl->nr_paths, &xname, rec); + + /* + * Create a new xchk_path structure to remember this parent pointer + * and record the first name step. + */ + path = kmalloc(sizeof(struct xchk_dirpath), XCHK_GFP_FLAGS); + if (!path) + return -ENOMEM; + + INIT_LIST_HEAD(&path->list); + xino_bitmap_init(&path->seen_inodes); + path->nr_steps = 0; + path->outcome = XCHK_DIRPATH_SCANNING; + + error = xchk_dirpath_append(dl, sc->ip, path, &xname, rec); + if (error) + goto out_path; + + path->first_step = xfarray_length(dl->path_steps) - 1; + path->second_step = XFARRAY_NULLIDX; + path->path_nr = dl->nr_paths; + + list_add_tail(&path->list, &dl->path_list); + dl->nr_paths++; + return 0; +out_path: + kfree(path); + return error; +} + +/* + * Validate that the first step of this path still has a corresponding + * parent pointer in @sc->ip. We probably dropped @sc->ip's ILOCK while + * walking towards the roots, which is why this is necessary. + * + * This function has a side effect of loading the first parent pointer of this + * path into the parent pointer scratch pad. This prepares us to walk up the + * directory tree towards the root. Returns -ESTALE if the scan data is now + * out of date. + */ +STATIC int +xchk_dirpath_revalidate( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xfs_scrub *sc = dl->sc; + int error; + + /* + * Look up the parent pointer that corresponds to the start of this + * path. If the parent pointer has disappeared on us, dump all the + * scan results and try again. + */ + error = xfs_parent_lookup(sc->tp, sc->ip, &dl->xname, &dl->pptr_rec, + &dl->pptr_args); + if (error == -ENOATTR) { + trace_xchk_dirpath_disappeared(dl->sc, sc->ip, path->path_nr, + path->first_step, &dl->xname, &dl->pptr_rec); + dl->stale = true; + return -ESTALE; + } + + return error; +} + +/* + * Walk the parent pointers of a directory at the end of a path and record + * the parent that we find in @dl->xname/pptr_rec. 
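+ *
+ * This is an xchk_xattr_walk callback; when the walk completes,
+ * @dl->parents_found tells us what we saw:
+ *
+ *	0:  no parent pointers at all; the path dead-ends here
+ *	1:  the usual case; @dl->xname/pptr_rec name the next step up
+ *	2+: the walk bails out early with -EMLINK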
+ */ +STATIC int +xchk_dirpath_find_next_step( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xchk_dirtree *dl = priv; + const struct xfs_parent_rec *rec = value; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, NULL, NULL); + if (error) + return error; + + /* + * If we've already set @dl->pptr_rec, then this directory has multiple + * parents. Signal this back to the caller via -EMLINK. + */ + if (dl->parents_found > 0) + return -EMLINK; + + dl->parents_found++; + memcpy(dl->namebuf, name, namelen); + dl->xname.len = namelen; + dl->pptr_rec = *rec; /* struct copy */ + return 0; +} + +/* Set and log the outcome of a path walk. */ +static inline void +xchk_dirpath_set_outcome( + struct xchk_dirtree *dl, + struct xchk_dirpath *path, + enum xchk_dirpath_outcome outcome) +{ + trace_xchk_dirpath_set_outcome(dl->sc, path->path_nr, path->nr_steps, + outcome); + + path->outcome = outcome; +} + +/* + * Scan the directory at the end of this path for its parent directory link. + * If we find one, extend the path. Returns -ESTALE if the scan data is out + * of date. Returns -EFSCORRUPTED if the parent pointer is bad; or -ELNRNG if + * the path got too deep. + */ +STATIC int +xchk_dirpath_step_up( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xfs_scrub *sc = dl->sc; + struct xfs_inode *dp; + xfs_ino_t parent_ino = be64_to_cpu(dl->pptr_rec.p_ino); + unsigned int lock_mode; + int error; + + /* Grab and lock the parent directory. */ + error = xchk_iget(sc, parent_ino, &dp); + if (error) + return error; + + lock_mode = xfs_ilock_attr_map_shared(dp); + mutex_lock(&dl->lock); + + if (dl->stale) { + error = -ESTALE; + goto out_scanlock; + } + + /* We've reached the root directory; the path is ok. */ + if (parent_ino == dl->root_ino) { + xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_OK); + error = 0; + goto out_scanlock; + } + + /* + * The inode being scanned is its own distant ancestor! Get rid of + * this path. + */ + if (parent_ino == sc->ip->i_ino) { + xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + error = 0; + goto out_scanlock; + } + + /* + * We've seen this inode before during the path walk. There's a loop + * above us in the directory tree. This probably means that we cannot + * continue, but let's keep walking paths to get a full picture. + */ + if (xino_bitmap_test(&path->seen_inodes, parent_ino)) { + xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_LOOP); + error = 0; + goto out_scanlock; + } + + /* The handle encoded in the parent pointer must match. */ + if (VFS_I(dp)->i_generation != be32_to_cpu(dl->pptr_rec.p_gen)) { + trace_xchk_dirpath_badgen(dl->sc, dp, path->path_nr, + path->nr_steps, &dl->xname, &dl->pptr_rec); + error = -EFSCORRUPTED; + goto out_scanlock; + } + + /* Parent pointer must point up to a directory. */ + if (!S_ISDIR(VFS_I(dp)->i_mode)) { + trace_xchk_dirpath_nondir_parent(dl->sc, dp, path->path_nr, + path->nr_steps, &dl->xname, &dl->pptr_rec); + error = -EFSCORRUPTED; + goto out_scanlock; + } + + /* Parent cannot be an unlinked directory.
*/ + if (VFS_I(dp)->i_nlink == 0) { + trace_xchk_dirpath_unlinked_parent(dl->sc, dp, path->path_nr, + path->nr_steps, &dl->xname, &dl->pptr_rec); + error = -EFSCORRUPTED; + goto out_scanlock; + } + + /* + * If the extended attributes look as though they have been zapped by + * the inode record repair code, we cannot scan for parent pointers. + */ + if (xchk_pptr_looks_zapped(dp)) { + error = -EBUSY; + xchk_set_incomplete(sc); + goto out_scanlock; + } + + /* + * Walk the parent pointers of @dp to find the parent of this directory, + * which is the next step in our walk. If we find that @dp has exactly + * one parent, the parent pointer information will be stored in + * @dl->pptr_rec. This prepares us for the next step of the walk. + */ + mutex_unlock(&dl->lock); + dl->parents_found = 0; + error = xchk_xattr_walk(sc, dp, xchk_dirpath_find_next_step, NULL, dl); + mutex_lock(&dl->lock); + if (error == -EFSCORRUPTED || error == -EMLINK || + (!error && dl->parents_found == 0)) { + /* + * Further up the directory tree from @sc->ip, we found a + * corrupt parent pointer, multiple parent pointers while + * finding this directory's parent, or zero parents despite + * having a nonzero link count. Keep looking for other paths. + */ + xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_CORRUPT); + error = 0; + goto out_scanlock; + } + if (error) + goto out_scanlock; + + if (dl->stale) { + error = -ESTALE; + goto out_scanlock; + } + + trace_xchk_dirpath_found_next_step(sc, dp, path->path_nr, + path->nr_steps, &dl->xname, &dl->pptr_rec); + + /* Append to the path steps */ + error = xchk_dirpath_append(dl, dp, path, &dl->xname, &dl->pptr_rec); + if (error) + goto out_scanlock; + + if (path->second_step == XFARRAY_NULLIDX) + path->second_step = xfarray_length(dl->path_steps) - 1; + +out_scanlock: + mutex_unlock(&dl->lock); + xfs_iunlock(dp, lock_mode); + xchk_irele(sc, dp); + return error; +} + +/* + * Walk the directory tree upwards towards what is hopefully the root + * directory, recording path steps as we go. The current path components are + * stored in dl->pptr_rec and dl->xname. + * + * Returns -ESTALE if the scan data are out of date. Returns -EFSCORRUPTED + * only if the direct parent pointer of @sc->ip associated with this path is + * corrupt. + */ +STATIC int +xchk_dirpath_walk_upwards( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xfs_scrub *sc = dl->sc; + int error; + + ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); + + /* Reload the start of this path and make sure it's still there. */ + error = xchk_dirpath_revalidate(dl, path); + if (error) + return error; + + trace_xchk_dirpath_walk_upwards(sc, sc->ip, path->path_nr, &dl->xname, + &dl->pptr_rec); + + /* + * The inode being scanned is its own direct ancestor! + * Get rid of this path. + */ + if (be64_to_cpu(dl->pptr_rec.p_ino) == sc->ip->i_ino) { + xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + return 0; + } + + /* + * Drop ILOCK_EXCL on the inode being scanned. We still hold + * IOLOCK_EXCL on it, so it cannot move around or be renamed. + * + * Beyond this point we're walking up the directory tree, which means + * that we can acquire and drop the ILOCK on an alias of sc->ip. The + * ILOCK state is no longer tracked in the scrub context. Hence we + * must drop @sc->ip's ILOCK during the walk. + */ + mutex_unlock(&dl->lock); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* + * Take the first step in the walk towards the root by checking the + * start of this path, which is a direct parent pointer of @sc->ip.
+ * If we see any kind of error here (including corruptions), the parent + * pointer of @sc->ip is corrupt. Stop the whole scan. + */ + error = xchk_dirpath_step_up(dl, path); + if (error) { + xchk_ilock(sc, XFS_ILOCK_EXCL); + mutex_lock(&dl->lock); + return error; + } + + /* + * Take steps upward from the second step in this path towards the + * root. If we hit corruption errors here, there's a problem + * *somewhere* in the path, but we don't need to stop scanning. + */ + while (!error && path->outcome == XCHK_DIRPATH_SCANNING) + error = xchk_dirpath_step_up(dl, path); + + /* Retake the locks we had, mark paths, etc. */ + xchk_ilock(sc, XFS_ILOCK_EXCL); + mutex_lock(&dl->lock); + if (error == -EFSCORRUPTED) { + xchk_dirpath_set_outcome(dl, path, XCHK_DIRPATH_CORRUPT); + error = 0; + } + if (!error && dl->stale) + return -ESTALE; + return error; +} + +/* + * Decide if this path step has been touched by this live update. Returns + * 1 for yes, 0 for no, or a negative errno. + */ +STATIC int +xchk_dirpath_step_is_stale( + struct xchk_dirtree *dl, + struct xchk_dirpath *path, + unsigned int step_nr, + xfarray_idx_t step_idx, + struct xfs_dir_update_params *p, + xfs_ino_t *cursor) +{ + struct xchk_dirpath_step step; + xfs_ino_t child_ino = *cursor; + int error; + + error = xfarray_load(dl->path_steps, step_idx, &step); + if (error) + return error; + *cursor = be64_to_cpu(step.pptr_rec.p_ino); + + /* + * If the parent and child being updated are not the ones mentioned in + * this path step, the scan data is still ok. + */ + if (p->ip->i_ino != child_ino || p->dp->i_ino != *cursor) + return 0; + + /* + * If the dirent name lengths or byte sequences are different, the scan + * data is still ok. + */ + if (p->name->len != step.name_len) + return 0; + + error = xfblob_loadname(dl->path_names, step.name_cookie, + &dl->hook_xname, step.name_len); + if (error) + return error; + + if (memcmp(dl->hook_xname.name, p->name->name, p->name->len) != 0) + return 0; + + /* + * If the update comes from the repair code itself, walk the state + * machine forward. + */ + if (p->ip->i_ino == dl->scan_ino && + path->outcome == XREP_DIRPATH_ADOPTING) { + xchk_dirpath_set_outcome(dl, path, XREP_DIRPATH_ADOPTED); + return 0; + } + + if (p->ip->i_ino == dl->scan_ino && + path->outcome == XREP_DIRPATH_DELETING) { + xchk_dirpath_set_outcome(dl, path, XREP_DIRPATH_DELETED); + return 0; + } + + /* Exact match, scan data is out of date. */ + trace_xchk_dirpath_changed(dl->sc, path->path_nr, step_nr, p->dp, + p->ip, p->name); + return 1; +} + +/* + * Decide if this path has been touched by this live update. Returns 1 for + * yes, 0 for no, or a negative errno. + */ +STATIC int +xchk_dirpath_is_stale( + struct xchk_dirtree *dl, + struct xchk_dirpath *path, + struct xfs_dir_update_params *p) +{ + xfs_ino_t cursor = dl->scan_ino; + xfarray_idx_t idx = path->first_step; + unsigned int i; + int ret; + + /* + * The child being updated has not been seen by this path at all; this + * path cannot be stale. + */ + if (!xino_bitmap_test(&path->seen_inodes, p->ip->i_ino)) + return 0; + + ret = xchk_dirpath_step_is_stale(dl, path, 0, idx, p, &cursor); + if (ret != 0) + return ret; + + for (i = 1, idx = path->second_step; i < path->nr_steps; i++, idx++) { + ret = xchk_dirpath_step_is_stale(dl, path, i, idx, p, &cursor); + if (ret != 0) + return ret; + } + + return 0; +} + +/* + * Decide if a directory update from the regular filesystem touches any of the + * paths we've scanned, and invalidate the scan data if true. 
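+ *
+ * For example, if one recorded path contains the step B -> A via a
+ * dirent named "foo", and another thread then removes "foo" from A,
+ * xchk_dirpath_step_is_stale() matches the update's (parent, child,
+ * name) triple against that recorded step and the whole scan is marked
+ * stale so that it can be restarted.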
+ */ +STATIC int +xchk_dirtree_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xchk_dirtree *dl; + struct xchk_dirpath *path; + int ret; + + dl = container_of(nb, struct xchk_dirtree, dhook.dirent_hook.nb); + + trace_xchk_dirtree_live_update(dl->sc, p->dp, action, p->ip, p->delta, + p->name); + + mutex_lock(&dl->lock); + + if (dl->stale || dl->aborted) + goto out_unlock; + + xchk_dirtree_for_each_path(dl, path) { + ret = xchk_dirpath_is_stale(dl, path, p); + if (ret < 0) { + dl->aborted = true; + break; + } + if (ret == 1) { + dl->stale = true; + break; + } + } + +out_unlock: + mutex_unlock(&dl->lock); + return NOTIFY_DONE; +} + +/* Delete all the collected path information. */ +STATIC void +xchk_dirtree_reset( + void *buf) +{ + struct xchk_dirtree *dl = buf; + struct xchk_dirpath *path, *n; + + ASSERT(dl->sc->ilock_flags & XFS_ILOCK_EXCL); + + xchk_dirtree_for_each_path_safe(dl, path, n) { + list_del_init(&path->list); + xino_bitmap_destroy(&path->seen_inodes); + kfree(path); + } + dl->nr_paths = 0; + + xfarray_truncate(dl->path_steps); + xfblob_truncate(dl->path_names); + + dl->stale = false; +} + +/* + * Load the name/pptr from the first step in this path into @dl->pptr_rec and + * @dl->xname. + */ +STATIC int +xchk_dirtree_load_path( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xchk_dirpath_step step; + int error; + + error = xfarray_load(dl->path_steps, path->first_step, &step); + if (error) + return error; + + error = xfblob_loadname(dl->path_names, step.name_cookie, &dl->xname, + step.name_len); + if (error) + return error; + + dl->pptr_rec = step.pptr_rec; /* struct copy */ + return 0; +} + +/* + * For each parent pointer of this subdir, trace a path upwards towards the + * root directory and record what we find. Returns 0 for success; + * -EFSCORRUPTED if walking the parent pointers of @sc->ip failed, -ELNRNG if a + * path was too deep; -ENOSR if there were too many parent pointers; or + * a negative errno. + */ +int +xchk_dirtree_find_paths_to_root( + struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + struct xchk_dirpath *path; + int error = 0; + + do { + if (xchk_should_terminate(sc, &error)) + return error; + + xchk_dirtree_reset(dl); + + /* + * If the extended attributes look as though they have been + * zapped by the inode record repair code, we cannot scan for + * parent pointers. + */ + if (xchk_pptr_looks_zapped(sc->ip)) { + xchk_set_incomplete(sc); + return -EBUSY; + } + + /* + * Create path walk contexts for each parent of the directory + * that is being scanned. Directories are supposed to have + * only one parent, but this is how we detect multiple parents. + */ + error = xchk_xattr_walk(sc, sc->ip, xchk_dirtree_create_path, + NULL, dl); + if (error) + return error; + + xchk_dirtree_for_each_path(dl, path) { + /* Load path components into dl->pptr/xname */ + error = xchk_dirtree_load_path(dl, path); + if (error) + return error; + + /* + * Try to walk up each path to the root. This enables + * us to find directory loops in ancestors, and the + * like. + */ + error = xchk_dirpath_walk_upwards(dl, path); + if (error == -EFSCORRUPTED) { + /* + * A parent pointer of @sc->ip is bad, don't + * bother continuing. + */ + break; + } + if (error == -ESTALE) { + /* This had better be an invalidation.
*/ + ASSERT(dl->stale); + break; + } + if (error) + return error; + if (dl->aborted) + return 0; + } + } while (dl->stale); + + return error; +} + +/* + * Figure out what to do with the paths we tried to find. Do not call this + * if the scan results are stale. + */ +void +xchk_dirtree_evaluate( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + + ASSERT(!dl->stale); + + /* Scan the paths we have to decide what to do. */ + memset(oc, 0, sizeof(struct xchk_dirtree_outcomes)); + xchk_dirtree_for_each_path(dl, path) { + trace_xchk_dirpath_evaluate_path(dl->sc, path->path_nr, + path->nr_steps, path->outcome); + + switch (path->outcome) { + case XCHK_DIRPATH_SCANNING: + /* shouldn't get here */ + ASSERT(0); + break; + case XCHK_DIRPATH_DELETE: + /* This one is already going away. */ + oc->bad++; + break; + case XCHK_DIRPATH_CORRUPT: + case XCHK_DIRPATH_LOOP: + /* Couldn't find the end of this path. */ + oc->suspect++; + break; + case XCHK_DIRPATH_STALE: + /* shouldn't get here either */ + ASSERT(0); + break; + case XCHK_DIRPATH_OK: + /* This path got all the way to the root. */ + oc->good++; + break; + case XREP_DIRPATH_DELETING: + case XREP_DIRPATH_DELETED: + case XREP_DIRPATH_ADOPTING: + case XREP_DIRPATH_ADOPTED: + /* These should not be in progress! */ + ASSERT(0); + break; + } + } + + trace_xchk_dirtree_evaluate(dl, oc); +} + +/* Look for directory loops. */ +int +xchk_dirtree( + struct xfs_scrub *sc) +{ + struct xchk_dirtree_outcomes oc; + struct xchk_dirtree *dl = sc->buf; + int error; + + /* + * Nondirectories do not point downwards to other files, so they cannot + * cause a cycle in the directory tree. + */ + if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) + return -ENOENT; + + ASSERT(xfs_has_parent(sc->mp)); + + /* + * Find the root of the directory tree. Remember which directory to + * scan, because the hook doesn't detach until after sc->ip gets + * released during teardown. + */ + dl->root_ino = sc->mp->m_rootip->i_ino; + dl->scan_ino = sc->ip->i_ino; + + trace_xchk_dirtree_start(sc->ip, sc->sm, 0); + + /* + * Hook into the directory entry code so that we can capture updates to + * paths that we have already scanned. The scanner thread takes each + * directory's ILOCK, which means that any in-progress directory update + * will finish before we can scan the directory. + */ + ASSERT(sc->flags & XCHK_FSGATES_DIRENTS); + xfs_dir_hook_setup(&dl->dhook, xchk_dirtree_live_update); + error = xfs_dir_hook_add(sc->mp, &dl->dhook); + if (error) + goto out; + + mutex_lock(&dl->lock); + + /* Trace each parent pointer's path to the root. */ + error = xchk_dirtree_find_paths_to_root(dl); + if (error == -EFSCORRUPTED || error == -ELNRNG || error == -ENOSR) { + /* + * Don't bother walking the paths if the xattr structure or the + * parent pointers are corrupt; this scan cannot be completed + * without full information. + */ + xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); + error = 0; + goto out_scanlock; + } + if (error == -EBUSY) { + /* + * We couldn't scan some directory's parent pointers because + * the attr fork looked like it had been zapped. The + * scan was marked incomplete, so no further error code + * is necessary. + */ + error = 0; + goto out_scanlock; + } + if (error) + goto out_scanlock; + if (dl->aborted) { + xchk_set_incomplete(sc); + goto out_scanlock; + } + + /* Assess what we found in our path evaluation. 
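+	 * For a parentless directory (the root, or an unlinked directory)
+	 * any surviving path at all is corruption; for everything else we
+	 * want exactly one good path to the root and nothing suspect.  In
+	 * other words, good == 1 && bad == 0 && suspect == 0 is the only
+	 * clean outcome, and good > 1 means extra parent links have to be
+	 * deleted.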
*/ + xchk_dirtree_evaluate(dl, &oc); + if (xchk_dirtree_parentless(dl)) { + if (oc.good || oc.bad || oc.suspect) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } else { + if (oc.bad || oc.good + oc.suspect != 1) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + if (oc.suspect) + xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); + } + +out_scanlock: + mutex_unlock(&dl->lock); +out: + trace_xchk_dirtree_done(sc->ip, sc->sm, error); + return error; +} diff --git a/fs/xfs/scrub/dirtree.h b/fs/xfs/scrub/dirtree.h new file mode 100644 index 000000000000..1e1686365c61 --- /dev/null +++ b/fs/xfs/scrub/dirtree.h @@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_DIRTREE_H__ +#define __XFS_SCRUB_DIRTREE_H__ + +/* + * Each of these represents one parent pointer path step in a chain going + * up towards the directory tree root. These are stored inside an xfarray. + */ +struct xchk_dirpath_step { + /* Directory entry name associated with this parent link. */ + xfblob_cookie name_cookie; + unsigned int name_len; + + /* Handle of the parent directory. */ + struct xfs_parent_rec pptr_rec; +}; + +enum xchk_dirpath_outcome { + XCHK_DIRPATH_SCANNING = 0, /* still being put together */ + XCHK_DIRPATH_DELETE, /* delete this path */ + XCHK_DIRPATH_CORRUPT, /* corruption detected in path */ + XCHK_DIRPATH_LOOP, /* cycle detected further up */ + XCHK_DIRPATH_STALE, /* path is stale */ + XCHK_DIRPATH_OK, /* path reaches the root */ + + XREP_DIRPATH_DELETING, /* path is being deleted */ + XREP_DIRPATH_DELETED, /* path has been deleted */ + XREP_DIRPATH_ADOPTING, /* path is being adopted */ + XREP_DIRPATH_ADOPTED, /* path has been adopted */ +}; + +/* + * Each of these represents one parent pointer path out of the directory being + * scanned. These exist in-core, and hopefully there aren't more than a + * handful of them. + */ +struct xchk_dirpath { + struct list_head list; + + /* Index of the first step in this path. */ + xfarray_idx_t first_step; + + /* Index of the second step in this path. */ + xfarray_idx_t second_step; + + /* Inodes seen while walking this path. */ + struct xino_bitmap seen_inodes; + + /* Number of steps in this path. */ + unsigned int nr_steps; + + /* Which path is this? */ + unsigned int path_nr; + + /* What did we conclude from following this path? */ + enum xchk_dirpath_outcome outcome; +}; + +struct xchk_dirtree_outcomes { + /* Number of XCHK_DIRPATH_DELETE */ + unsigned int bad; + + /* Number of XCHK_DIRPATH_CORRUPT or XCHK_DIRPATH_LOOP */ + unsigned int suspect; + + /* Number of XCHK_DIRPATH_OK */ + unsigned int good; + + /* Directory needs to be added to lost+found */ + bool needs_adoption; +}; + +struct xchk_dirtree { + struct xfs_scrub *sc; + + /* Root inode that we're looking for. */ + xfs_ino_t root_ino; + + /* + * This is the inode that we're scanning. The live update hook can + * continue to be called after xchk_teardown drops sc->ip but before + * it calls buf_cleanup, so we keep a copy. + */ + xfs_ino_t scan_ino; + + /* + * If we start deleting redundant paths to this subdirectory, this is + * the inode number of the surviving parent and the dotdot entry will + * be set to this value. If the value is NULLFSINO, then use @root_ino + * as a stand-in until the orphanage can adopt the subdirectory. 
+ */ + xfs_ino_t parent_ino; + + /* Scratch buffer for scanning pptr xattrs */ + struct xfs_parent_rec pptr_rec; + struct xfs_da_args pptr_args; + + /* Name buffer */ + struct xfs_name xname; + char namebuf[MAXNAMELEN]; + + /* Information for reparenting this directory. */ + struct xrep_adoption adoption; + + /* + * Hook into directory updates so that we can receive live updates + * from other writer threads. + */ + struct xfs_dir_hook dhook; + + /* Parent pointer update arguments. */ + struct xfs_parent_args ppargs; + + /* lock for everything below here */ + struct mutex lock; + + /* buffer for the live update functions to use for dirent names */ + struct xfs_name hook_xname; + unsigned char hook_namebuf[MAXNAMELEN]; + + /* + * All path steps observed during this scan. Each of the path + * steps for a particular pathwalk are recorded in sequential + * order in the xfarray. A pathwalk ends either with a step + * pointing to the root directory (success) or pointing to NULLFSINO + * (loop detected, empty dir detected, etc). + */ + struct xfarray *path_steps; + + /* All names observed during this scan. */ + struct xfblob *path_names; + + /* All paths being tracked by this scanner. */ + struct list_head path_list; + + /* Number of paths in path_list. */ + unsigned int nr_paths; + + /* Number of parents found by a pptr scan. */ + unsigned int parents_found; + + /* Have the path data been invalidated by a concurrent update? */ + bool stale:1; + + /* Has the scan been aborted? */ + bool aborted:1; +}; + +#define xchk_dirtree_for_each_path_safe(dl, path, n) \ + list_for_each_entry_safe((path), (n), &(dl)->path_list, list) + +#define xchk_dirtree_for_each_path(dl, path) \ + list_for_each_entry((path), &(dl)->path_list, list) + +static inline bool +xchk_dirtree_parentless(const struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + + if (sc->ip == sc->mp->m_rootip) + return true; + if (VFS_I(sc->ip)->i_nlink == 0) + return true; + return false; +} + +int xchk_dirtree_find_paths_to_root(struct xchk_dirtree *dl); +int xchk_dirpath_append(struct xchk_dirtree *dl, struct xfs_inode *ip, + struct xchk_dirpath *path, const struct xfs_name *name, + const struct xfs_parent_rec *pptr); +void xchk_dirtree_evaluate(struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc); + +#endif /* __XFS_SCRUB_DIRTREE_H__ */ diff --git a/fs/xfs/scrub/dirtree_repair.c b/fs/xfs/scrub/dirtree_repair.c new file mode 100644 index 000000000000..5c04e70ba951 --- /dev/null +++ b/fs/xfs/scrub/dirtree_repair.c @@ -0,0 +1,821 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_trans_space.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_attr.h" +#include "xfs_parent.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/bitmap.h" +#include "scrub/ino_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" +#include "scrub/listxattr.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/orphanage.h" +#include "scrub/dirtree.h" +#include "scrub/readdir.h" + +/* + * Directory Tree Structure Repairs + * ================================ + * + * If we decide that the directory being scanned is participating in a + * directory loop, the only change we can make is to remove directory entries + * pointing down to @sc->ip. If that leaves it with no parents, the directory + * should be adopted by the orphanage. + */ + +/* Set up to repair directory loops. */ +int +xrep_setup_dirtree( + struct xfs_scrub *sc) +{ + return xrep_orphanage_try_create(sc); +} + +/* Change the outcome of this path. */ +static inline void +xrep_dirpath_set_outcome( + struct xchk_dirtree *dl, + struct xchk_dirpath *path, + enum xchk_dirpath_outcome outcome) +{ + trace_xrep_dirpath_set_outcome(dl->sc, path->path_nr, path->nr_steps, + outcome); + + path->outcome = outcome; +} + +/* Delete all paths. */ +STATIC void +xrep_dirtree_delete_all_paths( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + + xchk_dirtree_for_each_path(dl, path) { + switch (path->outcome) { + case XCHK_DIRPATH_CORRUPT: + case XCHK_DIRPATH_LOOP: + oc->suspect--; + oc->bad++; + xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + break; + case XCHK_DIRPATH_OK: + oc->good--; + oc->bad++; + xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + break; + default: + break; + } + } + + ASSERT(oc->suspect == 0); + ASSERT(oc->good == 0); +} + +/* Since this is the surviving path, set the dotdot entry to this value. */ +STATIC void +xrep_dirpath_retain_parent( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xchk_dirpath_step step; + int error; + + error = xfarray_load(dl->path_steps, path->first_step, &step); + if (error) + return; + + dl->parent_ino = be64_to_cpu(step.pptr_rec.p_ino); +} + +/* Find the one surviving path so we know how to set dotdot. */ +STATIC void +xrep_dirtree_find_surviving_path( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + bool foundit = false; + + xchk_dirtree_for_each_path(dl, path) { + switch (path->outcome) { + case XCHK_DIRPATH_CORRUPT: + case XCHK_DIRPATH_LOOP: + case XCHK_DIRPATH_OK: + if (!foundit) { + xrep_dirpath_retain_parent(dl, path); + foundit = true; + continue; + } + ASSERT(foundit == false); + break; + default: + break; + } + } + + ASSERT(oc->suspect + oc->good == 1); +} + +/* Delete all paths except for the one good one. 
*/ +STATIC void +xrep_dirtree_keep_one_good_path( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + bool foundit = false; + + xchk_dirtree_for_each_path(dl, path) { + switch (path->outcome) { + case XCHK_DIRPATH_CORRUPT: + case XCHK_DIRPATH_LOOP: + oc->suspect--; + oc->bad++; + xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + break; + case XCHK_DIRPATH_OK: + if (!foundit) { + xrep_dirpath_retain_parent(dl, path); + foundit = true; + continue; + } + oc->good--; + oc->bad++; + xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + break; + default: + break; + } + } + + ASSERT(oc->suspect == 0); + ASSERT(oc->good < 2); +} + +/* Delete all paths except for one suspect one. */ +STATIC void +xrep_dirtree_keep_one_suspect_path( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + bool foundit = false; + + xchk_dirtree_for_each_path(dl, path) { + switch (path->outcome) { + case XCHK_DIRPATH_CORRUPT: + case XCHK_DIRPATH_LOOP: + if (!foundit) { + xrep_dirpath_retain_parent(dl, path); + foundit = true; + continue; + } + oc->suspect--; + oc->bad++; + xrep_dirpath_set_outcome(dl, path, XCHK_DIRPATH_DELETE); + break; + case XCHK_DIRPATH_OK: + ASSERT(0); + break; + default: + break; + } + } + + ASSERT(oc->suspect == 1); + ASSERT(oc->good == 0); +} + +/* + * Figure out what to do with the paths we tried to find. Do not call this + * if the scan results are stale. + */ +STATIC void +xrep_dirtree_decide_fate( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + xchk_dirtree_evaluate(dl, oc); + + /* Parentless directories should not have any paths at all. */ + if (xchk_dirtree_parentless(dl)) { + xrep_dirtree_delete_all_paths(dl, oc); + return; + } + + /* One path is exactly the number of paths we want. */ + if (oc->good + oc->suspect == 1) { + xrep_dirtree_find_surviving_path(dl, oc); + return; + } + + /* Zero paths means we should reattach the subdir to the orphanage. */ + if (oc->good + oc->suspect == 0) { + if (dl->sc->orphanage) + oc->needs_adoption = true; + return; + } + + /* + * Otherwise, this subdirectory has too many parents. If there's at + * least one good path, keep it and delete the others. + */ + if (oc->good > 0) { + xrep_dirtree_keep_one_good_path(dl, oc); + return; + } + + /* + * There are no good paths and there are too many suspect paths. + * Keep the first suspect path and delete the rest. + */ + xrep_dirtree_keep_one_suspect_path(dl, oc); +} + +/* + * Load the first step of this path into @step and @dl->xname/pptr + * for later repair work. + */ +STATIC int +xrep_dirtree_prep_path( + struct xchk_dirtree *dl, + struct xchk_dirpath *path, + struct xchk_dirpath_step *step) +{ + int error; + + error = xfarray_load(dl->path_steps, path->first_step, step); + if (error) + return error; + + error = xfblob_loadname(dl->path_names, step->name_cookie, &dl->xname, + step->name_len); + if (error) + return error; + + dl->pptr_rec = step->pptr_rec; /* struct copy */ + return 0; +} + +/* Delete the VFS dentry for a removed child. */ +STATIC int +xrep_dirtree_purge_dentry( + struct xchk_dirtree *dl, + struct xfs_inode *dp, + const struct xfs_name *name) +{ + struct qstr qname = QSTR_INIT(name->name, name->len); + struct dentry *parent_dentry, *child_dentry; + int error = 0; + + /* + * Find the dentry for the parent directory. If there isn't one, we're + * done. Caller already holds i_rwsem for parent and child. 
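The decision ladder in xrep_dirtree_decide_fate() can be modeled as a pure function. A user-space sketch with invented enum names (not kernel API):

#include <stdbool.h>

enum repair_action {
	DELETE_ALL_PATHS,	/* parentless: nothing may point here */
	KEEP_THE_ONLY_PATH,	/* exactly one path: record its parent */
	ADOPT_INTO_ORPHANAGE,	/* no paths at all: reattach to lost+found */
	KEEP_ONE_GOOD_PATH,	/* too many parents, at least one good */
	KEEP_ONE_SUSPECT_PATH,	/* too many parents, none of them good */
};

static enum repair_action decide_fate(bool parentless, unsigned int good,
				      unsigned int suspect)
{
	if (parentless)
		return DELETE_ALL_PATHS;
	if (good + suspect == 1)
		return KEEP_THE_ONLY_PATH;
	if (good + suspect == 0)
		return ADOPT_INTO_ORPHANAGE;
	return good > 0 ? KEEP_ONE_GOOD_PATH : KEEP_ONE_SUSPECT_PATH;
}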
+ */ + parent_dentry = d_find_alias(VFS_I(dp)); + if (!parent_dentry) + return 0; + + /* The VFS thinks the parent is a directory, right? */ + if (!d_is_dir(parent_dentry)) { + ASSERT(d_is_dir(parent_dentry)); + error = -EFSCORRUPTED; + goto out_dput_parent; + } + + /* + * Try to find the dirent pointing to the child. If there isn't one, + * we're done. + */ + qname.hash = full_name_hash(parent_dentry, name->name, name->len); + child_dentry = d_lookup(parent_dentry, &qname); + if (!child_dentry) { + error = 0; + goto out_dput_parent; + } + + trace_xrep_dirtree_delete_child(dp->i_mount, child_dentry); + + /* Child is not a directory? We're screwed. */ + if (!d_is_dir(child_dentry)) { + ASSERT(d_is_dir(child_dentry)); + error = -EFSCORRUPTED; + goto out_dput_child; + } + + /* Replace the child dentry with a negative one. */ + d_delete(child_dentry); + +out_dput_child: + dput(child_dentry); +out_dput_parent: + dput(parent_dentry); + return error; +} + +/* + * Prepare to delete a link by taking the IOLOCK of the parent and the child + * (scrub target). Caller must hold IOLOCK_EXCL on @sc->ip. Returns 0 if we + * took both locks, or a negative errno if we couldn't lock the parent in time. + */ +static inline int +xrep_dirtree_unlink_iolock( + struct xfs_scrub *sc, + struct xfs_inode *dp) +{ + int error; + + ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL); + + if (xfs_ilock_nowait(dp, XFS_IOLOCK_EXCL)) + return 0; + + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + do { + xfs_ilock(dp, XFS_IOLOCK_EXCL); + if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL)) + break; + xfs_iunlock(dp, XFS_IOLOCK_EXCL); + + if (xchk_should_terminate(sc, &error)) { + xchk_ilock(sc, XFS_IOLOCK_EXCL); + return error; + } + + delay(1); + } while (1); + + return 0; +} + +/* + * Remove a link from the directory tree and update the dcache. Returns + * -ESTALE if the scan data are now out of date. + */ +STATIC int +xrep_dirtree_unlink( + struct xchk_dirtree *dl, + struct xfs_inode *dp, + struct xchk_dirpath *path, + struct xchk_dirpath_step *step) +{ + struct xfs_scrub *sc = dl->sc; + struct xfs_mount *mp = sc->mp; + xfs_ino_t dotdot_ino; + xfs_ino_t parent_ino = dl->parent_ino; + unsigned int resblks; + int dontcare; + int error; + + /* Take IOLOCK_EXCL of the parent and child. */ + error = xrep_dirtree_unlink_iolock(sc, dp); + if (error) + return error; + + /* + * Create the transaction that we need to sever the path. Ignore + * EDQUOT and ENOSPC being returned via nospace_error because the + * directory code can handle a reservationless update. + */ + resblks = xfs_remove_space_res(mp, step->name_len); + error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, sc->ip, + &resblks, &sc->tp, &dontcare); + if (error) + goto out_iolock; + + /* + * Cancel if someone invalidated the paths while we were trying to get + * the ILOCK. + */ + mutex_lock(&dl->lock); + if (dl->stale) { + mutex_unlock(&dl->lock); + error = -ESTALE; + goto out_trans_cancel; + } + xrep_dirpath_set_outcome(dl, path, XREP_DIRPATH_DELETING); + mutex_unlock(&dl->lock); + + trace_xrep_dirtree_delete_path(dl->sc, sc->ip, path->path_nr, + &dl->xname, &dl->pptr_rec); + + /* + * Decide if we need to reset the dotdot entry. Rules: + * + * - If there's a surviving parent, we want dotdot to point there. + * - If we don't have any surviving parents, then point dotdot at the + * root dir. + * - If dotdot is already set to the value we want, pass in NULLFSINO + * for no change necessary. + * + * Do this /before/ we dirty anything, in case the dotdot lookup + * fails. 
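The dotdot rules spelled out above boil down to a three-line computation, sketched here in standalone form (NULLFSINO is a stand-in for the kernel constant of the same name):

#include <stdint.h>

#define NULLFSINO ((uint64_t)-1)	/* stand-in sentinel */

/* Returns the inode to write into dotdot, or NULLFSINO for "no change". */
static uint64_t dotdot_replacement(uint64_t dotdot_ino, uint64_t parent_ino,
				   uint64_t root_ino)
{
	if (parent_ino == NULLFSINO)	/* no survivor: point at the root */
		parent_ino = root_ino;
	if (dotdot_ino == parent_ino)	/* already correct: no change */
		return NULLFSINO;
	return parent_ino;
}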
+ */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &dotdot_ino); + if (error) + goto out_trans_cancel; + if (parent_ino == NULLFSINO) + parent_ino = dl->root_ino; + if (dotdot_ino == parent_ino) + parent_ino = NULLFSINO; + + /* Drop the link from sc->ip's dotdot entry. */ + error = xfs_droplink(sc->tp, dp); + if (error) + goto out_trans_cancel; + + /* Reset the dotdot entry to a surviving parent. */ + if (parent_ino != NULLFSINO) { + error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot, + parent_ino, 0); + if (error) + goto out_trans_cancel; + } + + /* Drop the link from dp to sc->ip. */ + error = xfs_droplink(sc->tp, sc->ip); + if (error) + goto out_trans_cancel; + + error = xfs_dir_removename(sc->tp, dp, &dl->xname, sc->ip->i_ino, + resblks); + if (error) { + ASSERT(error != -ENOENT); + goto out_trans_cancel; + } + + if (xfs_has_parent(sc->mp)) { + error = xfs_parent_removename(sc->tp, &dl->ppargs, dp, + &dl->xname, sc->ip); + if (error) + goto out_trans_cancel; + } + + /* + * Notify dirent hooks that we removed the bad link, invalidate the + * dcache, and commit the repair. + */ + xfs_dir_update_hook(dp, sc->ip, -1, &dl->xname); + error = xrep_dirtree_purge_dentry(dl, dp, &dl->xname); + if (error) + goto out_trans_cancel; + + error = xrep_trans_commit(sc); + goto out_ilock; + +out_trans_cancel: + xchk_trans_cancel(sc); +out_ilock: + xfs_iunlock(sc->ip, XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); +out_iolock: + xfs_iunlock(dp, XFS_IOLOCK_EXCL); + return error; +} + +/* + * Delete a directory entry that points to this directory. Returns -ESTALE + * if the scan data are now out of date. + */ +STATIC int +xrep_dirtree_delete_path( + struct xchk_dirtree *dl, + struct xchk_dirpath *path) +{ + struct xchk_dirpath_step step; + struct xfs_scrub *sc = dl->sc; + struct xfs_inode *dp; + int error; + + /* + * Load the parent pointer and directory inode for this path, then + * drop the scan lock, the ILOCK, and the transaction so that + * _unlink can reserve the proper transaction. This sets up + * @dl->xname for the deletion. + */ + error = xrep_dirtree_prep_path(dl, path, &step); + if (error) + return error; + + error = xchk_iget(sc, be64_to_cpu(step.pptr_rec.p_ino), &dp); + if (error) + return error; + + mutex_unlock(&dl->lock); + xchk_trans_cancel(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* Delete the directory link and release the parent. */ + error = xrep_dirtree_unlink(dl, dp, path, &step); + xchk_irele(sc, dp); + + /* + * Retake all the resources we had at the beginning even if the repair + * failed or the scan data are now stale. This keeps things simple for + * the caller. + */ + xchk_trans_alloc_empty(sc); + xchk_ilock(sc, XFS_ILOCK_EXCL); + mutex_lock(&dl->lock); + + if (!error && dl->stale) + error = -ESTALE; + return error; +} + +/* Add a new path to represent our in-progress adoption. */ +STATIC int +xrep_dirtree_create_adoption_path( + struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + struct xchk_dirpath *path; + int error; + + /* + * We should have capped the number of paths at XFS_MAXLINK-1 in the + * scanner. + */ + if (dl->nr_paths > XFS_MAXLINK) { + ASSERT(dl->nr_paths <= XFS_MAXLINK); + return -EFSCORRUPTED; + } + + /* + * Create a new xchk_dirpath structure to remember this parent pointer + * and record the first name step. 
+ */ + path = kmalloc(sizeof(struct xchk_dirpath), XCHK_GFP_FLAGS); + if (!path) + return -ENOMEM; + + INIT_LIST_HEAD(&path->list); + xino_bitmap_init(&path->seen_inodes); + path->nr_steps = 0; + path->outcome = XREP_DIRPATH_ADOPTING; + + /* + * Record the new link that we just created in the orphanage. Because + * adoption is the last repair that we perform, we don't bother filling + * in the path all the way back to the root. + */ + xfs_inode_to_parent_rec(&dl->pptr_rec, sc->orphanage); + + error = xino_bitmap_set(&path->seen_inodes, sc->orphanage->i_ino); + if (error) + goto out_path; + + trace_xrep_dirtree_create_adoption(sc, sc->ip, dl->nr_paths, + &dl->xname, &dl->pptr_rec); + + error = xchk_dirpath_append(dl, sc->ip, path, &dl->xname, + &dl->pptr_rec); + if (error) + goto out_path; + + path->first_step = xfarray_length(dl->path_steps) - 1; + path->second_step = XFARRAY_NULLIDX; + path->path_nr = dl->nr_paths; + + list_add_tail(&path->list, &dl->path_list); + dl->nr_paths++; + return 0; + +out_path: + kfree(path); + return error; +} + +/* + * Prepare to move a file to the orphanage by taking the IOLOCK of the + * orphanage and the child (scrub target). Caller must hold IOLOCK_EXCL on + * @sc->ip. Returns 0 if we took both locks, or a negative errno if we + * couldn't lock the orphanage in time. + */ +static inline int +xrep_dirtree_adopt_iolock( + struct xfs_scrub *sc) +{ + int error; + + ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL); + + if (xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) + return 0; + + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + do { + xrep_orphanage_ilock(sc, XFS_IOLOCK_EXCL); + if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL)) + break; + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + + if (xchk_should_terminate(sc, &error)) { + xchk_ilock(sc, XFS_IOLOCK_EXCL); + return error; + } + + delay(1); + } while (1); + + return 0; +} + +/* + * Reattach this orphaned directory to the orphanage. Do not call this with + * any resources held. Returns -ESTALE if the scan data have become out of + * date. + */ +STATIC int +xrep_dirtree_adopt( + struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + int error; + + /* Take the IOLOCK of the orphanage and the scrub target. */ + error = xrep_dirtree_adopt_iolock(sc); + if (error) + return error; + + /* + * Set up for an adoption. The directory tree fixer runs after the + * link counts have been corrected. Therefore, we must bump the + * child's link count since there will be no further opportunity to fix + * errors. + */ + error = xrep_adoption_trans_alloc(sc, &dl->adoption); + if (error) + goto out_iolock; + dl->adoption.bump_child_nlink = true; + + /* Figure out what name we're going to use here. */ + error = xrep_adoption_compute_name(&dl->adoption, &dl->xname); + if (error) + goto out_trans; + + /* + * Now that we have a proposed name for the orphanage entry, create + * a faux path so that the live update hook will see it. + */ + mutex_lock(&dl->lock); + if (dl->stale) { + mutex_unlock(&dl->lock); + error = -ESTALE; + goto out_trans; + } + error = xrep_dirtree_create_adoption_path(dl); + mutex_unlock(&dl->lock); + if (error) + goto out_trans; + + /* Reparent the directory. */ + error = xrep_adoption_move(&dl->adoption); + if (error) + goto out_trans; + + /* + * Commit the name and release all inode locks except for the scrub + * target's IOLOCK. 
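Both xrep_dirtree_unlink_iolock() and xrep_dirtree_adopt_iolock() use the same two-lock acquisition dance. A pthread-based sketch of the pattern (minus the kill-signal check), with invented names:

#include <pthread.h>
#include <sched.h>

/*
 * 'held' is locked on entry; returns with both locks held. If the second
 * lock is contended, drop the first and retry in the opposite order so
 * that two threads contending on the same pair cannot ABBA-deadlock.
 */
static void lock_second(pthread_mutex_t *held, pthread_mutex_t *other)
{
	if (pthread_mutex_trylock(other) == 0)
		return;				/* fast path */

	pthread_mutex_unlock(held);
	for (;;) {
		pthread_mutex_lock(other);	/* sleep on the contended lock */
		if (pthread_mutex_trylock(held) == 0)
			return;
		pthread_mutex_unlock(other);
		sched_yield();			/* the kernel sleeps a jiffy */
	}
}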
+ */ + error = xrep_trans_commit(sc); + goto out_ilock; + +out_trans: + xchk_trans_cancel(sc); +out_ilock: + xchk_iunlock(sc, XFS_ILOCK_EXCL); + xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); +out_iolock: + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + return error; +} + +/* + * This newly orphaned directory needs to be adopted by the orphanage. + * Make this happen. + */ +STATIC int +xrep_dirtree_move_to_orphanage( + struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + int error; + + /* + * Start by dropping all the resources that we hold so that we can grab + * all the resources that we need for the adoption. + */ + mutex_unlock(&dl->lock); + xchk_trans_cancel(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* Perform the adoption. */ + error = xrep_dirtree_adopt(dl); + + /* + * Retake all the resources we had at the beginning even if the repair + * failed or the scan data are now stale. This keeps things simple for + * the caller. + */ + xchk_trans_alloc_empty(sc); + xchk_ilock(sc, XFS_ILOCK_EXCL); + mutex_lock(&dl->lock); + + if (!error && dl->stale) + error = -ESTALE; + return error; +} + +/* + * Try to fix all the problems. Returns -ESTALE if the scan data have become + * out of date. + */ +STATIC int +xrep_dirtree_fix_problems( + struct xchk_dirtree *dl, + struct xchk_dirtree_outcomes *oc) +{ + struct xchk_dirpath *path; + int error; + + /* Delete all the paths we don't want. */ + xchk_dirtree_for_each_path(dl, path) { + if (path->outcome != XCHK_DIRPATH_DELETE) + continue; + + error = xrep_dirtree_delete_path(dl, path); + if (error) + return error; + } + + /* Reparent this directory to the orphanage. */ + if (oc->needs_adoption) { + if (xrep_orphanage_can_adopt(dl->sc)) + return xrep_dirtree_move_to_orphanage(dl); + return -EFSCORRUPTED; + } + + return 0; +} + +/* Fix directory loops involving this directory. */ +int +xrep_dirtree( + struct xfs_scrub *sc) +{ + struct xchk_dirtree *dl = sc->buf; + struct xchk_dirtree_outcomes oc; + int error; + + /* + * Prepare to fix the directory tree by retaking the scan lock. The + * order of resource acquisition is still IOLOCK -> transaction -> + * ILOCK -> scan lock. + */ + mutex_lock(&dl->lock); + do { + /* + * Decide what we're going to do, then do it. An -ESTALE + * return here means the scan results are invalid and we have + * to walk again. + */ + if (!dl->stale) { + xrep_dirtree_decide_fate(dl, &oc); + + trace_xrep_dirtree_decided_fate(dl, &oc); + + error = xrep_dirtree_fix_problems(dl, &oc); + if (!error || error != -ESTALE) + break; + } + error = xchk_dirtree_find_paths_to_root(dl); + if (error == -ELNRNG || error == -ENOSR) + error = -EFSCORRUPTED; + } while (!error); + mutex_unlock(&dl->lock); + + return error; +} diff --git a/fs/xfs/scrub/findparent.c b/fs/xfs/scrub/findparent.c new file mode 100644 index 000000000000..01766041ba2c --- /dev/null +++ b/fs/xfs/scrub/findparent.c @@ -0,0 +1,454 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_bmap_btree.h" +#include "xfs_dir2_priv.h" +#include "xfs_trans_space.h" +#include "xfs_health.h" +#include "xfs_exchmaps.h" +#include "xfs_parent.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/iscan.h" +#include "scrub/findparent.h" +#include "scrub/readdir.h" +#include "scrub/tempfile.h" +#include "scrub/listxattr.h" + +/* + * Finding the Parent of a Directory + * ================================= + * + * Directories have parent pointers, in the sense that each directory contains + * a dotdot entry that points to the single allowed parent. The brute force + * way to find the parent of a given directory is to scan every directory in + * the filesystem looking for a child dirent that references this directory. + * + * This module wraps the process of scanning the directory tree. It requires + * that @sc->ip is the directory whose parent we want to find, and that the + * caller hold only the IOLOCK on that directory. The scan itself needs to + * take the ILOCK of each directory visited. + * + * Because we cannot hold @sc->ip's ILOCK during a scan of the whole fs, it is + * necessary to use dirent hook to update the parent scan results. Callers + * must not read the scan results without re-taking @sc->ip's ILOCK. + * + * There are a few shortcuts that we can take to avoid scanning the entire + * filesystem, such as noticing directory tree roots and querying the dentry + * cache for parent information. + */ + +struct xrep_findparent_info { + /* The directory currently being scanned. */ + struct xfs_inode *dp; + + /* + * Scrub context. We're looking for a @dp containing a directory + * entry pointing to sc->ip->i_ino. + */ + struct xfs_scrub *sc; + + /* Optional scan information for a xrep_findparent_scan call. */ + struct xrep_parent_scan_info *parent_scan; + + /* + * Parent that we've found for sc->ip. If we're scanning the entire + * directory tree, we need this to ensure that we only find /one/ + * parent directory. + */ + xfs_ino_t found_parent; + + /* + * This is set to true if @found_parent was not observed directly from + * the directory scan but by noticing a change in dotdot entries after + * cycling the sc->ip IOLOCK. + */ + bool parent_tentative; +}; + +/* + * If this directory entry points to the scrub target inode, then the directory + * we're scanning is the parent of the scrub target inode. + */ +STATIC int +xrep_findparent_dirent( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xrep_findparent_info *fpi = priv; + int error = 0; + + if (xchk_should_terminate(fpi->sc, &error)) + return error; + + if (ino != fpi->sc->ip->i_ino) + return 0; + + /* Ignore garbage directory entry names. */ + if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) + return -EFSCORRUPTED; + + /* + * Ignore dotdot and dot entries -- we're looking for parent -> child + * links only. + */ + if (name->name[0] == '.' 
&& (name->len == 1 || + (name->len == 2 && name->name[1] == '.'))) + return 0; + + /* Uhoh, more than one parent for a dir? */ + if (fpi->found_parent != NULLFSINO && + !(fpi->parent_tentative && fpi->found_parent == fpi->dp->i_ino)) { + trace_xrep_findparent_dirent(fpi->sc->ip, 0); + return -EFSCORRUPTED; + } + + /* We found a potential parent; remember this. */ + trace_xrep_findparent_dirent(fpi->sc->ip, fpi->dp->i_ino); + fpi->found_parent = fpi->dp->i_ino; + fpi->parent_tentative = false; + + if (fpi->parent_scan) + xrep_findparent_scan_found(fpi->parent_scan, fpi->dp->i_ino); + + return 0; +} + +/* + * If this is a directory, walk the dirents looking for any that point to the + * scrub target inode. + */ +STATIC int +xrep_findparent_walk_directory( + struct xrep_findparent_info *fpi) +{ + struct xfs_scrub *sc = fpi->sc; + struct xfs_inode *dp = fpi->dp; + unsigned int lock_mode; + int error = 0; + + /* + * The inode being scanned cannot be its own parent, nor can any + * temporary directory we created to stage this repair. + */ + if (dp == sc->ip || dp == sc->tempip) + return 0; + + /* + * Similarly, temporary files created to stage a repair cannot be the + * parent of this inode. + */ + if (xrep_is_tempfile(dp)) + return 0; + + /* + * Scan the directory to see if it contains an entry pointing to + * the directory that we are repairing. + */ + lock_mode = xfs_ilock_data_map_shared(dp); + + /* + * If this directory is known to be sick, we cannot scan it reliably + * and must abort. + */ + if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE | + XFS_SICK_INO_BMBTD | + XFS_SICK_INO_DIR)) { + error = -EFSCORRUPTED; + goto out_unlock; + } + + /* + * We cannot complete our parent pointer scan if a directory looks as + * though it has been zapped by the inode record repair code. + */ + if (xchk_dir_looks_zapped(dp)) { + error = -EBUSY; + goto out_unlock; + } + + error = xchk_dir_walk(sc, dp, xrep_findparent_dirent, fpi); + if (error) + goto out_unlock; + +out_unlock: + xfs_iunlock(dp, lock_mode); + return error; +} + +/* + * Update this directory's dotdot pointer based on ongoing dirent updates. + */ +STATIC int +xrep_findparent_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xrep_parent_scan_info *pscan; + struct xfs_scrub *sc; + + pscan = container_of(nb, struct xrep_parent_scan_info, + dhook.dirent_hook.nb); + sc = pscan->sc; + + /* + * If @p->ip is the subdirectory that we're interested in and we've + * already scanned @p->dp, update the dotdot target inumber to the + * parent inode. + */ + if (p->ip->i_ino == sc->ip->i_ino && + xchk_iscan_want_live_update(&pscan->iscan, p->dp->i_ino)) { + if (p->delta > 0) { + xrep_findparent_scan_found(pscan, p->dp->i_ino); + } else { + xrep_findparent_scan_found(pscan, NULLFSINO); + } + } + + return NOTIFY_DONE; +} + +/* + * Set up a scan to find the parent of a directory. The provided dirent hook + * will be called when there is a dotdot update for the inode being repaired. + */ +int +__xrep_findparent_scan_start( + struct xfs_scrub *sc, + struct xrep_parent_scan_info *pscan, + notifier_fn_t custom_fn) +{ + int error; + + if (!(sc->flags & XCHK_FSGATES_DIRENTS)) { + ASSERT(sc->flags & XCHK_FSGATES_DIRENTS); + return -EINVAL; + } + + pscan->sc = sc; + pscan->parent_ino = NULLFSINO; + + mutex_init(&pscan->lock); + + xchk_iscan_start(sc, 30000, 100, &pscan->iscan); + + /* + * Hook into the dirent update code. 
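The live-update rule in xrep_findparent_live_update() is easy to state on its own: a dirent creation for the target overwrites the found parent, a removal clears it, and updates to unscanned directories are ignored because the scan will reach them anyway. A hedged sketch (all names invented):

#include <stdbool.h>
#include <stdint.h>

#define NULLFSINO ((uint64_t)-1)	/* stand-in sentinel */

struct parent_scan {
	uint64_t target_ino;	/* directory whose parent we seek */
	uint64_t parent_ino;	/* current best answer */
};

/* already_visited stands in for xchk_iscan_want_live_update(). */
static void on_dirent_update(struct parent_scan *ps, uint64_t parent_dir,
			     uint64_t child_ino, int delta,
			     bool already_visited)
{
	if (child_ino != ps->target_ino || !already_visited)
		return;		/* the scan proper will observe this */
	ps->parent_ino = delta > 0 ? parent_dir : NULLFSINO;
}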
The hook only operates on inodes + * that were already scanned, and the scanner thread takes each inode's + * ILOCK, which means that any in-progress inode updates will finish + * before we can scan the inode. + */ + if (custom_fn) + xfs_dir_hook_setup(&pscan->dhook, custom_fn); + else + xfs_dir_hook_setup(&pscan->dhook, xrep_findparent_live_update); + error = xfs_dir_hook_add(sc->mp, &pscan->dhook); + if (error) + goto out_iscan; + + return 0; +out_iscan: + xchk_iscan_teardown(&pscan->iscan); + mutex_destroy(&pscan->lock); + return error; +} + +/* + * Scan the entire filesystem looking for a parent inode for the inode being + * scrubbed. @sc->ip must not be the root of a directory tree. Callers must + * not hold a dirty transaction or any lock that would interfere with taking + * an ILOCK. + * + * Returns 0 with @pscan->parent_ino set to the parent that we found. + * Returns 0 with @pscan->parent_ino set to NULLFSINO if we found no parents. + * Returns the usual negative errno if something else happened. + */ +int +xrep_findparent_scan( + struct xrep_parent_scan_info *pscan) +{ + struct xrep_findparent_info fpi = { + .sc = pscan->sc, + .found_parent = NULLFSINO, + .parent_scan = pscan, + }; + struct xfs_scrub *sc = pscan->sc; + int ret; + + ASSERT(S_ISDIR(VFS_IC(sc->ip)->i_mode)); + + while ((ret = xchk_iscan_iter(&pscan->iscan, &fpi.dp)) == 1) { + if (S_ISDIR(VFS_I(fpi.dp)->i_mode)) + ret = xrep_findparent_walk_directory(&fpi); + else + ret = 0; + xchk_iscan_mark_visited(&pscan->iscan, fpi.dp); + xchk_irele(sc, fpi.dp); + if (ret) + break; + + if (xchk_should_terminate(sc, &ret)) + break; + } + xchk_iscan_iter_finish(&pscan->iscan); + + return ret; +} + +/* Tear down a parent scan. */ +void +xrep_findparent_scan_teardown( + struct xrep_parent_scan_info *pscan) +{ + xfs_dir_hook_del(pscan->sc->mp, &pscan->dhook); + xchk_iscan_teardown(&pscan->iscan); + mutex_destroy(&pscan->lock); +} + +/* Finish a parent scan early. */ +void +xrep_findparent_scan_finish_early( + struct xrep_parent_scan_info *pscan, + xfs_ino_t ino) +{ + xrep_findparent_scan_found(pscan, ino); + xchk_iscan_finish_early(&pscan->iscan); +} + +/* + * Confirm that the directory @parent_ino actually contains a directory entry + * pointing to the child @sc->ip->ino. This function returns one of several + * ways: + * + * Returns 0 with @parent_ino unchanged if the parent was confirmed. + * Returns 0 with @parent_ino set to NULLFSINO if the parent was not valid. + * Returns the usual negative errno if something else happened. + */ +int +xrep_findparent_confirm( + struct xfs_scrub *sc, + xfs_ino_t *parent_ino) +{ + struct xrep_findparent_info fpi = { + .sc = sc, + .found_parent = NULLFSINO, + }; + int error; + + /* + * The root directory always points to itself. Unlinked dirs can point + * anywhere, so we point them at the root dir too. + */ + if (sc->ip == sc->mp->m_rootip || VFS_I(sc->ip)->i_nlink == 0) { + *parent_ino = sc->mp->m_sb.sb_rootino; + return 0; + } + + /* Reject garbage parent inode numbers and self-referential parents. 
*/ + if (*parent_ino == NULLFSINO) + return 0; + if (!xfs_verify_dir_ino(sc->mp, *parent_ino) || + *parent_ino == sc->ip->i_ino) { + *parent_ino = NULLFSINO; + return 0; + } + + error = xchk_iget(sc, *parent_ino, &fpi.dp); + if (error) + return error; + + if (!S_ISDIR(VFS_I(fpi.dp)->i_mode)) { + *parent_ino = NULLFSINO; + goto out_rele; + } + + error = xrep_findparent_walk_directory(&fpi); + if (error) + goto out_rele; + + *parent_ino = fpi.found_parent; +out_rele: + xchk_irele(sc, fpi.dp); + return error; +} + +/* + * If we're the root of a directory tree, we are our own parent. If we're an + * unlinked directory, the parent /won't/ have a link to us. Set the parent + * directory to the root for both cases. Returns NULLFSINO if we don't know + * what to do. + */ +xfs_ino_t +xrep_findparent_self_reference( + struct xfs_scrub *sc) +{ + if (sc->ip->i_ino == sc->mp->m_sb.sb_rootino) + return sc->mp->m_sb.sb_rootino; + + if (VFS_I(sc->ip)->i_nlink == 0) + return sc->mp->m_sb.sb_rootino; + + return NULLFSINO; +} + +/* Check the dentry cache to see if it knows of a parent for the scrub target. */ +xfs_ino_t +xrep_findparent_from_dcache( + struct xfs_scrub *sc) +{ + struct inode *pip = NULL; + struct dentry *dentry, *parent; + xfs_ino_t ret = NULLFSINO; + + dentry = d_find_alias(VFS_I(sc->ip)); + if (!dentry) + goto out; + + parent = dget_parent(dentry); + if (!parent) + goto out_dput; + + ASSERT(parent->d_sb == sc->ip->i_mount->m_super); + + pip = igrab(d_inode(parent)); + dput(parent); + + if (S_ISDIR(pip->i_mode)) { + trace_xrep_findparent_from_dcache(sc->ip, XFS_I(pip)->i_ino); + ret = XFS_I(pip)->i_ino; + } + + xchk_irele(sc, XFS_I(pip)); + +out_dput: + dput(dentry); +out: + return ret; +} diff --git a/fs/xfs/scrub/findparent.h b/fs/xfs/scrub/findparent.h new file mode 100644 index 000000000000..d998c7a88152 --- /dev/null +++ b/fs/xfs/scrub/findparent.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_FINDPARENT_H__ +#define __XFS_SCRUB_FINDPARENT_H__ + +struct xrep_parent_scan_info { + struct xfs_scrub *sc; + + /* Inode scan cursor. */ + struct xchk_iscan iscan; + + /* Hook to capture directory entry updates. */ + struct xfs_dir_hook dhook; + + /* Lock protecting parent_ino. */ + struct mutex lock; + + /* Parent inode that we've found. 
*/ + xfs_ino_t parent_ino; + + bool lookup_parent; +}; + +int __xrep_findparent_scan_start(struct xfs_scrub *sc, + struct xrep_parent_scan_info *pscan, + notifier_fn_t custom_fn); +static inline int xrep_findparent_scan_start(struct xfs_scrub *sc, + struct xrep_parent_scan_info *pscan) +{ + return __xrep_findparent_scan_start(sc, pscan, NULL); +} +int xrep_findparent_scan(struct xrep_parent_scan_info *pscan); +void xrep_findparent_scan_teardown(struct xrep_parent_scan_info *pscan); + +static inline void +xrep_findparent_scan_found( + struct xrep_parent_scan_info *pscan, + xfs_ino_t ino) +{ + mutex_lock(&pscan->lock); + pscan->parent_ino = ino; + mutex_unlock(&pscan->lock); +} + +void xrep_findparent_scan_finish_early(struct xrep_parent_scan_info *pscan, + xfs_ino_t ino); + +int xrep_findparent_confirm(struct xfs_scrub *sc, xfs_ino_t *parent_ino); + +xfs_ino_t xrep_findparent_self_reference(struct xfs_scrub *sc); +xfs_ino_t xrep_findparent_from_dcache(struct xfs_scrub *sc); + +#endif /* __XFS_SCRUB_FINDPARENT_H__ */ diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index d310737c8823..1d3e98346933 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -85,7 +85,7 @@ xchk_fscount_warmup( continue; /* Lock both AG headers. */ - error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp); + error = xfs_ialloc_read_agi(pag, sc->tp, 0, &agi_bp); if (error) break; error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp); @@ -412,10 +412,11 @@ xchk_fscount_count_frextents( int error; fsc->frextents = 0; + fsc->frextents_delayed = 0; if (!xfs_has_realtime(mp)) return 0; - xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xfs_rtbitmap_lock_shared(sc->mp, XFS_RBMLOCK_BITMAP); error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_fscount_add_frextent, fsc); if (error) { @@ -423,8 +424,10 @@ xchk_fscount_count_frextents( goto out_unlock; } + fsc->frextents_delayed = percpu_counter_sum(&mp->m_delalloc_rtextents); + out_unlock: - xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xfs_rtbitmap_unlock_shared(sc->mp, XFS_RBMLOCK_BITMAP); return error; } #else @@ -434,6 +437,7 @@ xchk_fscount_count_frextents( struct xchk_fscounters *fsc) { fsc->frextents = 0; + fsc->frextents_delayed = 0; return 0; } #endif /* CONFIG_XFS_RT */ @@ -517,7 +521,7 @@ xchk_fscounters( /* * If the filesystem is not frozen, the counter summation calls above - * can race with xfs_mod_freecounter, which subtracts a requested space + * can race with xfs_dec_freecounter, which subtracts a requested space * reservation from the counter and undoes the subtraction if that made * the counter go negative. 
Therefore, it's possible to see negative + * values here, and we should only flag that as a corruption if we @@ -593,7 +597,7 @@ xchk_fscounters( } if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, - fsc->frextents)) { + fsc->frextents - fsc->frextents_delayed)) { if (fsc->frozen) xchk_set_corrupt(sc); else diff --git a/fs/xfs/scrub/fscounters.h b/fs/xfs/scrub/fscounters.h index 461a13d25f4b..bcf56e1c36f9 100644 --- a/fs/xfs/scrub/fscounters.h +++ b/fs/xfs/scrub/fscounters.h @@ -12,6 +12,7 @@ struct xchk_fscounters { uint64_t ifree; uint64_t fdblocks; uint64_t frextents; + uint64_t frextents_delayed; unsigned long long icount_min; unsigned long long icount_max; bool frozen; diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c index 94cdb852bee4..469bf645dbea 100644 --- a/fs/xfs/scrub/fscounters_repair.c +++ b/fs/xfs/scrub/fscounters_repair.c @@ -65,7 +65,17 @@ xrep_fscounters( percpu_counter_set(&mp->m_icount, fsc->icount); percpu_counter_set(&mp->m_ifree, fsc->ifree); percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks); - percpu_counter_set(&mp->m_frextents, fsc->frextents); + + /* + * Online repair is only supported on v5 file systems, which require + * lazy sb counters and thus no update of sb_fdblocks here. But as of + * now we don't support lazy counting sb_frextents yet, and thus need + * to also update it directly here. And for that we need to keep + * track of the delalloc reservations separately, as they are + * subtracted from m_frextents, but not included in sb_frextents. + */ + percpu_counter_set(&mp->m_frextents, + fsc->frextents - fsc->frextents_delayed); mp->m_sb.sb_frextents = fsc->frextents; return 0; diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 9020a6bef7f1..b712a8bd34f5 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -108,6 +108,7 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS }, [XFS_SCRUB_TYPE_QUOTACHECK] = { XHG_FS, XFS_SICK_FS_QUOTACHECK }, [XFS_SCRUB_TYPE_NLINKS] = { XHG_FS, XFS_SICK_FS_NLINKS }, + [XFS_SCRUB_TYPE_DIRTREE] = { XHG_INO, XFS_SICK_INO_DIRTREE }, }; /* Return the health status mask for this scrub type. */ diff --git a/fs/xfs/scrub/ino_bitmap.h b/fs/xfs/scrub/ino_bitmap.h new file mode 100644 index 000000000000..1300833679ab --- /dev/null +++ b/fs/xfs/scrub/ino_bitmap.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> */ +#ifndef __XFS_SCRUB_INO_BITMAP_H__ +#define __XFS_SCRUB_INO_BITMAP_H__ + +/* Bitmaps, but type-checked for xfs_ino_t */ + +struct xino_bitmap { + struct xbitmap64 inobitmap; +}; + +static inline void xino_bitmap_init(struct xino_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->inobitmap); +} + +static inline void xino_bitmap_destroy(struct xino_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->inobitmap); +} + +static inline int xino_bitmap_set(struct xino_bitmap *bitmap, xfs_ino_t ino) +{ + return xbitmap64_set(&bitmap->inobitmap, ino, 1); +} + +static inline int xino_bitmap_test(struct xino_bitmap *bitmap, xfs_ino_t ino) +{ + uint64_t len = 1; + + return xbitmap64_test(&bitmap->inobitmap, ino, &len); +} + +#endif /* __XFS_SCRUB_INO_BITMAP_H__ */ diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 6e2fe2d6250b..d32716fb2fec 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -739,6 +739,23 @@ xchk_inode_check_reflink_iflag( xchk_ino_set_corrupt(sc, ino); } +/* + * If this inode has zero link count, it must be on the unlinked list. If + * it has nonzero link count, it must not be on the unlinked list. + */ +STATIC void +xchk_inode_check_unlinked( + struct xfs_scrub *sc) +{ + if (VFS_I(sc->ip)->i_nlink == 0) { + if (!xfs_inode_on_unlinked_list(sc->ip)) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } else { + if (xfs_inode_on_unlinked_list(sc->ip)) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } +} + /* Scrub an inode. */ int xchk_inode( @@ -771,6 +788,8 @@ xchk_inode( if (S_ISREG(VFS_I(sc->ip)->i_mode)) xchk_inode_check_reflink_iflag(sc, sc->ip->i_ino); + xchk_inode_check_unlinked(sc); + xchk_inode_xref(sc, sc->ip->i_ino, &di); out: return error; diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index eab380e95ef4..daf9f1ee7c2c 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -46,6 +46,7 @@ #include "scrub/repair.h" #include "scrub/iscan.h" #include "scrub/readdir.h" +#include "scrub/tempfile.h" /* * Inode Record Repair @@ -282,6 +283,51 @@ xrep_dinode_findmode_dirent( return 0; } +/* Try to lock a directory, or wait a jiffy. */ +static inline int +xrep_dinode_ilock_nowait( + struct xfs_inode *dp, + unsigned int lock_mode) +{ + if (xfs_ilock_nowait(dp, lock_mode)) + return true; + + schedule_timeout_killable(1); + return false; +} + +/* + * Try to lock a directory to look for ftype hints. Since we already hold the + * AGI buffer, we cannot block waiting for the ILOCK because rename can take + * the ILOCK and then try to lock AGIs. + */ +STATIC int +xrep_dinode_trylock_directory( + struct xrep_inode *ri, + struct xfs_inode *dp, + unsigned int *lock_modep) +{ + unsigned long deadline = jiffies + msecs_to_jiffies(30000); + unsigned int lock_mode; + int error = 0; + + do { + if (xchk_should_terminate(ri->sc, &error)) + return error; + + if (xfs_need_iread_extents(&dp->i_df)) + lock_mode = XFS_ILOCK_EXCL; + else + lock_mode = XFS_ILOCK_SHARED; + + if (xrep_dinode_ilock_nowait(dp, lock_mode)) { + *lock_modep = lock_mode; + return 0; + } + } while (!time_is_before_jiffies(deadline)); + return -EBUSY; +} + /* * If this is a directory, walk the dirents looking for any that point to the * scrub target inode. @@ -295,11 +341,17 @@ xrep_dinode_findmode_walk_directory( unsigned int lock_mode; int error = 0; + /* Ignore temporary repair directories. */ + if (xrep_is_tempfile(dp)) + return 0; + /* * Scan the directory to see if it contains an entry pointing to * the directory that we are repairing. 
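The bounded trylock in xrep_dinode_trylock_directory() above is a general pattern: poll with short naps until a deadline instead of blocking, because blocking on the ILOCK while holding the AGI could deadlock against rename. A user-space sketch under that assumption (invented names):

#include <pthread.h>
#include <stdbool.h>
#include <time.h>

static bool trylock_with_deadline(pthread_mutex_t *m, unsigned int secs)
{
	struct timespec nap = { .tv_nsec = 1000000 };	/* ~1ms "jiffy" */
	time_t deadline = time(NULL) + secs;

	do {
		if (pthread_mutex_trylock(m) == 0)
			return true;
		nanosleep(&nap, NULL);		/* nap, then try again */
	} while (time(NULL) <= deadline);
	return false;				/* -EBUSY in the kernel */
}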
*/ - lock_mode = xfs_ilock_data_map_shared(dp); + error = xrep_dinode_trylock_directory(ri, dp, &lock_mode); + if (error) + return error; /* * If this directory is known to be sick, we cannot scan it reliably @@ -356,6 +408,7 @@ xrep_dinode_find_mode( * so there's a real possibility that _iscan_iter can return EBUSY. */ xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan); + xchk_iscan_set_agi_trylock(&ri->ftype_iscan); ri->ftype_iscan.skip_ino = sc->sm->sm_ino; ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN; while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) { @@ -463,6 +516,17 @@ xrep_dinode_mode( return 0; } +/* Fix unused link count fields having nonzero values. */ +STATIC void +xrep_dinode_nlinks( + struct xfs_dinode *dip) +{ + if (dip->di_version > 1) + dip->di_onlink = 0; + else + dip->di_nlink = 0; +} + /* Fix any conflicting flags that the verifiers complain about. */ STATIC void xrep_dinode_flags( @@ -1324,6 +1388,7 @@ xrep_dinode_core( iget_error = xrep_dinode_mode(ri, dip); if (iget_error) goto write; + xrep_dinode_nlinks(dip); xrep_dinode_flags(sc, dip, ri->rt_extents > 0); xrep_dinode_size(ri, dip); xrep_dinode_extsize_hints(sc, dip); @@ -1671,6 +1736,44 @@ xrep_inode_extsize( } } +/* Ensure this file has an attr fork if it needs to hold a parent pointer. */ +STATIC int +xrep_inode_pptr( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + struct inode *inode = VFS_I(ip); + + if (!xfs_has_parent(mp)) + return 0; + + /* + * Unlinked inodes that cannot be added to the directory tree will not + * have a parent pointer. + */ + if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + return 0; + + /* The root directory doesn't have a parent pointer. */ + if (ip == mp->m_rootip) + return 0; + + /* + * Metadata inodes are rooted in the superblock and do not have any + * parents. + */ + if (xfs_is_metadata_inode(ip)) + return 0; + + /* Inode already has an attr fork; no further work possible here. */ + if (xfs_inode_has_attr_fork(ip)) + return 0; + + return xfs_bmap_add_attrfork(sc->tp, ip, + sizeof(struct xfs_attr_sf_hdr), true); +} + /* Fix any irregularities in an inode that the verifiers don't catch. */ STATIC int xrep_inode_problems( @@ -1681,6 +1784,9 @@ xrep_inode_problems( error = xrep_inode_blockcounts(sc); if (error) return error; + error = xrep_inode_pptr(sc); + if (error) + return error; xrep_inode_timestamps(sc->ip); xrep_inode_flags(sc); xrep_inode_ids(sc); @@ -1697,6 +1803,46 @@ xrep_inode_problems( return xrep_roll_trans(sc); } +/* + * Make sure this inode's unlinked list pointers are consistent with its + * link count. + */ +STATIC int +xrep_inode_unlinked( + struct xfs_scrub *sc) +{ + unsigned int nlink = VFS_I(sc->ip)->i_nlink; + int error; + + /* + * If this inode is linked from the directory tree and on the unlinked + * list, remove it from the unlinked list. + */ + if (nlink > 0 && xfs_inode_on_unlinked_list(sc->ip)) { + struct xfs_perag *pag; + + pag = xfs_perag_get(sc->mp, + XFS_INO_TO_AGNO(sc->mp, sc->ip->i_ino)); + error = xfs_iunlink_remove(sc->tp, pag, sc->ip); + xfs_perag_put(pag); + if (error) + return error; + } + + /* + * If this inode is not linked from the directory tree but is not on + * the unlinked list either, put it on the unlinked list. + */ + if (nlink == 0 && !xfs_inode_on_unlinked_list(sc->ip)) { + error = xfs_iunlink(sc->tp, sc->ip); + if (error) + return error; + } + + return 0; +} + /* Repair an inode's fields. 
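The invariant checked by xchk_inode_check_unlinked() and restored by xrep_inode_unlinked() fits in one predicate, sketched standalone (names invented):

#include <stdbool.h>

/* An inode must be on the unlinked list exactly when its nlink is zero. */
static bool unlinked_list_consistent(unsigned int nlink, bool on_unlinked)
{
	return (nlink == 0) == on_unlinked;
}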
*/ int xrep_inode( @@ -1746,5 +1892,10 @@ xrep_inode( return error; } + /* Reconnect incore unlinked list */ + error = xrep_inode_unlinked(sc); + if (error) + return error; + return xrep_defer_finish(sc); } diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c index ec3478bc505e..cf9d983667ce 100644 --- a/fs/xfs/scrub/iscan.c +++ b/fs/xfs/scrub/iscan.c @@ -243,6 +243,51 @@ xchk_iscan_finish( mutex_unlock(&iscan->lock); } +/* Mark an inode scan finished before we actually scan anything. */ +void +xchk_iscan_finish_early( + struct xchk_iscan *iscan) +{ + ASSERT(iscan->cursor_ino == iscan->scan_start_ino); + ASSERT(iscan->__visited_ino == iscan->scan_start_ino); + + xchk_iscan_finish(iscan); +} + +/* + * Grab the AGI to advance the inode scan. Returns 0 if *agi_bpp is now set, + * -ECANCELED if the live scan aborted, -EBUSY if the AGI could not be grabbed, + * or the usual negative errno. + */ +STATIC int +xchk_iscan_read_agi( + struct xchk_iscan *iscan, + struct xfs_perag *pag, + struct xfs_buf **agi_bpp) +{ + struct xfs_scrub *sc = iscan->sc; + unsigned long relax; + int ret; + + if (!xchk_iscan_agi_needs_trylock(iscan)) + return xfs_ialloc_read_agi(pag, sc->tp, 0, agi_bpp); + + relax = msecs_to_jiffies(iscan->iget_retry_delay); + do { + ret = xfs_ialloc_read_agi(pag, sc->tp, XFS_IALLOC_FLAG_TRYLOCK, + agi_bpp); + if (ret != -EAGAIN) + return ret; + if (!iscan->iget_timeout || + time_is_before_jiffies(iscan->__iget_deadline)) + return -EBUSY; + + trace_xchk_iscan_agi_retry_wait(iscan); + } while (!schedule_timeout_killable(relax) && + !xchk_iscan_aborted(iscan)); + return -ECANCELED; +} + /* * Advance ino to the next inode that the inobt thinks is allocated, being * careful to jump to the next AG if we've reached the right end of this AG's @@ -281,7 +326,7 @@ xchk_iscan_advance( if (!pag) return -ECANCELED; - ret = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp); + ret = xchk_iscan_read_agi(iscan, pag, &agi_bp); if (ret) goto out_pag; @@ -363,6 +408,15 @@ xchk_iscan_iget_retry( } /* + * For an inode scan, we hold the AGI and want to try to grab a batch of + * inodes. Holding the AGI prevents inodegc from clearing freed inodes, + * so we must use noretry here. For every inode after the first one in the + * batch, we don't want to wait, so we use noretry there too. Finally, use + * dontcache to avoid polluting the cache. + */ +#define ISCAN_IGET_FLAGS (XFS_IGET_NORETRY | XFS_IGET_DONTCACHE) + +/* * Grab an inode as part of an inode scan. While scanning this inode, the * caller must ensure that no other threads can modify the inode until a call * to xchk_iscan_visit succeeds. @@ -389,7 +443,7 @@ xchk_iscan_iget( ASSERT(iscan->__inodes[0] == NULL); /* Fill the first slot in the inode array. */ - error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0, + error = xfs_iget(sc->mp, sc->tp, ino, ISCAN_IGET_FLAGS, 0, &iscan->__inodes[idx]); trace_xchk_iscan_iget(iscan, error); @@ -402,8 +456,13 @@ xchk_iscan_iget( * It's possible that this inode has lost all of its links but * hasn't yet been inactivated. If we don't have a transaction * or it's not writable, flush the inodegc workers and wait. + * If we have a non-empty transaction, we must not block on + * inodegc, which allocates its own transactions. 
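The push-versus-flush choice made right below can be sketched as a standalone decision; the two helpers here are empty stand-ins for xfs_inodegc_push and xfs_inodegc_flush, not their real implementations:

#include <stdbool.h>

static void inodegc_push(void) { /* kick the gc workers, do not wait */ }
static void inodegc_flush(void) { /* kick the gc workers and wait */ }

/*
 * A caller holding a dirty transaction must not wait: the gc workers
 * allocate transactions of their own, so waiting could deadlock.
 */
static void kick_inodegc(bool have_trans, bool trans_is_empty)
{
	if (have_trans && !trans_is_empty)
		inodegc_push();
	else
		inodegc_flush();
}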
*/ - xfs_inodegc_flush(mp); + if (sc->tp && !(sc->tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) + xfs_inodegc_push(mp); + else + xfs_inodegc_flush(mp); return xchk_iscan_iget_retry(iscan, true); } @@ -457,7 +516,7 @@ xchk_iscan_iget( ASSERT(iscan->__inodes[idx] == NULL); - error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0, + error = xfs_iget(sc->mp, sc->tp, ino, ISCAN_IGET_FLAGS, 0, &iscan->__inodes[idx]); if (error) break; diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h index 71f657552dfa..f9f47fa01a9e 100644 --- a/fs/xfs/scrub/iscan.h +++ b/fs/xfs/scrub/iscan.h @@ -59,6 +59,9 @@ struct xchk_iscan { /* Set if the scan has been aborted due to some event in the fs. */ #define XCHK_ISCAN_OPSTATE_ABORTED (1) +/* Use trylock to acquire the AGI */ +#define XCHK_ISCAN_OPSTATE_TRYLOCK_AGI (2) + static inline bool xchk_iscan_aborted(const struct xchk_iscan *iscan) { @@ -71,8 +74,21 @@ xchk_iscan_abort(struct xchk_iscan *iscan) set_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); } +static inline bool +xchk_iscan_agi_needs_trylock(const struct xchk_iscan *iscan) +{ + return test_bit(XCHK_ISCAN_OPSTATE_TRYLOCK_AGI, &iscan->__opstate); +} + +static inline void +xchk_iscan_set_agi_trylock(struct xchk_iscan *iscan) +{ + set_bit(XCHK_ISCAN_OPSTATE_TRYLOCK_AGI, &iscan->__opstate); +} + void xchk_iscan_start(struct xfs_scrub *sc, unsigned int iget_timeout, unsigned int iget_retry_delay, struct xchk_iscan *iscan); +void xchk_iscan_finish_early(struct xchk_iscan *iscan); void xchk_iscan_teardown(struct xchk_iscan *iscan); int xchk_iscan_iter(struct xchk_iscan *iscan, struct xfs_inode **ipp); diff --git a/fs/xfs/scrub/listxattr.c b/fs/xfs/scrub/listxattr.c new file mode 100644 index 000000000000..256ff7700c94 --- /dev/null +++ b/fs/xfs/scrub/listxattr.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_sf.h" +#include "xfs_trans.h" +#include "scrub/scrub.h" +#include "scrub/bitmap.h" +#include "scrub/dab_bitmap.h" +#include "scrub/listxattr.h" + +/* Call a function for every entry in a shortform xattr structure. */ +STATIC int +xchk_xattr_walk_sf( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + void *priv) +{ + struct xfs_attr_sf_hdr *hdr = ip->i_af.if_data; + struct xfs_attr_sf_entry *sfe; + unsigned int i; + int error; + + sfe = xfs_attr_sf_firstentry(hdr); + for (i = 0; i < hdr->count; i++) { + error = attr_fn(sc, ip, sfe->flags, sfe->nameval, sfe->namelen, + &sfe->nameval[sfe->namelen], sfe->valuelen, + priv); + if (error) + return error; + + sfe = xfs_attr_sf_nextentry(sfe); + } + + return 0; +} + +/* Call a function for every entry in this xattr leaf block. 
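The opstate flags added to struct xchk_iscan above follow the kernel's lock-free set_bit()/test_bit() idiom. A C11-atomics sketch of the same pattern (bit numbers arbitrary, names invented):

#include <stdatomic.h>
#include <stdbool.h>

enum { OP_ABORTED = 1, OP_TRYLOCK_AGI = 2 };	/* bit numbers */

static void opstate_set(atomic_ulong *state, unsigned int bit)
{
	atomic_fetch_or(state, 1UL << bit);
}

static bool opstate_test(atomic_ulong *state, unsigned int bit)
{
	return atomic_load(state) & (1UL << bit);
}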
*/ +STATIC int +xchk_xattr_walk_leaf_entries( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + struct xfs_buf *bp, + void *priv) +{ + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_mount *mp = sc->mp; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_attr_leaf_entry *entry; + unsigned int i; + int error; + + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); + entry = xfs_attr3_leaf_entryp(leaf); + + for (i = 0; i < ichdr.count; entry++, i++) { + void *value; + unsigned char *name; + unsigned int namelen, valuelen; + + if (entry->flags & XFS_ATTR_LOCAL) { + struct xfs_attr_leaf_name_local *name_loc; + + name_loc = xfs_attr3_leaf_name_local(leaf, i); + name = name_loc->nameval; + namelen = name_loc->namelen; + value = &name_loc->nameval[name_loc->namelen]; + valuelen = be16_to_cpu(name_loc->valuelen); + } else { + struct xfs_attr_leaf_name_remote *name_rmt; + + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + name = name_rmt->name; + namelen = name_rmt->namelen; + value = NULL; + valuelen = be32_to_cpu(name_rmt->valuelen); + } + + error = attr_fn(sc, ip, entry->flags, name, namelen, value, + valuelen, priv); + if (error) + return error; + + } + + return 0; +} + +/* + * Call a function for every entry in a leaf-format xattr structure. Avoid + * memory allocations for the loop detector since there's only one block. + */ +STATIC int +xchk_xattr_walk_leaf( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + void *priv) +{ + struct xfs_buf *leaf_bp; + int error; + + error = xfs_attr3_leaf_read(sc->tp, ip, ip->i_ino, 0, &leaf_bp); + if (error) + return error; + + error = xchk_xattr_walk_leaf_entries(sc, ip, attr_fn, leaf_bp, priv); + xfs_trans_brelse(sc->tp, leaf_bp); + return error; +} + +/* Find the leftmost leaf in the xattr dabtree. */ +STATIC int +xchk_xattr_find_leftmost_leaf( + struct xfs_scrub *sc, + struct xfs_inode *ip, + struct xdab_bitmap *seen_dablks, + struct xfs_buf **leaf_bpp) +{ + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = sc->tp; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_buf *bp; + xfs_failaddr_t fa; + xfs_dablk_t blkno = 0; + unsigned int expected_level = 0; + int error; + + for (;;) { + xfs_extlen_t len = 1; + uint16_t magic; + + /* Make sure we haven't seen this new block already. */ + if (xdab_bitmap_test(seen_dablks, blkno, &len)) + return -EFSCORRUPTED; + + error = xfs_da3_node_read(tp, ip, blkno, &bp, XFS_ATTR_FORK); + if (error) + return error; + + node = bp->b_addr; + magic = be16_to_cpu(node->hdr.info.magic); + if (magic == XFS_ATTR_LEAF_MAGIC || + magic == XFS_ATTR3_LEAF_MAGIC) + break; + + error = -EFSCORRUPTED; + if (magic != XFS_DA_NODE_MAGIC && + magic != XFS_DA3_NODE_MAGIC) + goto out_buf; + + fa = xfs_da3_node_header_check(bp, ip->i_ino); + if (fa) + goto out_buf; + + xfs_da3_node_hdr_from_disk(mp, &nodehdr, node); + + if (nodehdr.count == 0 || nodehdr.level >= XFS_DA_NODE_MAXDEPTH) + goto out_buf; + + /* Check the level from the root node. */ + if (blkno == 0) + expected_level = nodehdr.level - 1; + else if (expected_level != nodehdr.level) + goto out_buf; + else + expected_level--; + + /* Remember that we've seen this node. */ + error = xdab_bitmap_set(seen_dablks, blkno, 1); + if (error) + goto out_buf; + + /* Find the next level towards the leaves of the dabtree. 
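The descent loop above guards against corrupt child pointers by remembering every dabtree block it has visited; seeing a block twice means a cycle. A standalone model of that loop detection (a plain array in place of the xdab_bitmap, invented sentinel values):

#include <stdbool.h>
#include <stdint.h>

#define MAX_BLOCKS 4096
#define NO_CHILD UINT32_MAX	/* stand-in for "this block is a leaf" */

static bool descend_is_loop_free(const uint32_t *first_child,
				 uint32_t nblocks)
{
	bool seen[MAX_BLOCKS] = { false };
	uint32_t blkno = 0;			/* start at the root */

	if (nblocks > MAX_BLOCKS)
		return false;
	for (;;) {
		if (blkno >= nblocks || seen[blkno])
			return false;		/* out of range or a cycle */
		seen[blkno] = true;
		if (first_child[blkno] == NO_CHILD)
			return true;		/* reached a leaf */
		blkno = first_child[blkno];
	}
}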
*/ + btree = nodehdr.btree; + blkno = be32_to_cpu(btree->before); + xfs_trans_brelse(tp, bp); + } + + error = -EFSCORRUPTED; + fa = xfs_attr3_leaf_header_check(bp, ip->i_ino); + if (fa) + goto out_buf; + + if (expected_level != 0) + goto out_buf; + + /* Remember that we've seen this leaf. */ + error = xdab_bitmap_set(seen_dablks, blkno, 1); + if (error) + goto out_buf; + + *leaf_bpp = bp; + return 0; + +out_buf: + xfs_trans_brelse(tp, bp); + return error; +} + +/* Call a function for every entry in a node-format xattr structure. */ +STATIC int +xchk_xattr_walk_node( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + xchk_xattrleaf_fn leaf_fn, + void *priv) +{ + struct xfs_attr3_icleaf_hdr leafhdr; + struct xdab_bitmap seen_dablks; + struct xfs_mount *mp = sc->mp; + struct xfs_attr_leafblock *leaf; + struct xfs_buf *leaf_bp; + int error; + + xdab_bitmap_init(&seen_dablks); + + error = xchk_xattr_find_leftmost_leaf(sc, ip, &seen_dablks, &leaf_bp); + if (error) + goto out_bitmap; + + for (;;) { + xfs_extlen_t len; + + error = xchk_xattr_walk_leaf_entries(sc, ip, attr_fn, leaf_bp, + priv); + if (error) + goto out_leaf; + + /* Find the right sibling of this leaf block. */ + leaf = leaf_bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); + if (leafhdr.forw == 0) + goto out_leaf; + + xfs_trans_brelse(sc->tp, leaf_bp); + + if (leaf_fn) { + error = leaf_fn(sc, priv); + if (error) + goto out_bitmap; + } + + /* Make sure we haven't seen this new leaf already. */ + len = 1; + if (xdab_bitmap_test(&seen_dablks, leafhdr.forw, &len)) { + error = -EFSCORRUPTED; + goto out_bitmap; + } + + error = xfs_attr3_leaf_read(sc->tp, ip, ip->i_ino, + leafhdr.forw, &leaf_bp); + if (error) + goto out_bitmap; + + /* Remember that we've seen this new leaf. */ + error = xdab_bitmap_set(&seen_dablks, leafhdr.forw, 1); + if (error) + goto out_leaf; + } + +out_leaf: + xfs_trans_brelse(sc->tp, leaf_bp); +out_bitmap: + xdab_bitmap_destroy(&seen_dablks); + return error; +} + +/* + * Call a function for every extended attribute in a file. + * + * Callers must hold the ILOCK. No validation or cursor restarts allowed. + * Returns -EFSCORRUPTED on any problem, including loops in the dabtree. + */ +int +xchk_xattr_walk( + struct xfs_scrub *sc, + struct xfs_inode *ip, + xchk_xattr_fn attr_fn, + xchk_xattrleaf_fn leaf_fn, + void *priv) +{ + int error; + + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + if (!xfs_inode_hasattr(ip)) + return 0; + + if (ip->i_af.if_format == XFS_DINODE_FMT_LOCAL) + return xchk_xattr_walk_sf(sc, ip, attr_fn, priv); + + /* attr functions require that the attr fork is loaded */ + error = xfs_iread_extents(sc->tp, ip, XFS_ATTR_FORK); + if (error) + return error; + + if (xfs_attr_is_leaf(ip)) + return xchk_xattr_walk_leaf(sc, ip, attr_fn, priv); + + return xchk_xattr_walk_node(sc, ip, attr_fn, leaf_fn, priv); +} diff --git a/fs/xfs/scrub/listxattr.h b/fs/xfs/scrub/listxattr.h new file mode 100644 index 000000000000..703cfb7b14cf --- /dev/null +++ b/fs/xfs/scrub/listxattr.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_LISTXATTR_H__ +#define __XFS_SCRUB_LISTXATTR_H__ + +typedef int (*xchk_xattr_fn)(struct xfs_scrub *sc, struct xfs_inode *ip, + unsigned int attr_flags, const unsigned char *name, + unsigned int namelen, const void *value, unsigned int valuelen, + void *priv); + +typedef int (*xchk_xattrleaf_fn)(struct xfs_scrub *sc, void *priv); + +int xchk_xattr_walk(struct xfs_scrub *sc, struct xfs_inode *ip, + xchk_xattr_fn attr_fn, xchk_xattrleaf_fn leaf_fn, void *priv); + +#endif /* __XFS_SCRUB_LISTXATTR_H__ */ diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c index 8a7d9557897c..80aee30886c4 100644 --- a/fs/xfs/scrub/nlinks.c +++ b/fs/xfs/scrub/nlinks.c @@ -18,15 +18,19 @@ #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_ag.h" +#include "xfs_parent.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/iscan.h" +#include "scrub/orphanage.h" #include "scrub/nlinks.h" #include "scrub/trace.h" #include "scrub/readdir.h" +#include "scrub/tempfile.h" +#include "scrub/listxattr.h" /* * Live Inode Link Count Checking @@ -43,11 +47,23 @@ int xchk_setup_nlinks( struct xfs_scrub *sc) { + struct xchk_nlink_ctrs *xnc; + int error; + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); - sc->buf = kzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS); - if (!sc->buf) + if (xchk_could_repair(sc)) { + error = xrep_setup_nlinks(sc); + if (error) + return error; + } + + xnc = kvzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS); + if (!xnc) return -ENOMEM; + xnc->xname.name = xnc->namebuf; + xnc->sc = sc; + sc->buf = xnc; return xchk_setup_fs(sc); } @@ -152,6 +168,13 @@ xchk_nlinks_live_update( xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb); + /* + * Ignore temporary directories being used to stage dir repairs, since + * we don't bump the link counts of the children. + */ + if (xrep_is_tempfile(p->dp)) + return NOTIFY_DONE; + trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino, p->delta, p->name->name, p->name->len); @@ -251,12 +274,17 @@ xchk_nlinks_collect_dirent( * number of parents of the root directory. * * Otherwise, increment the number of backrefs pointing back to ino. + * + * If the filesystem has parent pointers, we walk the pptrs to + * determine the backref count. */ if (dotdot) { if (dp == sc->mp->m_rootip) error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); - else + else if (!xfs_has_parent(sc->mp)) error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0); + else + error = 0; if (error) goto out_unlock; } @@ -293,6 +321,61 @@ out_incomplete: return error; } +/* Bump the backref count for the inode referenced by this parent pointer. */ +STATIC int +xchk_nlinks_collect_pptr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xfs_name xname = { + .name = name, + .len = namelen, + }; + struct xchk_nlink_ctrs *xnc = priv; + const struct xfs_parent_rec *pptr_rec = value; + xfs_ino_t parent_ino; + int error; + + /* Update the shadow link counts if we haven't already failed. 
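+	 * Each parent pointer contributes one backref to the parent's shadow
+	 * link count, mirroring what the dirent scan records on filesystems
+	 * without parent pointers.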
 */
+
+	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+		error = -ECANCELED;
+		goto out_incomplete;
+	}
+
+	if (!(attr_flags & XFS_ATTR_PARENT))
+		return 0;
+
+	error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
+			valuelen, &parent_ino, NULL);
+	if (error)
+		return error;
+
+	trace_xchk_nlinks_collect_pptr(sc->mp, ip, &xname, pptr_rec);
+
+	mutex_lock(&xnc->lock);
+
+	error = xchk_nlinks_update_incore(xnc, parent_ino, 0, 1, 0);
+	if (error)
+		goto out_unlock;
+
+	mutex_unlock(&xnc->lock);
+	return 0;
+
+out_unlock:
+	mutex_unlock(&xnc->lock);
+	xchk_iscan_abort(&xnc->collect_iscan);
+out_incomplete:
+	xchk_set_incomplete(sc);
+	return error;
+}
+
 /* Walk a directory to bump the observed link counts of the children. */
 STATIC int
 xchk_nlinks_collect_dir(
@@ -303,6 +386,13 @@ xchk_nlinks_collect_dir(
 	unsigned int		lock_mode;
 	int			error = 0;
 
+	/*
+	 * Ignore temporary directories being used to stage dir repairs, since
+	 * we don't bump the link counts of the children.
+	 */
+	if (xrep_is_tempfile(dp))
+		return 0;
+
 	/* Prevent anyone from changing this directory while we walk it. */
 	xfs_ilock(dp, XFS_IOLOCK_SHARED);
 	lock_mode = xfs_ilock_data_map_shared(dp);
@@ -332,6 +422,28 @@
 	if (error)
 		goto out_abort;
 
+	/* Walk the parent pointers to get real backref counts. */
+	if (xfs_has_parent(sc->mp)) {
+		/*
+		 * If the extended attributes look as though they have been
+		 * zapped by the inode record repair code, we cannot scan for
+		 * parent pointers.
+		 */
+		if (xchk_pptr_looks_zapped(dp)) {
+			error = -EBUSY;
+			goto out_unlock;
+		}
+
+		error = xchk_xattr_walk(sc, dp, xchk_nlinks_collect_pptr, NULL,
+				xnc);
+		if (error == -ECANCELED) {
+			error = 0;
+			goto out_unlock;
+		}
+		if (error)
+			goto out_abort;
+	}
+
 	xchk_iscan_mark_visited(&xnc->collect_iscan, dp);
 	goto out_unlock;
 
@@ -537,6 +649,14 @@ xchk_nlinks_compare_inode(
 	unsigned int		actual_nlink;
 	int			error;
 
+	/*
+	 * Ignore temporary files being used to stage repairs, since we assume
+	 * they're correct for non-directories, and the directory repair code
+	 * doesn't bump the link counts for the children.
+	 */
+	if (xrep_is_tempfile(ip))
+		return 0;
+
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	mutex_lock(&xnc->lock);
 
@@ -571,9 +691,11 @@
 	 * this as a corruption.  The VFS won't let users increase the link
 	 * count, but it will let them decrease it.
 	 */
-	if (total_links > XFS_MAXLINK) {
+	if (total_links > XFS_NLINK_PINNED) {
 		xchk_ino_set_corrupt(sc, ip->i_ino);
 		goto out_corrupt;
+	} else if (total_links > XFS_MAXLINK) {
+		xchk_ino_set_warning(sc, ip->i_ino);
 	}
 
 	/* Link counts should match. */
@@ -850,9 +972,6 @@ xchk_nlinks_setup_scan(
 	xfs_agino_t		first_agino, last_agino;
 	int			error;
 
-	ASSERT(xnc->sc == NULL);
-	xnc->sc = sc;
-
 	mutex_init(&xnc->lock);
 
 	/* Retry iget every tenth of a second for up to 30 seconds. */
diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h
index a950f3daf204..b820712bfd87 100644
--- a/fs/xfs/scrub/nlinks.h
+++ b/fs/xfs/scrub/nlinks.h
@@ -28,6 +28,13 @@ struct xchk_nlink_ctrs {
 	 * from other writer threads.
 	 */
 	struct xfs_dir_hook	dhook;
+
+	/* Orphanage reparenting request. */
+	struct xrep_adoption	adoption;
+
+	/* Directory entry name, plus the trailing null.
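+	 * The link count repair uses this buffer with
+	 * xrep_adoption_compute_name when it moves an orphaned file to the
+	 * lost+found.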
*/ + struct xfs_name xname; + char namebuf[MAXNAMELEN]; }; /* diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c index b87618322f55..b3e707f47b7b 100644 --- a/fs/xfs/scrub/nlinks_repair.c +++ b/fs/xfs/scrub/nlinks_repair.c @@ -17,14 +17,19 @@ #include "xfs_iwalk.h" #include "xfs_ialloc.h" #include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_parent.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/iscan.h" +#include "scrub/orphanage.h" #include "scrub/nlinks.h" #include "scrub/trace.h" +#include "scrub/tempfile.h" /* * Live Inode Link Count Repair @@ -36,6 +41,48 @@ * inode is locked. */ +/* Set up to repair inode link counts. */ +int +xrep_setup_nlinks( + struct xfs_scrub *sc) +{ + return xrep_orphanage_try_create(sc); +} + +/* + * Inodes that aren't the root directory or the orphanage, have a nonzero link + * count, and no observed parents should be moved to the orphanage. + */ +static inline bool +xrep_nlinks_is_orphaned( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int actual_nlink, + const struct xchk_nlink *obs) +{ + struct xfs_mount *mp = ip->i_mount; + + if (obs->parents != 0) + return false; + if (ip == mp->m_rootip || ip == sc->orphanage) + return false; + return actual_nlink != 0; +} + +/* Remove an inode from the unlinked list. */ +STATIC int +xrep_nlinks_iunlink_remove( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag; + int error; + + pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, sc->ip->i_ino)); + error = xfs_iunlink_remove(sc->tp, pag, sc->ip); + xfs_perag_put(pag); + return error; +} + /* * Correct the link count of the given inode. Because we have to grab locks * and resources in a certain order, it's possible that this will be a no-op. @@ -50,17 +97,55 @@ xrep_nlinks_repair_inode( struct xfs_inode *ip = sc->ip; uint64_t total_links; uint64_t actual_nlink; + bool orphanage_available = false; bool dirty = false; int error; - xchk_ilock(sc, XFS_IOLOCK_EXCL); + /* + * Ignore temporary files being used to stage repairs, since we assume + * they're correct for non-directories, and the directory repair code + * doesn't bump the link counts for the children. + */ + if (xrep_is_tempfile(ip)) + return 0; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &sc->tp); - if (error) - return error; + /* + * If the filesystem has an orphanage attached to the scrub context, + * prepare for a link count repair that could involve @ip being adopted + * by the lost+found. + */ + if (xrep_orphanage_can_adopt(sc)) { + error = xrep_orphanage_iolock_two(sc); + if (error) + return error; - xchk_ilock(sc, XFS_ILOCK_EXCL); - xfs_trans_ijoin(sc->tp, ip, 0); + error = xrep_adoption_trans_alloc(sc, &xnc->adoption); + if (error) { + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + } else { + orphanage_available = true; + } + } + + /* + * Either there is no orphanage or we couldn't allocate resources for + * that kind of update. Let's try again with only the resources we + * need for a simple link count update, since that's much more common. 
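+	 * (The adoption path needs the IOLOCKs of two inodes and a larger
+	 * space reservation; this fallback only takes this file's locks and
+	 * the plain tr_link reservation.)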
+ */ + if (!orphanage_available) { + xchk_ilock(sc, XFS_IOLOCK_EXCL); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, + &sc->tp); + if (error) { + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + return error; + } + + xchk_ilock(sc, XFS_ILOCK_EXCL); + xfs_trans_ijoin(sc->tp, ip, 0); + } mutex_lock(&xnc->lock); @@ -99,28 +184,68 @@ xrep_nlinks_repair_inode( } /* - * We did not find any links to this inode. If the inode agrees, we - * have nothing further to do. If not, the inode has a nonzero link - * count and we don't have anywhere to graft the child onto. Dropping - * a live inode's link count to zero can cause unexpected shutdowns in - * inactivation, so leave it alone. + * Decide if we're going to move this file to the orphanage, and fix + * up the incore link counts if we are. */ - if (total_links == 0) { - if (actual_nlink != 0) - trace_xrep_nlinks_unfixable_inode(mp, ip, &obs); - goto out_trans; + if (orphanage_available && + xrep_nlinks_is_orphaned(sc, ip, actual_nlink, &obs)) { + /* Figure out what name we're going to use here. */ + error = xrep_adoption_compute_name(&xnc->adoption, &xnc->xname); + if (error) + goto out_trans; + + /* + * Reattach this file to the directory tree by moving it to + * the orphanage per the adoption parameters that we already + * computed. + */ + error = xrep_adoption_move(&xnc->adoption); + if (error) + goto out_trans; + + /* + * Re-read the link counts since the reparenting will have + * updated our scan info. + */ + mutex_lock(&xnc->lock); + error = xfarray_load_sparse(xnc->nlinks, ip->i_ino, &obs); + mutex_unlock(&xnc->lock); + if (error) + goto out_trans; + + total_links = xchk_nlink_total(ip, &obs); + actual_nlink = VFS_I(ip)->i_nlink; + dirty = true; } - /* Commit the new link count if it changed. */ - if (total_links != actual_nlink) { - if (total_links > XFS_MAXLINK) { - trace_xrep_nlinks_unfixable_inode(mp, ip, &obs); + /* + * If this inode is linked from the directory tree and on the unlinked + * list, remove it from the unlinked list. + */ + if (total_links > 0 && xfs_inode_on_unlinked_list(ip)) { + error = xrep_nlinks_iunlink_remove(sc); + if (error) goto out_trans; - } + dirty = true; + } + /* + * If this inode is not linked from the directory tree yet not on the + * unlinked list, put it on the unlinked list. + */ + if (total_links == 0 && !xfs_inode_on_unlinked_list(ip)) { + error = xfs_iunlink(sc->tp, ip); + if (error) + goto out_trans; + dirty = true; + } + + /* Commit the new link count if it changed. */ + if (total_links != actual_nlink) { trace_xrep_nlinks_update_inode(mp, ip, &obs); - set_nlink(VFS_I(ip), total_links); + set_nlink(VFS_I(ip), min_t(unsigned long long, total_links, + XFS_NLINK_PINNED)); dirty = true; } @@ -132,14 +257,19 @@ xrep_nlinks_repair_inode( xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE); error = xrep_trans_commit(sc); - xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - return error; + goto out_unlock; out_scanlock: mutex_unlock(&xnc->lock); out_trans: xchk_trans_cancel(sc); - xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); +out_unlock: + xchk_iunlock(sc, XFS_ILOCK_EXCL); + if (orphanage_available) { + xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + } + xchk_iunlock(sc, XFS_IOLOCK_EXCL); return error; } @@ -172,10 +302,10 @@ xrep_nlinks( /* * We need ftype for an accurate count of the number of child * subdirectory links. 
Child subdirectories with a back link (dotdot
-	 * entry) but no forward link are unfixable, so we cannot repair the
-	 * link count of the parent directory based on the back link count
-	 * alone.  Filesystems without ftype support are rare (old V4) so we
-	 * just skip out here.
+	 * entry) but no forward link are moved to the orphanage, so we cannot
+	 * repair the link count of the parent directory based on the back link
+	 * count alone.  Filesystems without ftype support are rare (old V4) so
+	 * we just skip out here.
 	 */
 	if (!xfs_has_ftype(sc->mp))
 		return -EOPNOTSUPP;
diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c
new file mode 100644
index 000000000000..7148d8362db8
--- /dev/null
+++ b/fs/xfs/scrub/orphanage.c
@@ -0,0 +1,627 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_dir2.h"
+#include "xfs_icache.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_parent.h"
+#include "xfs_attr_sf.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/trace.h"
+#include "scrub/orphanage.h"
+#include "scrub/readdir.h"
+
+#include <linux/namei.h>
+
+/*
+ * The Orphanage
+ * =============
+ *
+ * If the directory tree is damaged, children of that directory become
+ * inaccessible via that file path.  If a child has no other parents, the file
+ * is said to be orphaned.  xfs_repair fixes this situation by creating an
+ * orphanage directory (specifically, /lost+found) and creating a directory
+ * entry pointing to the orphaned file.
+ *
+ * Online repair follows this tactic by creating a root-owned /lost+found
+ * directory if one does not exist.  If an orphan is found, it will move that
+ * file into the orphanage.
+ */
+
+/* Make the orphanage owned by root. */
+STATIC int
+xrep_chown_orphanage(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*dp)
+{
+	struct xfs_trans	*tp;
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_dquot	*udqp = NULL, *gdqp = NULL, *pdqp = NULL;
+	struct xfs_dquot	*oldu = NULL, *oldg = NULL, *oldp = NULL;
+	struct inode		*inode = VFS_I(dp);
+	int			error;
+
+	error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
+			XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp);
+	if (error)
+		return error;
+
+	error = xfs_trans_alloc_ichange(dp, udqp, gdqp, pdqp, true, &tp);
+	if (error)
+		goto out_dqrele;
+
+	/*
+	 * Always clear setuid/setgid/sticky on the orphanage since we don't
+	 * normally want that functionality on this directory and xfs_repair
+	 * doesn't create it this way either.  Leave the other access bits
+	 * unchanged.
+	 */
+	inode->i_mode &= ~(S_ISUID | S_ISGID | S_ISVTX);
+
+	/*
+	 * Change the ownerships and register quota modifications
+	 * in the transaction.
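+	 * Each xfs_qm_vop_chown call below swaps the inode's dquot attachment
+	 * to the new owner and hands back the old dquot, which is released
+	 * once the transaction commits.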
+ */ + if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID)) { + if (XFS_IS_UQUOTA_ON(mp)) + oldu = xfs_qm_vop_chown(tp, dp, &dp->i_udquot, udqp); + inode->i_uid = GLOBAL_ROOT_UID; + } + if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID)) { + if (XFS_IS_GQUOTA_ON(mp)) + oldg = xfs_qm_vop_chown(tp, dp, &dp->i_gdquot, gdqp); + inode->i_gid = GLOBAL_ROOT_GID; + } + if (dp->i_projid != 0) { + if (XFS_IS_PQUOTA_ON(mp)) + oldp = xfs_qm_vop_chown(tp, dp, &dp->i_pdquot, pdqp); + dp->i_projid = 0; + } + + dp->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + XFS_STATS_INC(mp, xs_ig_attrchg); + + if (xfs_has_wsync(mp)) + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp); + + xfs_qm_dqrele(oldu); + xfs_qm_dqrele(oldg); + xfs_qm_dqrele(oldp); + +out_dqrele: + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + return error; +} + +#define ORPHANAGE "lost+found" + +/* Create the orphanage directory, and set sc->orphanage to it. */ +int +xrep_orphanage_create( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct dentry *root_dentry, *orphanage_dentry; + struct inode *root_inode = VFS_I(sc->mp->m_rootip); + struct inode *orphanage_inode; + int error; + + if (xfs_is_shutdown(mp)) + return -EIO; + if (xfs_is_readonly(mp)) { + sc->orphanage = NULL; + return 0; + } + + ASSERT(sc->tp == NULL); + ASSERT(sc->orphanage == NULL); + + /* Find the dentry for the root directory... */ + root_dentry = d_find_alias(root_inode); + if (!root_dentry) { + error = -EFSCORRUPTED; + goto out; + } + + /* ...which is a directory, right? */ + if (!d_is_dir(root_dentry)) { + error = -EFSCORRUPTED; + goto out_dput_root; + } + + /* Try to find the orphanage directory. */ + inode_lock_nested(root_inode, I_MUTEX_PARENT); + orphanage_dentry = lookup_one_len(ORPHANAGE, root_dentry, + strlen(ORPHANAGE)); + if (IS_ERR(orphanage_dentry)) { + error = PTR_ERR(orphanage_dentry); + goto out_unlock_root; + } + + /* + * Nothing found? Call mkdir to create the orphanage. Create the + * directory without other-user access because we're live and someone + * could have been relying partly on minimal access to a parent + * directory to control access to a file we put in here. + */ + if (d_really_is_negative(orphanage_dentry)) { + error = vfs_mkdir(&nop_mnt_idmap, root_inode, orphanage_dentry, + 0750); + if (error) + goto out_dput_orphanage; + } + + /* Not a directory? Bail out. */ + if (!d_is_dir(orphanage_dentry)) { + error = -ENOTDIR; + goto out_dput_orphanage; + } + + /* + * Grab a reference to the orphanage. This /should/ succeed since + * we hold the root directory locked and therefore nobody can delete + * the orphanage. + */ + orphanage_inode = igrab(d_inode(orphanage_dentry)); + if (!orphanage_inode) { + error = -ENOENT; + goto out_dput_orphanage; + } + + /* Make sure the orphanage is owned by root. */ + error = xrep_chown_orphanage(sc, XFS_I(orphanage_inode)); + if (error) + goto out_dput_orphanage; + + /* Stash the reference for later and bail out. 
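+	 * The scrub context keeps this inode reference (and any ILOCK state
+	 * recorded in orphanage_ilock_flags) until xrep_orphanage_rele
+	 * releases it.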
 */
+	sc->orphanage = XFS_I(orphanage_inode);
+	sc->orphanage_ilock_flags = 0;
+
+out_dput_orphanage:
+	dput(orphanage_dentry);
+out_unlock_root:
+	inode_unlock(VFS_I(sc->mp->m_rootip));
+out_dput_root:
+	dput(root_dentry);
+out:
+	return error;
+}
+
+void
+xrep_orphanage_ilock(
+	struct xfs_scrub	*sc,
+	unsigned int		ilock_flags)
+{
+	sc->orphanage_ilock_flags |= ilock_flags;
+	xfs_ilock(sc->orphanage, ilock_flags);
+}
+
+bool
+xrep_orphanage_ilock_nowait(
+	struct xfs_scrub	*sc,
+	unsigned int		ilock_flags)
+{
+	if (xfs_ilock_nowait(sc->orphanage, ilock_flags)) {
+		sc->orphanage_ilock_flags |= ilock_flags;
+		return true;
+	}
+
+	return false;
+}
+
+void
+xrep_orphanage_iunlock(
+	struct xfs_scrub	*sc,
+	unsigned int		ilock_flags)
+{
+	xfs_iunlock(sc->orphanage, ilock_flags);
+	sc->orphanage_ilock_flags &= ~ilock_flags;
+}
+
+/* Grab the IOLOCK of the orphanage and sc->ip. */
+int
+xrep_orphanage_iolock_two(
+	struct xfs_scrub	*sc)
+{
+	int			error = 0;
+
+	while (true) {
+		if (xchk_should_terminate(sc, &error))
+			return error;
+
+		/*
+		 * Normal XFS takes the IOLOCK before grabbing a transaction.
+		 * Scrub holds a transaction, which means that we can't block
+		 * on either IOLOCK.
+		 */
+		if (xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
+			if (xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL))
+				break;
+			xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
+		}
+		delay(1);
+	}
+
+	return 0;
+}
+
+/* Release the orphanage. */
+void
+xrep_orphanage_rele(
+	struct xfs_scrub	*sc)
+{
+	if (!sc->orphanage)
+		return;
+
+	if (sc->orphanage_ilock_flags)
+		xfs_iunlock(sc->orphanage, sc->orphanage_ilock_flags);
+
+	xchk_irele(sc, sc->orphanage);
+	sc->orphanage = NULL;
+}
+
+/* Adoption moves a file into /lost+found */
+
+/* Can the orphanage adopt @sc->ip? */
+bool
+xrep_orphanage_can_adopt(
+	struct xfs_scrub	*sc)
+{
+	ASSERT(sc->ip != NULL);
+
+	if (!sc->orphanage)
+		return false;
+	if (sc->ip == sc->orphanage)
+		return false;
+	if (xfs_internal_inum(sc->mp, sc->ip->i_ino))
+		return false;
+	return true;
+}
+
+/*
+ * Create a new transaction to send a child to the orphanage.
+ *
+ * Allocate a new transaction with sufficient disk space to handle the
+ * adoption, take the ILOCK_EXCL of both the orphanage and sc->ip, join them
+ * to the transaction, and reserve quota to reparent the latter.  The caller
+ * must hold the IOLOCK of the orphanage and sc->ip.
+ */
+int
+xrep_adoption_trans_alloc(
+	struct xfs_scrub	*sc,
+	struct xrep_adoption	*adopt)
+{
+	struct xfs_mount	*mp = sc->mp;
+	unsigned int		child_blkres = 0;
+	int			error;
+
+	ASSERT(sc->tp == NULL);
+	ASSERT(sc->ip != NULL);
+	ASSERT(sc->orphanage != NULL);
+	ASSERT(sc->ilock_flags & XFS_IOLOCK_EXCL);
+	ASSERT(sc->orphanage_ilock_flags & XFS_IOLOCK_EXCL);
+	ASSERT(!(sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)));
+	ASSERT(!(sc->orphanage_ilock_flags &
+			(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)));
+
+	/* Compute the worst case space reservation that we need. */
+	adopt->sc = sc;
+	adopt->orphanage_blkres = xfs_link_space_res(mp, MAXNAMELEN);
+	if (S_ISDIR(VFS_I(sc->ip)->i_mode))
+		child_blkres = xfs_rename_space_res(mp, 0, false,
+					xfs_name_dotdot.len, false);
+	if (xfs_has_parent(mp))
+		child_blkres += XFS_ADDAFORK_SPACE_RES(mp);
+	adopt->child_blkres = child_blkres;
+
+	/*
+	 * Allocate a transaction to link the child into the parent, along with
+	 * enough disk space to handle expansion of both the orphanage and the
+	 * dotdot entry of a child directory.
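+	 * As a worked example: adopting a child directory on a parent-pointer
+	 * filesystem reserves xfs_link_space_res(mp, MAXNAMELEN) blocks for
+	 * the orphanage, plus xfs_rename_space_res() for the dotdot rewrite
+	 * and XFS_ADDAFORK_SPACE_RES(mp) for a possible new attr fork, per
+	 * the computation above.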
+ */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, + adopt->orphanage_blkres + adopt->child_blkres, 0, 0, + &sc->tp); + if (error) + return error; + + xfs_lock_two_inodes(sc->orphanage, XFS_ILOCK_EXCL, + sc->ip, XFS_ILOCK_EXCL); + sc->ilock_flags |= XFS_ILOCK_EXCL; + sc->orphanage_ilock_flags |= XFS_ILOCK_EXCL; + + xfs_trans_ijoin(sc->tp, sc->orphanage, 0); + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* + * Reserve enough quota in the orphan directory to add the new name. + * Normally the orphanage should have user/group/project ids of zero + * and hence is not subject to quota enforcement, but we're allowed to + * exceed quota to reattach disconnected parts of the directory tree. + */ + error = xfs_trans_reserve_quota_nblks(sc->tp, sc->orphanage, + adopt->orphanage_blkres, 0, true); + if (error) + goto out_cancel; + + /* + * Reserve enough quota in the child directory to change dotdot. + * Here we're also allowed to exceed file quota to repair inconsistent + * metadata. + */ + if (adopt->child_blkres) { + error = xfs_trans_reserve_quota_nblks(sc->tp, sc->ip, + adopt->child_blkres, 0, true); + if (error) + goto out_cancel; + } + + return 0; +out_cancel: + xchk_trans_cancel(sc); + xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + return error; +} + +/* + * Compute the xfs_name for the directory entry that we're adding to the + * orphanage. Caller must hold ILOCKs of sc->ip and the orphanage and must not + * reuse namebuf until the adoption completes or is dissolved. + */ +int +xrep_adoption_compute_name( + struct xrep_adoption *adopt, + struct xfs_name *xname) +{ + struct xfs_scrub *sc = adopt->sc; + char *namebuf = (void *)xname->name; + xfs_ino_t ino; + unsigned int incr = 0; + int error = 0; + + adopt->xname = xname; + xname->len = snprintf(namebuf, MAXNAMELEN, "%llu", sc->ip->i_ino); + xname->type = xfs_mode_to_ftype(VFS_I(sc->ip)->i_mode); + + /* Make sure the filename is unique in the lost+found. */ + error = xchk_dir_lookup(sc, sc->orphanage, xname, &ino); + while (error == 0 && incr < 10000) { + xname->len = snprintf(namebuf, MAXNAMELEN, "%llu.%u", + sc->ip->i_ino, ++incr); + error = xchk_dir_lookup(sc, sc->orphanage, xname, &ino); + } + if (error == 0) { + /* We already have 10,000 entries in the orphanage? */ + return -EFSCORRUPTED; + } + + if (error != -ENOENT) + return error; + return 0; +} + +/* + * Make sure the dcache does not have a positive dentry for the name we've + * chosen. The caller should have checked with the ondisk directory, so any + * discrepancy is a sign that something is seriously wrong. + */ +static int +xrep_adoption_check_dcache( + struct xrep_adoption *adopt) +{ + struct qstr qname = QSTR_INIT(adopt->xname->name, + adopt->xname->len); + struct xfs_scrub *sc = adopt->sc; + struct dentry *d_orphanage, *d_child; + int error = 0; + + d_orphanage = d_find_alias(VFS_I(sc->orphanage)); + if (!d_orphanage) + return 0; + + d_child = d_hash_and_lookup(d_orphanage, &qname); + if (d_child) { + trace_xrep_adoption_check_child(sc->mp, d_child); + + if (d_is_positive(d_child)) { + ASSERT(d_is_negative(d_child)); + error = -EFSCORRUPTED; + } + + dput(d_child); + } + + dput(d_orphanage); + return error; +} + +/* + * Invalidate all dentries for the name that was added to the orphanage + * directory, and all dentries pointing to the child inode that was moved. + * + * There should not be any positive entries for the name, since we've + * maintained our lock on the orphanage directory. 
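+ *
+ * Negative dentries cached for the new name must still be dropped so that
+ * subsequent lookups go back to disk and find the file we just linked in.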
+ */ +static void +xrep_adoption_zap_dcache( + struct xrep_adoption *adopt) +{ + struct qstr qname = QSTR_INIT(adopt->xname->name, + adopt->xname->len); + struct xfs_scrub *sc = adopt->sc; + struct dentry *d_orphanage, *d_child; + + /* Invalidate all dentries for the adoption name */ + d_orphanage = d_find_alias(VFS_I(sc->orphanage)); + if (!d_orphanage) + return; + + d_child = d_hash_and_lookup(d_orphanage, &qname); + while (d_child != NULL) { + trace_xrep_adoption_invalidate_child(sc->mp, d_child); + + ASSERT(d_is_negative(d_child)); + d_invalidate(d_child); + dput(d_child); + d_child = d_lookup(d_orphanage, &qname); + } + + dput(d_orphanage); + + /* Invalidate all the dentries pointing down to this file. */ + while ((d_child = d_find_alias(VFS_I(sc->ip))) != NULL) { + trace_xrep_adoption_invalidate_child(sc->mp, d_child); + + d_invalidate(d_child); + dput(d_child); + } +} + +/* + * If we have to add an attr fork ahead of a parent pointer update, how much + * space should we ask for? + */ +static inline int +xrep_adoption_attr_sizeof( + const struct xrep_adoption *adopt) +{ + return sizeof(struct xfs_attr_sf_hdr) + + xfs_attr_sf_entsize_byname(sizeof(struct xfs_parent_rec), + adopt->xname->len); +} + +/* + * Move the current file to the orphanage under the computed name. + * + * Returns with a dirty transaction so that the caller can handle any other + * work, such as fixing up unlinked lists or resetting link counts. + */ +int +xrep_adoption_move( + struct xrep_adoption *adopt) +{ + struct xfs_scrub *sc = adopt->sc; + bool isdir = S_ISDIR(VFS_I(sc->ip)->i_mode); + int error; + + trace_xrep_adoption_reparent(sc->orphanage, adopt->xname, + sc->ip->i_ino); + + error = xrep_adoption_check_dcache(adopt); + if (error) + return error; + + /* + * If this filesystem has parent pointers, ensure that the file being + * moved to the orphanage has an attribute fork. This is required + * because the parent pointer code does not itself add attr forks. + */ + if (!xfs_inode_has_attr_fork(sc->ip) && xfs_has_parent(sc->mp)) { + int sf_size = xrep_adoption_attr_sizeof(adopt); + + error = xfs_bmap_add_attrfork(sc->tp, sc->ip, sf_size, true); + if (error) + return error; + } + + /* Create the new name in the orphanage. */ + error = xfs_dir_createname(sc->tp, sc->orphanage, adopt->xname, + sc->ip->i_ino, adopt->orphanage_blkres); + if (error) + return error; + + /* + * Bump the link count of the orphanage if we just added a + * subdirectory, and update its timestamps. + */ + xfs_trans_ichgtime(sc->tp, sc->orphanage, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + if (isdir) + xfs_bumplink(sc->tp, sc->orphanage); + xfs_trans_log_inode(sc->tp, sc->orphanage, XFS_ILOG_CORE); + + /* Bump the link count of the child. */ + if (adopt->bump_child_nlink) { + xfs_bumplink(sc->tp, sc->ip); + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + } + + /* Replace the dotdot entry if the child is a subdirectory. */ + if (isdir) { + error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot, + sc->orphanage->i_ino, adopt->child_blkres); + if (error) + return error; + } + + /* Add a parent pointer from the file back to the lost+found. */ + if (xfs_has_parent(sc->mp)) { + error = xfs_parent_addname(sc->tp, &adopt->ppargs, + sc->orphanage, adopt->xname, sc->ip); + if (error) + return error; + } + + /* + * Notify dirent hooks that we moved the file to /lost+found, and + * finish all the deferred work so that we know the adoption is fully + * recorded in the log. 
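+	 *
+	 * (Editor's sketch of the whole adoption sequence, modeled on the
+	 * link count repair in this series; error handling is elided and the
+	 * local variable names are illustrative:
+	 *
+	 *	if (xrep_orphanage_can_adopt(sc)) {
+	 *		error = xrep_orphanage_iolock_two(sc);
+	 *		error = xrep_adoption_trans_alloc(sc, &adopt);
+	 *		error = xrep_adoption_compute_name(&adopt, &xname);
+	 *		error = xrep_adoption_move(&adopt);
+	 *		... commit or roll the dirty transaction ...
+	 *	}
+	 * )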
+ */ + xfs_dir_update_hook(sc->orphanage, sc->ip, 1, adopt->xname); + + /* Remove negative dentries from the lost+found's dcache */ + xrep_adoption_zap_dcache(adopt); + return 0; +} + +/* + * Roll to a clean scrub transaction so that we can release the orphanage, + * even if xrep_adoption_move was not called. + * + * Commits all the work and deferred ops attached to an adoption request and + * rolls to a clean scrub transaction. On success, returns 0 with the scrub + * context holding a clean transaction with no inodes joined. On failure, + * returns negative errno with no scrub transaction. All inode locks are + * still held after this function returns. + */ +int +xrep_adoption_trans_roll( + struct xrep_adoption *adopt) +{ + struct xfs_scrub *sc = adopt->sc; + int error; + + trace_xrep_adoption_trans_roll(sc->orphanage, sc->ip, + !!(sc->tp->t_flags & XFS_TRANS_DIRTY)); + + /* Finish all the deferred ops to commit all repairs. */ + error = xrep_defer_finish(sc); + if (error) + return error; + + /* Roll the transaction once more to detach the inodes. */ + return xfs_trans_roll(&sc->tp); +} diff --git a/fs/xfs/scrub/orphanage.h b/fs/xfs/scrub/orphanage.h new file mode 100644 index 000000000000..7c7a2e7d81db --- /dev/null +++ b/fs/xfs/scrub/orphanage.h @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_ORPHANAGE_H__ +#define __XFS_SCRUB_ORPHANAGE_H__ + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_orphanage_create(struct xfs_scrub *sc); + +/* + * If we're doing a repair, ensure that the orphanage exists and attach it to + * the scrub context. + */ +static inline int +xrep_orphanage_try_create( + struct xfs_scrub *sc) +{ + int error; + + ASSERT(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR); + + error = xrep_orphanage_create(sc); + switch (error) { + case 0: + case -ENOENT: + case -ENOTDIR: + case -ENOSPC: + /* + * If the orphanage can't be found or isn't a directory, we'll + * keep going, but we won't be able to attach the file to the + * orphanage if we can't find the parent. + */ + return 0; + } + + return error; +} + +int xrep_orphanage_iolock_two(struct xfs_scrub *sc); + +void xrep_orphanage_ilock(struct xfs_scrub *sc, unsigned int ilock_flags); +bool xrep_orphanage_ilock_nowait(struct xfs_scrub *sc, + unsigned int ilock_flags); +void xrep_orphanage_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags); + +void xrep_orphanage_rele(struct xfs_scrub *sc); + +/* Information about a request to add a file to the orphanage. */ +struct xrep_adoption { + struct xfs_scrub *sc; + + /* Name used for the adoption. */ + struct xfs_name *xname; + + /* Parent pointer context tracking */ + struct xfs_parent_args ppargs; + + /* Block reservations for orphanage and child (if directory). */ + unsigned int orphanage_blkres; + unsigned int child_blkres; + + /* + * Does the caller want us to bump the child link count? This is not + * needed when reattaching files that have become disconnected but have + * nlink > 1. It is necessary when changing the directory tree + * structure. 
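+	 * When set, xrep_adoption_move calls xfs_bumplink on the child after
+	 * creating the new directory entry.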
+ */ + bool bump_child_nlink:1; +}; + +bool xrep_orphanage_can_adopt(struct xfs_scrub *sc); + +int xrep_adoption_trans_alloc(struct xfs_scrub *sc, + struct xrep_adoption *adopt); +int xrep_adoption_compute_name(struct xrep_adoption *adopt, + struct xfs_name *xname); +int xrep_adoption_move(struct xrep_adoption *adopt); +int xrep_adoption_trans_roll(struct xrep_adoption *adopt); +#else +struct xrep_adoption { /* empty */ }; +# define xrep_orphanage_rele(sc) ((void)0) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_ORPHANAGE_H__ */ diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 7db873672146..733c410a2279 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -10,19 +10,37 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_inode.h" #include "xfs_icache.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" +#include "xfs_attr.h" +#include "xfs_parent.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/readdir.h" +#include "scrub/tempfile.h" +#include "scrub/repair.h" +#include "scrub/listxattr.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" +#include "scrub/trace.h" /* Set us up to scrub parents. */ int xchk_setup_parent( struct xfs_scrub *sc) { + int error; + + if (xchk_could_repair(sc)) { + error = xrep_setup_parent(sc); + if (error) + return error; + } + return xchk_setup_inode_contents(sc, 0); } @@ -143,7 +161,8 @@ xchk_parent_validate( } if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) return error; - if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) { + if (dp == sc->ip || xrep_is_tempfile(dp) || + !S_ISDIR(VFS_I(dp)->i_mode)) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); goto out_rele; } @@ -185,6 +204,621 @@ out_rele: return error; } +/* + * Checking of Parent Pointers + * =========================== + * + * On filesystems with directory parent pointers, we check the referential + * integrity by visiting each parent pointer of a child file and checking that + * the directory referenced by the pointer actually has a dirent pointing + * forward to the child file. + */ + +/* Deferred parent pointer entry that we saved for later. */ +struct xchk_pptr { + /* Cookie for retrieval of the pptr name. */ + xfblob_cookie name_cookie; + + /* Parent pointer record. */ + struct xfs_parent_rec pptr_rec; + + /* Length of the pptr name. */ + uint8_t namelen; +}; + +struct xchk_pptrs { + struct xfs_scrub *sc; + + /* How many parent pointers did we find at the end? */ + unsigned long long pptrs_found; + + /* Parent of this directory. */ + xfs_ino_t parent_ino; + + /* Fixed-size array of xchk_pptr structures. */ + struct xfarray *pptr_entries; + + /* Blobs containing parent pointer names. */ + struct xfblob *pptr_names; + + /* Scratch buffer for scanning pptr xattrs */ + struct xfs_da_args pptr_args; + + /* If we've cycled the ILOCK, we must revalidate all deferred pptrs. */ + bool need_revalidate; + + /* Name buffer */ + struct xfs_name xname; + char namebuf[MAXNAMELEN]; +}; + +/* Does this parent pointer match the dotdot entry? 
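+ * A parent pointer is stored in the XFS_ATTR_PARENT xattr namespace: the
+ * attr name is the directory entry name, and the attr value is a struct
+ * xfs_parent_rec carrying the parent's inumber and generation, which
+ * xfs_parent_from_attr decodes below.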
*/ +STATIC int +xchk_parent_scan_dotdot( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xchk_pptrs *pp = priv; + xfs_ino_t parent_ino; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, &parent_ino, NULL); + if (error) + return error; + + if (pp->parent_ino == parent_ino) + return -ECANCELED; + + return 0; +} + +/* Look up the dotdot entry so that we can check it as we walk the pptrs. */ +STATIC int +xchk_parent_pptr_and_dotdot( + struct xchk_pptrs *pp) +{ + struct xfs_scrub *sc = pp->sc; + int error; + + /* Look up '..' */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &pp->parent_ino); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + return error; + if (!xfs_verify_dir_ino(sc->mp, pp->parent_ino)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + + /* Is this the root dir? Then '..' must point to itself. */ + if (sc->ip == sc->mp->m_rootip) { + if (sc->ip->i_ino != pp->parent_ino) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + + /* + * If this is now an unlinked directory, the dotdot value is + * meaningless as long as it points to a valid inode. + */ + if (VFS_I(sc->ip)->i_nlink == 0) + return 0; + + if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* Otherwise, walk the pptrs again, and check. */ + error = xchk_xattr_walk(sc, sc->ip, xchk_parent_scan_dotdot, NULL, pp); + if (error == -ECANCELED) { + /* Found a parent pointer that matches dotdot. */ + return 0; + } + if (!error || error == -EFSCORRUPTED) { + /* Found a broken parent pointer or no match. */ + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return 0; + } + return error; +} + +/* + * Try to lock a parent directory for checking dirents. Returns the inode + * flags for the locks we now hold, or zero if we failed. + */ +STATIC unsigned int +xchk_parent_lock_dir( + struct xfs_scrub *sc, + struct xfs_inode *dp) +{ + if (!xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED)) + return 0; + + if (!xfs_ilock_nowait(dp, XFS_ILOCK_SHARED)) { + xfs_iunlock(dp, XFS_IOLOCK_SHARED); + return 0; + } + + if (!xfs_need_iread_extents(&dp->i_df)) + return XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED; + + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + if (!xfs_ilock_nowait(dp, XFS_ILOCK_EXCL)) { + xfs_iunlock(dp, XFS_IOLOCK_SHARED); + return 0; + } + + return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL; +} + +/* Check the forward link (dirent) associated with this parent pointer. */ +STATIC int +xchk_parent_dirent( + struct xchk_pptrs *pp, + const struct xfs_name *xname, + struct xfs_inode *dp) +{ + struct xfs_scrub *sc = pp->sc; + xfs_ino_t child_ino; + int error; + + /* + * Use the name attached to this parent pointer to look up the + * directory entry in the alleged parent. + */ + error = xchk_dir_lookup(sc, dp, xname, &child_ino); + if (error == -ENOENT) { + xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0); + return 0; + } + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error)) + return error; + + /* Does the inode number match? */ + if (child_ino != sc->ip->i_ino) { + xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0); + return 0; + } + + return 0; +} + +/* Try to grab a parent directory. 
*/ +STATIC int +xchk_parent_iget( + struct xchk_pptrs *pp, + const struct xfs_parent_rec *pptr, + struct xfs_inode **dpp) +{ + struct xfs_scrub *sc = pp->sc; + struct xfs_inode *ip; + xfs_ino_t parent_ino = be64_to_cpu(pptr->p_ino); + int error; + + /* Validate inode number. */ + error = xfs_dir_ino_validate(sc->mp, parent_ino); + if (error) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return -ECANCELED; + } + + error = xchk_iget(sc, parent_ino, &ip); + if (error == -EINVAL || error == -ENOENT) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return -ECANCELED; + } + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error)) + return error; + + /* The parent must be a directory. */ + if (!S_ISDIR(VFS_I(ip)->i_mode)) { + xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0); + goto out_rele; + } + + /* Validate generation number. */ + if (VFS_I(ip)->i_generation != be32_to_cpu(pptr->p_gen)) { + xchk_fblock_xref_set_corrupt(sc, XFS_ATTR_FORK, 0); + goto out_rele; + } + + *dpp = ip; + return 0; +out_rele: + xchk_irele(sc, ip); + return 0; +} + +/* + * Walk an xattr of a file. If this xattr is a parent pointer, follow it up + * to a parent directory and check that the parent has a dirent pointing back + * to us. + */ +STATIC int +xchk_parent_scan_attr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xfs_name xname = { + .name = name, + .len = namelen, + }; + struct xchk_pptrs *pp = priv; + struct xfs_inode *dp = NULL; + const struct xfs_parent_rec *pptr_rec = value; + xfs_ino_t parent_ino; + unsigned int lockmode; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, &parent_ino, NULL); + if (error) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return error; + } + + /* No self-referential parent pointers. */ + if (parent_ino == sc->ip->i_ino) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return -ECANCELED; + } + + pp->pptrs_found++; + + error = xchk_parent_iget(pp, pptr_rec, &dp); + if (error) + return error; + if (!dp) + return 0; + + /* Try to lock the inode. */ + lockmode = xchk_parent_lock_dir(sc, dp); + if (!lockmode) { + struct xchk_pptr save_pp = { + .pptr_rec = *pptr_rec, /* struct copy */ + .namelen = namelen, + }; + + /* Couldn't lock the inode, so save the pptr for later. */ + trace_xchk_parent_defer(sc->ip, &xname, dp->i_ino); + + error = xfblob_storename(pp->pptr_names, &save_pp.name_cookie, + &xname); + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, + &error)) + goto out_rele; + + error = xfarray_append(pp->pptr_entries, &save_pp); + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, + &error)) + goto out_rele; + + goto out_rele; + } + + error = xchk_parent_dirent(pp, &xname, dp); + if (error) + goto out_unlock; + +out_unlock: + xfs_iunlock(dp, lockmode); +out_rele: + xchk_irele(sc, dp); + return error; +} + +/* + * Revalidate a parent pointer that we collected in the past but couldn't check + * because of lock contention. Returns 0 if the parent pointer is still valid, + * -ENOENT if it has gone away on us, or a negative errno. 
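+ * Revalidation is necessary because we had to drop this file's ILOCK to
+ * avoid deadlocking on the parent, so another thread may have removed the
+ * stashed parent pointer in the meantime.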
+ */ +STATIC int +xchk_parent_revalidate_pptr( + struct xchk_pptrs *pp, + const struct xfs_name *xname, + struct xfs_parent_rec *pptr) +{ + struct xfs_scrub *sc = pp->sc; + int error; + + error = xfs_parent_lookup(sc->tp, sc->ip, xname, pptr, &pp->pptr_args); + if (error == -ENOATTR) { + /* Parent pointer went away, nothing to revalidate. */ + return -ENOENT; + } + + return error; +} + +/* + * Check a parent pointer the slow way, which means we cycle locks a bunch + * and put up with revalidation until we get it done. + */ +STATIC int +xchk_parent_slow_pptr( + struct xchk_pptrs *pp, + const struct xfs_name *xname, + struct xfs_parent_rec *pptr) +{ + struct xfs_scrub *sc = pp->sc; + struct xfs_inode *dp = NULL; + unsigned int lockmode; + int error; + + /* Check that the deferred parent pointer still exists. */ + if (pp->need_revalidate) { + error = xchk_parent_revalidate_pptr(pp, xname, pptr); + if (error == -ENOENT) + return 0; + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, + &error)) + return error; + } + + error = xchk_parent_iget(pp, pptr, &dp); + if (error) + return error; + if (!dp) + return 0; + + /* + * If we can grab both IOLOCK and ILOCK of the alleged parent, we + * can proceed with the validation. + */ + lockmode = xchk_parent_lock_dir(sc, dp); + if (lockmode) { + trace_xchk_parent_slowpath(sc->ip, xname, dp->i_ino); + goto check_dirent; + } + + /* + * We couldn't lock the parent dir. Drop all the locks and try to + * get them again, one at a time. + */ + xchk_iunlock(sc, sc->ilock_flags); + pp->need_revalidate = true; + + trace_xchk_parent_ultraslowpath(sc->ip, xname, dp->i_ino); + + error = xchk_dir_trylock_for_pptrs(sc, dp, &lockmode); + if (error) + goto out_rele; + + /* Revalidate the parent pointer now that we cycled locks. */ + error = xchk_parent_revalidate_pptr(pp, xname, pptr); + if (error == -ENOENT) { + error = 0; + goto out_unlock; + } + if (!xchk_fblock_xref_process_error(sc, XFS_ATTR_FORK, 0, &error)) + goto out_unlock; + +check_dirent: + error = xchk_parent_dirent(pp, xname, dp); +out_unlock: + xfs_iunlock(dp, lockmode); +out_rele: + xchk_irele(sc, dp); + return error; +} + +/* Check all the parent pointers that we deferred the first time around. */ +STATIC int +xchk_parent_finish_slow_pptrs( + struct xchk_pptrs *pp) +{ + xfarray_idx_t array_cur; + int error; + + foreach_xfarray_idx(pp->pptr_entries, array_cur) { + struct xchk_pptr pptr; + + if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + error = xfarray_load(pp->pptr_entries, array_cur, &pptr); + if (error) + return error; + + error = xfblob_loadname(pp->pptr_names, pptr.name_cookie, + &pp->xname, pptr.namelen); + if (error) + return error; + + error = xchk_parent_slow_pptr(pp, &pp->xname, &pptr.pptr_rec); + if (error) + return error; + } + + /* Empty out both xfiles now that we've checked everything. */ + xfarray_truncate(pp->pptr_entries); + xfblob_truncate(pp->pptr_names); + return 0; +} + +/* Count the number of parent pointers. 
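+ * The count feeds xchk_parent_count_pptrs below, which checks the link
+ * count invariant: a regular file with nlink == 3, say, must have exactly
+ * three parent pointers, while a linked directory only needs a nonzero
+ * count (and the root directory counts itself as one of its own parents).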
*/ +STATIC int +xchk_parent_count_pptr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xchk_pptrs *pp = priv; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, NULL, NULL); + if (error) + return error; + + pp->pptrs_found++; + return 0; +} + +/* + * Compare the number of parent pointers to the link count. For + * non-directories these should be the same. For unlinked directories the + * count should be zero; for linked directories, it should be nonzero. + */ +STATIC int +xchk_parent_count_pptrs( + struct xchk_pptrs *pp) +{ + struct xfs_scrub *sc = pp->sc; + int error; + + /* + * If we cycled the ILOCK while cross-checking parent pointers with + * dirents, then we need to recalculate the number of parent pointers. + */ + if (pp->need_revalidate) { + pp->pptrs_found = 0; + error = xchk_xattr_walk(sc, sc->ip, xchk_parent_count_pptr, + NULL, pp); + if (error == -EFSCORRUPTED) { + /* Found a bad parent pointer */ + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return 0; + } + if (error) + return error; + } + + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) { + if (sc->ip == sc->mp->m_rootip) + pp->pptrs_found++; + + if (VFS_I(sc->ip)->i_nlink == 0 && pp->pptrs_found > 0) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + else if (VFS_I(sc->ip)->i_nlink > 0 && + pp->pptrs_found == 0) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } else { + if (VFS_I(sc->ip)->i_nlink != pp->pptrs_found) + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } + + return 0; +} + +/* Check parent pointers of a file. */ +STATIC int +xchk_parent_pptr( + struct xfs_scrub *sc) +{ + struct xchk_pptrs *pp; + char *descr; + int error; + + pp = kvzalloc(sizeof(struct xchk_pptrs), XCHK_GFP_FLAGS); + if (!pp) + return -ENOMEM; + pp->sc = sc; + pp->xname.name = pp->namebuf; + + /* + * Set up some staging memory for parent pointers that we can't check + * due to locking contention. + */ + descr = xchk_xfile_ino_descr(sc, "slow parent pointer entries"); + error = xfarray_create(descr, 0, sizeof(struct xchk_pptr), + &pp->pptr_entries); + kfree(descr); + if (error) + goto out_pp; + + descr = xchk_xfile_ino_descr(sc, "slow parent pointer names"); + error = xfblob_create(descr, &pp->pptr_names); + kfree(descr); + if (error) + goto out_entries; + + error = xchk_xattr_walk(sc, sc->ip, xchk_parent_scan_attr, NULL, pp); + if (error == -ECANCELED) { + error = 0; + goto out_names; + } + if (error) + goto out_names; + + error = xchk_parent_finish_slow_pptrs(pp); + if (error == -ETIMEDOUT) { + /* Couldn't grab a lock, scrub was marked incomplete */ + error = 0; + goto out_names; + } + if (error) + goto out_names; + + if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out_names; + + /* + * For subdirectories, make sure the dotdot entry references the same + * inode as the parent pointers. + * + * If we're scanning a /consistent/ directory, there should only be + * one parent pointer, and it should point to the same directory as + * the dotdot entry. + * + * However, a corrupt directory tree might feature a subdirectory with + * multiple parents. The directory loop scanner is responsible for + * correcting that kind of problem, so for now we only validate that + * the dotdot entry matches /one/ of the parents. 
+	 */
+	if (S_ISDIR(VFS_I(sc->ip)->i_mode)) {
+		error = xchk_parent_pptr_and_dotdot(pp);
+		if (error)
+			goto out_names;
+	}
+
+	if (pp->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		goto out_names;
+
+	/*
+	 * Complain if the number of parent pointers doesn't match the link
+	 * count.  This could be a sign of missing parent pointers (or an
+	 * incorrect link count).
+	 */
+	error = xchk_parent_count_pptrs(pp);
+	if (error)
+		goto out_names;
+
+out_names:
+	xfblob_destroy(pp->pptr_names);
+out_entries:
+	xfarray_destroy(pp->pptr_entries);
+out_pp:
+	kvfree(pp);
+	return error;
+}
+
 /* Scrub a parent pointer. */
 int
 xchk_parent(
@@ -194,6 +828,9 @@ xchk_parent(
 	xfs_ino_t		parent_ino;
 	int			error = 0;
 
+	if (xfs_has_parent(mp))
+		return xchk_parent_pptr(sc);
+
 	/*
 	 * If we're a directory, check that the '..' link points up to
 	 * a directory that has one entry pointing to us.
@@ -237,3 +874,64 @@ xchk_parent(
 
 	return error;
 }
+
+/*
+ * Decide if this file's extended attributes (and therefore its parent
+ * pointers) have been zapped to satisfy the inode and ifork verifiers.
+ * Checking and repairing should be postponed until the extended attribute
+ * structure is fixed.
+ */
+bool
+xchk_pptr_looks_zapped(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct inode		*inode = VFS_I(ip);
+
+	ASSERT(xfs_has_parent(mp));
+
+	/*
+	 * Temporary files that cannot be linked into the directory tree do not
+	 * have attr forks because they cannot ever have parents.
+	 */
+	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+		return false;
+
+	/*
+	 * Directory tree roots do not have parents, so the expected outcome
+	 * of a parent pointer scan is always the empty set.  It's safe to scan
+	 * them even if the attr fork was zapped.
+	 */
+	if (ip == mp->m_rootip)
+		return false;
+
+	/*
+	 * Metadata inodes are all rooted in the superblock and do not have
+	 * any parents.  Hence the attr fork will not be initialized, but
+	 * there are no parent pointers that might have been zapped.
+	 */
+	if (xfs_is_metadata_inode(ip))
+		return false;
+
+	/*
+	 * Linked and linkable non-rootdir files should always have an
+	 * attribute fork because that is where parent pointers are
+	 * stored.  If the fork is absent, something is amiss.
+	 */
+	if (!xfs_inode_has_attr_fork(ip))
+		return true;
+
+	/* Repair zapped this file's attr fork a short time ago */
+	if (xfs_ifork_zapped(ip, XFS_ATTR_FORK))
+		return true;
+
+	/*
+	 * If the dinode repair found a bad attr fork, it will reset the fork
+	 * to extents format with zero records and wait for the bmapbta
+	 * scrubber to reconstruct the block mappings.  The extended attribute
+	 * structure always contains some content when parent pointers are
+	 * enabled, so this is a clear sign of a zapped attr fork.
+	 */
+	return ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS &&
+	       ip->i_af.if_nextents == 0;
+}
diff --git a/fs/xfs/scrub/parent_repair.c b/fs/xfs/scrub/parent_repair.c
new file mode 100644
index 000000000000..7b42b7f65a0b
--- /dev/null
+++ b/fs/xfs/scrub/parent_repair.c
@@ -0,0 +1,1612 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J.
Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trans_space.h"
+#include "xfs_health.h"
+#include "xfs_exchmaps.h"
+#include "xfs_parent.h"
+#include "xfs_attr.h"
+#include "xfs_bmap.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/iscan.h"
+#include "scrub/findparent.h"
+#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/orphanage.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/attr_repair.h"
+#include "scrub/listxattr.h"
+
+/*
+ * Repairing The Directory Parent Pointer
+ * ======================================
+ *
+ * Currently, only directories support parent pointers (in the form of '..'
+ * entries), so we simply scan the filesystem and update the '..' entry.
+ *
+ * Note that because the only parent pointer is the dotdot entry, we won't
+ * touch an unhealthy directory, since the directory repair code is perfectly
+ * capable of rebuilding a directory with the proper parent inode.
+ *
+ * See the section on locking issues in dir_repair.c for more information about
+ * conflicts with the VFS.  The findparent code will keep our incore parent
+ * inode up to date.
+ *
+ * If parent pointers are enabled, we instead reconstruct the parent pointer
+ * information by visiting every directory entry of every directory in the
+ * system and translating the relevant dirents into parent pointers.  In this
+ * case, it is advantageous to stash all parent pointers created from dirents
+ * from a single parent file before replaying them into the temporary file.  To
+ * save memory, the live filesystem scan reuses the findparent object.  Parent
+ * pointer repair chooses either directory scanning or findparent, but not
+ * both.
+ *
+ * When salvaging completes, the remaining stashed entries are replayed to the
+ * temporary file.  All non-parent pointer extended attributes are copied to
+ * the temporary file's extended attributes.  An atomic file mapping exchange
+ * is used to commit the new xattr blocks to the file being repaired.  This
+ * will disrupt attrmulti cursors.
+ */
+
+/* Create a parent pointer in the tempfile. */
+#define XREP_PPTR_ADD		(1)
+
+/* Remove a parent pointer from the tempfile. */
+#define XREP_PPTR_REMOVE	(2)
+
+/* A stashed parent pointer update. */
+struct xrep_pptr {
+	/* Cookie for retrieval of the pptr name. */
+	xfblob_cookie		name_cookie;
+
+	/* Parent pointer record. */
+	struct xfs_parent_rec	pptr_rec;
+
+	/* Length of the pptr name. */
+	uint8_t			namelen;
+
+	/* XREP_PPTR_{ADD,REMOVE} */
+	uint8_t			action;
+};
+
+/*
+ * Stash up to 8 pages of recovered parent pointers in pptr_recs and
+ * pptr_names before we write them to the temp file.
+ */
+#define XREP_PARENT_MAX_STASH_BYTES	(PAGE_SIZE * 8)
+
+struct xrep_parent {
+	struct xfs_scrub	*sc;
+
+	/* Fixed-size array of xrep_pptr structures. */
+	struct xfarray		*pptr_recs;
+
+	/* Blobs containing parent pointer names.
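+	 * Variable-length names live out of line in this xfblob so that each
+	 * pptr_recs element stays fixed-size: just a cookie, the parent
+	 * pointer record, and the name length.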
*/ + struct xfblob *pptr_names; + + /* xattr keys */ + struct xfarray *xattr_records; + + /* xattr values */ + struct xfblob *xattr_blobs; + + /* Scratch buffers for saving extended attributes */ + unsigned char *xattr_name; + void *xattr_value; + unsigned int xattr_value_sz; + + /* + * Information used to exchange the attr fork mappings, if the fs + * supports parent pointers. + */ + struct xrep_tempexch tx; + + /* + * Information used to scan the filesystem to find the inumber of the + * dotdot entry for this directory. On filesystems without parent + * pointers, we use the findparent_* functions on this object and + * access only the parent_ino field directly. + * + * When parent pointers are enabled, the directory entry scanner uses + * the iscan, hooks, and lock fields of this object directly. + * @pscan.lock coordinates access to pptr_recs, pptr_names, pptr, and + * pptr_scratch. This reduces the memory requirements of this + * structure. + * + * The lock also controls access to xattr_records and xattr_blobs(?) + */ + struct xrep_parent_scan_info pscan; + + /* Orphanage reparenting request. */ + struct xrep_adoption adoption; + + /* Directory entry name, plus the trailing null. */ + struct xfs_name xname; + unsigned char namebuf[MAXNAMELEN]; + + /* Scratch buffer for scanning pptr xattrs */ + struct xfs_da_args pptr_args; + + /* Have we seen any live updates of parent pointers recently? */ + bool saw_pptr_updates; + + /* Number of parents we found after all other repairs */ + unsigned long long parents; +}; + +struct xrep_parent_xattr { + /* Cookie for retrieval of the xattr name. */ + xfblob_cookie name_cookie; + + /* Cookie for retrieval of the xattr value. */ + xfblob_cookie value_cookie; + + /* XFS_ATTR_* flags */ + int flags; + + /* Length of the value and name. */ + uint32_t valuelen; + uint16_t namelen; +}; + +/* + * Stash up to 8 pages of attrs in xattr_records/xattr_blobs before we write + * them to the temp file. + */ +#define XREP_PARENT_XATTR_MAX_STASH_BYTES (PAGE_SIZE * 8) + +/* Tear down all the incore stuff we created. */ +static void +xrep_parent_teardown( + struct xrep_parent *rp) +{ + xrep_findparent_scan_teardown(&rp->pscan); + kvfree(rp->xattr_name); + rp->xattr_name = NULL; + kvfree(rp->xattr_value); + rp->xattr_value = NULL; + if (rp->xattr_blobs) + xfblob_destroy(rp->xattr_blobs); + rp->xattr_blobs = NULL; + if (rp->xattr_records) + xfarray_destroy(rp->xattr_records); + rp->xattr_records = NULL; + if (rp->pptr_names) + xfblob_destroy(rp->pptr_names); + rp->pptr_names = NULL; + if (rp->pptr_recs) + xfarray_destroy(rp->pptr_recs); + rp->pptr_recs = NULL; +} + +/* Set up for a parent repair. */ +int +xrep_setup_parent( + struct xfs_scrub *sc) +{ + struct xrep_parent *rp; + int error; + + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); + + rp = kvzalloc(sizeof(struct xrep_parent), XCHK_GFP_FLAGS); + if (!rp) + return -ENOMEM; + rp->sc = sc; + rp->xname.name = rp->namebuf; + sc->buf = rp; + + error = xrep_tempfile_create(sc, S_IFREG); + if (error) + return error; + + return xrep_orphanage_try_create(sc); +} + +/* + * Scan all files in the filesystem for a child dirent that we can turn into + * the dotdot entry for this directory. + */ +STATIC int +xrep_parent_find_dotdot( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + xfs_ino_t ino; + unsigned int sick, checked; + int error; + + /* + * Avoid sick directories. There shouldn't be anyone else clearing the + * directory's sick status. 
+ */ + xfs_inode_measure_sickness(sc->ip, &sick, &checked); + if (sick & XFS_SICK_INO_DIR) + return -EFSCORRUPTED; + + ino = xrep_findparent_self_reference(sc); + if (ino != NULLFSINO) { + xrep_findparent_scan_finish_early(&rp->pscan, ino); + return 0; + } + + /* + * Drop the ILOCK on this directory so that we can scan for the dotdot + * entry. Figure out who is going to be the parent of this directory, + * then retake the ILOCK so that we can salvage directory entries. + */ + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* Does the VFS dcache have an answer for us? */ + ino = xrep_findparent_from_dcache(sc); + if (ino != NULLFSINO) { + error = xrep_findparent_confirm(sc, &ino); + if (!error && ino != NULLFSINO) { + xrep_findparent_scan_finish_early(&rp->pscan, ino); + goto out_relock; + } + } + + /* Scan the entire filesystem for a parent. */ + error = xrep_findparent_scan(&rp->pscan); +out_relock: + xchk_ilock(sc, XFS_ILOCK_EXCL); + + return error; +} + +/* + * Add this stashed incore parent pointer to the temporary file. + * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and + * must not be in transaction context. + */ +STATIC int +xrep_parent_replay_update( + struct xrep_parent *rp, + const struct xfs_name *xname, + struct xrep_pptr *pptr) +{ + struct xfs_scrub *sc = rp->sc; + + switch (pptr->action) { + case XREP_PPTR_ADD: + /* Create parent pointer. */ + trace_xrep_parent_replay_parentadd(sc->tempip, xname, + &pptr->pptr_rec); + + return xfs_parent_set(sc->tempip, sc->ip->i_ino, xname, + &pptr->pptr_rec, &rp->pptr_args); + case XREP_PPTR_REMOVE: + /* Remove parent pointer. */ + trace_xrep_parent_replay_parentremove(sc->tempip, xname, + &pptr->pptr_rec); + + return xfs_parent_unset(sc->tempip, sc->ip->i_ino, xname, + &pptr->pptr_rec, &rp->pptr_args); + } + + ASSERT(0); + return -EIO; +} + +/* + * Flush stashed parent pointer updates that have been recorded by the scanner. + * This is done to reduce the memory requirements of the parent pointer + * rebuild, since files can have a lot of hardlinks and the fs can be busy. + * + * Caller must not hold transactions or ILOCKs. Caller must hold the tempfile + * IOLOCK. + */ +STATIC int +xrep_parent_replay_updates( + struct xrep_parent *rp) +{ + xfarray_idx_t array_cur; + int error; + + mutex_lock(&rp->pscan.lock); + foreach_xfarray_idx(rp->pptr_recs, array_cur) { + struct xrep_pptr pptr; + + error = xfarray_load(rp->pptr_recs, array_cur, &pptr); + if (error) + goto out_unlock; + + error = xfblob_loadname(rp->pptr_names, pptr.name_cookie, + &rp->xname, pptr.namelen); + if (error) + goto out_unlock; + rp->xname.len = pptr.namelen; + mutex_unlock(&rp->pscan.lock); + + error = xrep_parent_replay_update(rp, &rp->xname, &pptr); + if (error) + return error; + + mutex_lock(&rp->pscan.lock); + } + + /* Empty out both arrays now that we've added the entries. */ + xfarray_truncate(rp->pptr_recs); + xfblob_truncate(rp->pptr_names); + mutex_unlock(&rp->pscan.lock); + return 0; +out_unlock: + mutex_unlock(&rp->pscan.lock); + return error; +} + +/* + * Remember that we want to create a parent pointer in the tempfile. These + * stashed actions will be replayed later. 
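The replay machinery above walks two staging structures in lockstep: an xfarray of fixed-size xrep_pptr records and an xfblob holding the variable-length names, tied together by a cookie. A heap-backed toy model of that record/name split follows; the toy_* names are hypothetical stand-ins for the file-backed kernel structures.

	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>

	/* Toy blob store: the cookie is just a byte offset into one buffer. */
	struct toy_blobstore {
		char	*buf;
		size_t	used;
		size_t	cap;
	};

	/* Store a name; hand back the cookie needed to retrieve it later. */
	static int
	toy_storename(struct toy_blobstore *bs, const char *name,
			uint8_t namelen, size_t *cookie)
	{
		if (bs->used + namelen > bs->cap) {
			size_t	ncap = bs->cap ? bs->cap * 2 : 4096;
			char	*nbuf = realloc(bs->buf, ncap);

			if (!nbuf)
				return -1;
			bs->buf = nbuf;
			bs->cap = ncap;
		}
		memcpy(bs->buf + bs->used, name, namelen);
		*cookie = bs->used;
		bs->used += namelen;
		return 0;
	}

	/* Load a stashed name; like the kernel's xfblob, no NUL terminator. */
	static void
	toy_loadname(struct toy_blobstore *bs, size_t cookie, char *out,
			uint8_t namelen)
	{
		memcpy(out, bs->buf + cookie, namelen);
	}

The fixed-size half of the pair would simply be an append-only array of xrep_pptr, so loading record N and then loading the name at record N's cookie reconstructs the whole update, which is exactly what xrep_parent_replay_updates() does.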
+ */ +STATIC int +xrep_parent_stash_parentadd( + struct xrep_parent *rp, + const struct xfs_name *name, + const struct xfs_inode *dp) +{ + struct xrep_pptr pptr = { + .action = XREP_PPTR_ADD, + .namelen = name->len, + }; + int error; + + trace_xrep_parent_stash_parentadd(rp->sc->tempip, dp, name); + + xfs_inode_to_parent_rec(&pptr.pptr_rec, dp); + error = xfblob_storename(rp->pptr_names, &pptr.name_cookie, name); + if (error) + return error; + + return xfarray_append(rp->pptr_recs, &pptr); +} + +/* + * Remember that we want to remove a parent pointer from the tempfile. These + * stashed actions will be replayed later. + */ +STATIC int +xrep_parent_stash_parentremove( + struct xrep_parent *rp, + const struct xfs_name *name, + const struct xfs_inode *dp) +{ + struct xrep_pptr pptr = { + .action = XREP_PPTR_REMOVE, + .namelen = name->len, + }; + int error; + + trace_xrep_parent_stash_parentremove(rp->sc->tempip, dp, name); + + xfs_inode_to_parent_rec(&pptr.pptr_rec, dp); + error = xfblob_storename(rp->pptr_names, &pptr.name_cookie, name); + if (error) + return error; + + return xfarray_append(rp->pptr_recs, &pptr); +} + +/* + * Examine an entry of a directory. If this dirent leads us back to the file + * whose parent pointers we're rebuilding, add a pptr to the temporary + * directory. + */ +STATIC int +xrep_parent_scan_dirent( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xrep_parent *rp = priv; + int error; + + /* Dirent doesn't point to this directory. */ + if (ino != rp->sc->ip->i_ino) + return 0; + + /* No weird looking names. */ + if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) + return -EFSCORRUPTED; + + /* No mismatching ftypes. */ + if (name->type != xfs_mode_to_ftype(VFS_I(sc->ip)->i_mode)) + return -EFSCORRUPTED; + + /* Don't pick up dot or dotdot entries; we only want child dirents. */ + if (xfs_dir2_samename(name, &xfs_name_dotdot) || + xfs_dir2_samename(name, &xfs_name_dot)) + return 0; + + /* + * Transform this dirent into a parent pointer and queue it for later + * addition to the temporary file. + */ + mutex_lock(&rp->pscan.lock); + error = xrep_parent_stash_parentadd(rp, name, dp); + mutex_unlock(&rp->pscan.lock); + return error; +} + +/* + * Decide if we want to look for dirents in this directory. Skip the file + * being repaired and any files being used to stage repairs. + */ +static inline bool +xrep_parent_want_scan( + struct xrep_parent *rp, + const struct xfs_inode *ip) +{ + return ip != rp->sc->ip && !xrep_is_tempfile(ip); +} + +/* + * Take ILOCK on a file that we want to scan. + * + * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt. + * Otherwise, take ILOCK_SHARED. + */ +static inline unsigned int +xrep_parent_scan_ilock( + struct xrep_parent *rp, + struct xfs_inode *ip) +{ + uint lock_mode = XFS_ILOCK_SHARED; + + /* Still need to take the shared ILOCK to advance the iscan cursor. */ + if (!xrep_parent_want_scan(rp, ip)) + goto lock; + + if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) { + lock_mode = XFS_ILOCK_EXCL; + goto lock; + } + +lock: + xfs_ilock(ip, lock_mode); + return lock_mode; +} + +/* + * Scan this file for relevant child dirents that point to the file whose + * parent pointers we're rebuilding. 
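The dirent scanner that follows only cares about entries pointing at the file being repaired, and it must never turn '.' or '..' into a parent pointer. A minimal userspace sketch of that filter, with toy types standing in for the kernel's xfs_name and dirent-walk callback signature:

	#include <string.h>

	struct dirent_toy {
		const char		*name;
		unsigned long long	ino;
	};

	/*
	 * Return 1 if this entry should become a parent pointer record for
	 * target_ino, 0 to skip it.  The kernel callback additionally
	 * validates the name bytes and the ftype before stashing.
	 */
	static int
	want_as_parent_pointer(const struct dirent_toy *de,
			unsigned long long target_ino)
	{
		if (de->ino != target_ino)
			return 0;	/* dirent points at some other file */
		if (!strcmp(de->name, ".") || !strcmp(de->name, ".."))
			return 0;	/* not a real child entry */
		return 1;		/* stash (parent dir, name) */
	}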
+ */ +STATIC int +xrep_parent_scan_file( + struct xrep_parent *rp, + struct xfs_inode *ip) +{ + unsigned int lock_mode; + int error = 0; + + lock_mode = xrep_parent_scan_ilock(rp, ip); + + if (!xrep_parent_want_scan(rp, ip)) + goto scan_done; + + if (S_ISDIR(VFS_I(ip)->i_mode)) { + /* + * If the directory looks as though it has been zapped by the + * inode record repair code, we cannot scan for child dirents. + */ + if (xchk_dir_looks_zapped(ip)) { + error = -EBUSY; + goto scan_done; + } + + error = xchk_dir_walk(rp->sc, ip, xrep_parent_scan_dirent, rp); + if (error) + goto scan_done; + } + +scan_done: + xchk_iscan_mark_visited(&rp->pscan.iscan, ip); + xfs_iunlock(ip, lock_mode); + return error; +} + +/* Decide if we've stashed too much pptr data in memory. */ +static inline bool +xrep_parent_want_flush_stashed( + struct xrep_parent *rp) +{ + unsigned long long bytes; + + bytes = xfarray_bytes(rp->pptr_recs) + xfblob_bytes(rp->pptr_names); + return bytes > XREP_PARENT_MAX_STASH_BYTES; +} + +/* + * Scan all directories in the filesystem to look for dirents that we can turn + * into parent pointers. + */ +STATIC int +xrep_parent_scan_dirtree( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + struct xfs_inode *ip; + int error; + + /* + * Filesystem scans are time consuming. Drop the file ILOCK and all + * other resources for the duration of the scan and hope for the best. + * The live update hooks will keep our scan information up to date. + */ + xchk_trans_cancel(sc); + if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) + xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED | + XFS_ILOCK_EXCL)); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_iter(&rp->pscan.iscan, &ip)) == 1) { + bool flush; + + error = xrep_parent_scan_file(rp, ip); + xchk_irele(sc, ip); + if (error) + break; + + /* Flush stashed pptr updates to constrain memory usage. */ + mutex_lock(&rp->pscan.lock); + flush = xrep_parent_want_flush_stashed(rp); + mutex_unlock(&rp->pscan.lock); + if (flush) { + xchk_trans_cancel(sc); + + error = xrep_tempfile_iolock_polled(sc); + if (error) + break; + + error = xrep_parent_replay_updates(rp); + xrep_tempfile_iounlock(sc); + if (error) + break; + + error = xchk_trans_alloc_empty(sc); + if (error) + break; + } + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&rp->pscan.iscan); + if (error) { + /* + * If we couldn't grab an inode that was busy with a state + * change, change the error code so that we exit to userspace + * as quickly as possible. + */ + if (error == -EBUSY) + return -ECANCELED; + return error; + } + + /* + * Retake sc->ip's ILOCK now that we're done flushing stashed parent + * pointers. We end this function with an empty transaction and the + * ILOCK. + */ + xchk_ilock(rp->sc, XFS_ILOCK_EXCL); + return 0; +} + +/* + * Capture dirent updates being made by other threads which are relevant to the + * file being repaired. + */ +STATIC int +xrep_parent_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xrep_parent *rp; + struct xfs_scrub *sc; + int error; + + rp = container_of(nb, struct xrep_parent, pscan.dhook.dirent_hook.nb); + sc = rp->sc; + + /* + * This thread updated a dirent that points to the file that we're + * repairing, so stash the update for replay against the temporary + * file. 
+ */ + if (p->ip->i_ino == sc->ip->i_ino && + xchk_iscan_want_live_update(&rp->pscan.iscan, p->dp->i_ino)) { + mutex_lock(&rp->pscan.lock); + if (p->delta > 0) + error = xrep_parent_stash_parentadd(rp, p->name, p->dp); + else + error = xrep_parent_stash_parentremove(rp, p->name, + p->dp); + if (!error) + rp->saw_pptr_updates = true; + mutex_unlock(&rp->pscan.lock); + if (error) + goto out_abort; + } + + return NOTIFY_DONE; +out_abort: + xchk_iscan_abort(&rp->pscan.iscan); + return NOTIFY_DONE; +} + +/* Reset a directory's dotdot entry, if needed. */ +STATIC int +xrep_parent_reset_dotdot( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + xfs_ino_t ino; + unsigned int spaceres; + int error = 0; + + ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); + + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &ino); + if (error || ino == rp->pscan.parent_ino) + return error; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + trace_xrep_parent_reset_dotdot(sc->ip, rp->pscan.parent_ino); + + /* + * Reserve more space just in case we have to expand the dir. We're + * allowed to exceed quota to repair inconsistent metadata. + */ + spaceres = xfs_rename_space_res(sc->mp, 0, false, xfs_name_dotdot.len, + false); + error = xfs_trans_reserve_more_inode(sc->tp, sc->ip, spaceres, 0, + true); + if (error) + return error; + + error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot, + rp->pscan.parent_ino, spaceres); + if (error) + return error; + + /* + * Roll transaction to detach the inode from the transaction but retain + * ILOCK_EXCL. + */ + return xfs_trans_roll(&sc->tp); +} + +/* Pass back the parent inumber if this a parent pointer */ +STATIC int +xrep_parent_lookup_pptr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + xfs_ino_t *inop = priv; + xfs_ino_t parent_ino; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, &parent_ino, NULL); + if (error) + return error; + + *inop = parent_ino; + return -ECANCELED; +} + +/* + * Find the first parent of the scrub target by walking parent pointers for + * the purpose of deciding if we're going to move it to the orphanage. + * We don't care if the attr fork is zapped. + */ +STATIC int +xrep_parent_lookup_pptrs( + struct xfs_scrub *sc, + xfs_ino_t *inop) +{ + int error; + + *inop = NULLFSINO; + + error = xchk_xattr_walk(sc, sc->ip, xrep_parent_lookup_pptr, NULL, + inop); + if (error && error != -ECANCELED) + return error; + return 0; +} + +/* + * Move the current file to the orphanage. + * + * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks. Upon + * successful return, the scrub transaction will have enough extra reservation + * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the + * orphanage; and both inodes will be ijoined. + */ +STATIC int +xrep_parent_move_to_orphanage( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + xfs_ino_t orig_parent, new_parent; + int error; + + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) { + /* + * We are about to drop the ILOCK on sc->ip to lock the + * orphanage and prepare for the adoption. Therefore, look up + * the old dotdot entry for sc->ip so that we can compare it + * after we re-lock sc->ip. 
+ */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, + &orig_parent); + if (error) + return error; + } else { + /* + * We haven't dropped the ILOCK since we committed the new + * xattr structure (and hence the new parent pointer records), + * which means that the file cannot have been moved in the + * directory tree, and there are no parents. + */ + orig_parent = NULLFSINO; + } + + /* + * Drop the ILOCK on the scrub target and commit the transaction. + * Adoption computes its own resource requirements and gathers the + * necessary components. + */ + error = xrep_trans_commit(sc); + if (error) + return error; + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + /* If we can take the orphanage's iolock then we're ready to move. */ + if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) { + xchk_iunlock(sc, sc->ilock_flags); + error = xrep_orphanage_iolock_two(sc); + if (error) + return error; + } + + /* Grab transaction and ILOCK the two files. */ + error = xrep_adoption_trans_alloc(sc, &rp->adoption); + if (error) + return error; + + error = xrep_adoption_compute_name(&rp->adoption, &rp->xname); + if (error) + return error; + + /* + * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot + * entry again. If the parent changed or the child was unlinked while + * the child directory was unlocked, we don't need to move the child to + * the orphanage after all. For a non-directory, we have to scan for + * the first parent pointer to see if one has been added. + */ + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, + &new_parent); + else + error = xrep_parent_lookup_pptrs(sc, &new_parent); + if (error) + return error; + + /* + * Attach to the orphanage if we still have a linked directory and it + * hasn't been moved. + */ + if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) { + error = xrep_adoption_move(&rp->adoption); + if (error) + return error; + } + + /* + * Launder the scrub transaction so we can drop the orphanage ILOCK + * and IOLOCK. Return holding the scrub target's ILOCK and IOLOCK. + */ + error = xrep_adoption_trans_roll(&rp->adoption); + if (error) + return error; + + xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); + xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); + return 0; +} + +/* Ensure that the xattr value buffer is large enough. */ +STATIC int +xrep_parent_alloc_xattr_value( + struct xrep_parent *rp, + size_t bufsize) +{ + void *new_val; + + if (rp->xattr_value_sz >= bufsize) + return 0; + + if (rp->xattr_value) { + kvfree(rp->xattr_value); + rp->xattr_value = NULL; + rp->xattr_value_sz = 0; + } + + new_val = kvmalloc(bufsize, XCHK_GFP_FLAGS); + if (!new_val) + return -ENOMEM; + + rp->xattr_value = new_val; + rp->xattr_value_sz = bufsize; + return 0; +} + +/* Retrieve the (remote) value of a non-pptr xattr. */ +STATIC int +xrep_parent_fetch_xattr_remote( + struct xrep_parent *rp, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + unsigned int valuelen) +{ + struct xfs_scrub *sc = rp->sc; + struct xfs_da_args args = { + .attr_filter = attr_flags & XFS_ATTR_NSP_ONDISK_MASK, + .geo = sc->mp->m_attr_geo, + .whichfork = XFS_ATTR_FORK, + .dp = ip, + .name = name, + .namelen = namelen, + .trans = sc->tp, + .valuelen = valuelen, + .owner = ip->i_ino, + }; + int error; + + /* + * If we need a larger value buffer, try to allocate one. If that + * fails, return with -EDEADLOCK to try harder. 
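xrep_parent_alloc_xattr_value() above implements a grow-only scratch buffer: it never shrinks, so repeated salvage steps reuse the high-water allocation instead of churning the allocator. A plain C sketch of the same policy:

	#include <stdlib.h>

	struct scratch {
		void	*buf;
		size_t	sz;
	};

	/* Ensure the buffer holds at least @want bytes; never shrink it. */
	static int
	scratch_reserve(struct scratch *s, size_t want)
	{
		void	*nbuf;

		if (s->sz >= want)
			return 0;	/* high-water mark already covers it */

		nbuf = malloc(want);
		if (!nbuf)
			return -1;	/* caller retries, cf. -EDEADLOCK */
		free(s->buf);		/* old contents are scratch; discard */
		s->buf = nbuf;
		s->sz = want;
		return 0;
	}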
+ */ + error = xrep_parent_alloc_xattr_value(rp, valuelen); + if (error == -ENOMEM) + return -EDEADLOCK; + if (error) + return error; + + args.value = rp->xattr_value; + xfs_attr_sethash(&args); + return xfs_attr_get_ilocked(&args); +} + +/* Stash non-pptr attributes for later replay into the temporary file. */ +STATIC int +xrep_parent_stash_xattr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xrep_parent_xattr key = { + .valuelen = valuelen, + .namelen = namelen, + .flags = attr_flags & XFS_ATTR_NSP_ONDISK_MASK, + }; + struct xrep_parent *rp = priv; + int error; + + if (attr_flags & (XFS_ATTR_INCOMPLETE | XFS_ATTR_PARENT)) + return 0; + + if (!value) { + error = xrep_parent_fetch_xattr_remote(rp, ip, attr_flags, + name, namelen, valuelen); + if (error) + return error; + + value = rp->xattr_value; + } + + trace_xrep_parent_stash_xattr(rp->sc->tempip, key.flags, (void *)name, + key.namelen, key.valuelen); + + error = xfblob_store(rp->xattr_blobs, &key.name_cookie, name, + key.namelen); + if (error) + return error; + + error = xfblob_store(rp->xattr_blobs, &key.value_cookie, value, + key.valuelen); + if (error) + return error; + + return xfarray_append(rp->xattr_records, &key); +} + +/* Insert one xattr key/value. */ +STATIC int +xrep_parent_insert_xattr( + struct xrep_parent *rp, + const struct xrep_parent_xattr *key) +{ + struct xfs_da_args args = { + .dp = rp->sc->tempip, + .attr_filter = key->flags, + .namelen = key->namelen, + .valuelen = key->valuelen, + .owner = rp->sc->ip->i_ino, + .geo = rp->sc->mp->m_attr_geo, + .whichfork = XFS_ATTR_FORK, + .op_flags = XFS_DA_OP_OKNOENT, + }; + int error; + + ASSERT(!(key->flags & XFS_ATTR_PARENT)); + + /* + * Grab pointers to the scrub buffer so that we can use them to insert + * attrs into the temp file. + */ + args.name = rp->xattr_name; + args.value = rp->xattr_value; + + /* + * The attribute name is stored near the end of the in-core buffer, + * though we reserve one more byte to ensure null termination. + */ + rp->xattr_name[XATTR_NAME_MAX] = 0; + + error = xfblob_load(rp->xattr_blobs, key->name_cookie, rp->xattr_name, + key->namelen); + if (error) + return error; + + error = xfblob_free(rp->xattr_blobs, key->name_cookie); + if (error) + return error; + + error = xfblob_load(rp->xattr_blobs, key->value_cookie, args.value, + key->valuelen); + if (error) + return error; + + error = xfblob_free(rp->xattr_blobs, key->value_cookie); + if (error) + return error; + + rp->xattr_name[key->namelen] = 0; + + trace_xrep_parent_insert_xattr(rp->sc->tempip, key->flags, + rp->xattr_name, key->namelen, key->valuelen); + + xfs_attr_sethash(&args); + return xfs_attr_set(&args, XFS_ATTRUPDATE_UPSERT, false); +} + +/* + * Periodically flush salvaged attributes to the temporary file. This is done + * to reduce the memory requirements of the xattr rebuild because files can + * contain millions of attributes. + */ +STATIC int +xrep_parent_flush_xattrs( + struct xrep_parent *rp) +{ + xfarray_idx_t array_cur; + int error; + + /* + * Entering this function, the scrub context has a reference to the + * inode being repaired, the temporary file, and the empty scrub + * transaction that we created for the xattr scan. We hold ILOCK_EXCL + * on the inode being repaired. 
+ * + * To constrain kernel memory use, we occasionally flush salvaged + * xattrs from the xfarray and xfblob structures into the temporary + * file in preparation for exchanging the xattr structures at the end. + * Updating the temporary file requires a transaction, so we commit the + * scrub transaction and drop the ILOCK so that xfs_attr_set can + * allocate whatever transaction it wants. + * + * We still hold IOLOCK_EXCL on the inode being repaired, which + * prevents anyone from adding xattrs (or parent pointers) while we're + * flushing. + */ + xchk_trans_cancel(rp->sc); + xchk_iunlock(rp->sc, XFS_ILOCK_EXCL); + + /* + * Take the IOLOCK of the temporary file while we modify xattrs. This + * isn't strictly required because the temporary file is never revealed + * to userspace, but we follow the same locking rules. We still hold + * sc->ip's IOLOCK. + */ + error = xrep_tempfile_iolock_polled(rp->sc); + if (error) + return error; + + /* Add all the salvaged attrs to the temporary file. */ + foreach_xfarray_idx(rp->xattr_records, array_cur) { + struct xrep_parent_xattr key; + + error = xfarray_load(rp->xattr_records, array_cur, &key); + if (error) + return error; + + error = xrep_parent_insert_xattr(rp, &key); + if (error) + return error; + } + + /* Empty out both arrays now that we've added the entries. */ + xfarray_truncate(rp->xattr_records); + xfblob_truncate(rp->xattr_blobs); + + xrep_tempfile_iounlock(rp->sc); + + /* Recreate the empty transaction and relock the inode. */ + error = xchk_trans_alloc_empty(rp->sc); + if (error) + return error; + xchk_ilock(rp->sc, XFS_ILOCK_EXCL); + return 0; +} + +/* Decide if we've stashed too much xattr data in memory. */ +static inline bool +xrep_parent_want_flush_xattrs( + struct xrep_parent *rp) +{ + unsigned long long bytes; + + bytes = xfarray_bytes(rp->xattr_records) + + xfblob_bytes(rp->xattr_blobs); + return bytes > XREP_PARENT_XATTR_MAX_STASH_BYTES; +} + +/* Flush staged attributes to the temporary file if we're over the limit. */ +STATIC int +xrep_parent_try_flush_xattrs( + struct xfs_scrub *sc, + void *priv) +{ + struct xrep_parent *rp = priv; + int error; + + if (!xrep_parent_want_flush_xattrs(rp)) + return 0; + + error = xrep_parent_flush_xattrs(rp); + if (error) + return error; + + /* + * If there were any parent pointer updates to the xattr structure + * while we dropped the ILOCK, the xattr structure is now stale. + * Signal to the attr copy process that we need to start over, but + * this time without opportunistic attr flushing. + * + * This is unlikely to happen, so we're ok with restarting the copy. + */ + mutex_lock(&rp->pscan.lock); + if (rp->saw_pptr_updates) + error = -ESTALE; + mutex_unlock(&rp->pscan.lock); + return error; +} + +/* Copy all the non-pptr extended attributes into the temporary file. */ +STATIC int +xrep_parent_copy_xattrs( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + int error; + + /* + * Clear the pptr updates flag. We hold sc->ip ILOCKed, so there + * can't be any parent pointer updates in progress. + */ + mutex_lock(&rp->pscan.lock); + rp->saw_pptr_updates = false; + mutex_unlock(&rp->pscan.lock); + + /* Copy xattrs, stopping periodically to flush the incore buffers. */ + error = xchk_xattr_walk(sc, sc->ip, xrep_parent_stash_xattr, + xrep_parent_try_flush_xattrs, rp); + if (error && error != -ESTALE) + return error; + + if (error == -ESTALE) { + /* + * The xattr copy collided with a parent pointer update. 
+ * Restart the copy, but this time hold the ILOCK all the way + * to the end to lock out any directory parent pointer updates. + */ + error = xchk_xattr_walk(sc, sc->ip, xrep_parent_stash_xattr, + NULL, rp); + if (error) + return error; + } + + /* Flush any remaining stashed xattrs to the temporary file. */ + if (xfarray_bytes(rp->xattr_records) == 0) + return 0; + + return xrep_parent_flush_xattrs(rp); +} + +/* + * Ensure that @sc->ip and @sc->tempip both have attribute forks before we head + * into the attr fork exchange transaction. All files on a filesystem with + * parent pointers must have an attr fork because the parent pointer code does + * not itself add attribute forks. + * + * Note: Unlinkable unlinked files don't need one, but the overhead of having + * an unnecessary attr fork is not justified by the additional code complexity + * that would be needed to track that state correctly. + */ +STATIC int +xrep_parent_ensure_attr_fork( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + int error; + + error = xfs_attr_add_fork(sc->tempip, + sizeof(struct xfs_attr_sf_hdr), 1); + if (error) + return error; + return xfs_attr_add_fork(sc->ip, sizeof(struct xfs_attr_sf_hdr), 1); +} + +/* + * Finish replaying stashed parent pointer updates, allocate a transaction for + * exchanging extent mappings, and take the ILOCKs of both files before we + * commit the new attribute structure. + */ +STATIC int +xrep_parent_finalize_tempfile( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + int error; + + /* + * Repair relies on the ILOCK to quiesce all possible xattr updates. + * Replay all queued parent pointer updates into the tempfile before + * exchanging the contents, even if that means dropping the ILOCKs and + * the transaction. + */ + do { + error = xrep_parent_replay_updates(rp); + if (error) + return error; + + error = xrep_parent_ensure_attr_fork(rp); + if (error) + return error; + + error = xrep_tempexch_trans_alloc(sc, XFS_ATTR_FORK, &rp->tx); + if (error) + return error; + + if (xfarray_length(rp->pptr_recs) == 0) + break; + + xchk_trans_cancel(sc); + xrep_tempfile_iunlock_both(sc); + } while (!xchk_should_terminate(sc, &error)); + return error; +} + +/* + * Replay all the stashed parent pointers into the temporary file, copy all + * the non-pptr xattrs from the file being repaired into the temporary file, + * and exchange the attr fork contents atomically. + */ +STATIC int +xrep_parent_rebuild_pptrs( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + xfs_ino_t parent_ino = NULLFSINO; + int error; + + /* + * Copy non-ppttr xattrs from the file being repaired into the + * temporary file's xattr structure. We hold sc->ip's IOLOCK, which + * prevents setxattr/removexattr calls from occurring, but renames + * update the parent pointers without holding IOLOCK. If we detect + * stale attr structures, we restart the scan but only flush at the + * end. + */ + error = xrep_parent_copy_xattrs(rp); + if (error) + return error; + + /* + * Cancel the empty transaction that we used to walk and copy attrs, + * and drop the ILOCK so that we can take the IOLOCK on the temporary + * file. We still hold sc->ip's IOLOCK. + */ + xchk_trans_cancel(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + error = xrep_tempfile_iolock_polled(sc); + if (error) + return error; + + /* + * Allocate transaction, lock inodes, and make sure that we've replayed + * all the stashed pptr updates to the tempdir. After this point, + * we're ready to exchange the attr fork mappings. 
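xrep_parent_finalize_tempfile() below is an instance of a drain-then-commit loop: replay the queue while unlocked, take the locks, and if a racing update added more work in the window, drop the locks and go around again. Reduced to its bare control flow, with hypothetical stand-in helpers:

	struct pending_queue {
		int	nr;	/* number of queued pptr updates */
	};

	/* Hypothetical stand-ins for the lock and replay helpers. */
	static void lock_both_files(void) { }
	static void unlock_both_files(void) { }

	static int
	replay_pending(struct pending_queue *q)
	{
		q->nr = 0;	/* pretend we replayed everything */
		return 0;
	}

	/* Loop until the queue is observed empty while both locks are held. */
	static int
	drain_then_commit(struct pending_queue *q)
	{
		for (;;) {
			int	error = replay_pending(q);

			if (error)
				return error;

			lock_both_files();
			if (q->nr == 0)
				return 0;	/* locked and drained */
			unlock_both_files();
		}
	}

The exit condition matters: only an empty queue observed under the locks guarantees that no stashed update can be lost by the mapping exchange that follows.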
+ */ + error = xrep_parent_finalize_tempfile(rp); + if (error) + return error; + + /* Last chance to abort before we start committing pptr fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + if (xchk_iscan_aborted(&rp->pscan.iscan)) + return -ECANCELED; + + /* + * Exchange the attr fork contents and junk the old attr fork contents, + * which are now in the tempfile. + */ + error = xrep_xattr_swap(sc, &rp->tx); + if (error) + return error; + error = xrep_xattr_reset_tempfile_fork(sc); + if (error) + return error; + + /* + * Roll to get a transaction without any inodes joined to it. Then we + * can drop the tempfile's ILOCK and IOLOCK before doing more work on + * the scrub target file. + */ + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + xrep_tempfile_iunlock(sc); + xrep_tempfile_iounlock(sc); + + /* + * We've committed the new parent pointers. Find at least one parent + * so that we can decide if we're moving this file to the orphanage. + * For this purpose, root directories are their own parents. + */ + if (sc->ip == sc->mp->m_rootip) { + xrep_findparent_scan_found(&rp->pscan, sc->ip->i_ino); + } else { + error = xrep_parent_lookup_pptrs(sc, &parent_ino); + if (error) + return error; + if (parent_ino != NULLFSINO) + xrep_findparent_scan_found(&rp->pscan, parent_ino); + } + return 0; +} + +/* + * Commit the new parent pointer structure (currently only the dotdot entry) to + * the file that we're repairing. + */ +STATIC int +xrep_parent_rebuild_tree( + struct xrep_parent *rp) +{ + int error; + + if (xfs_has_parent(rp->sc->mp)) { + error = xrep_parent_rebuild_pptrs(rp); + if (error) + return error; + } + + if (rp->pscan.parent_ino == NULLFSINO) { + if (xrep_orphanage_can_adopt(rp->sc)) + return xrep_parent_move_to_orphanage(rp); + return -EFSCORRUPTED; + } + + if (S_ISDIR(VFS_I(rp->sc->ip)->i_mode)) + return xrep_parent_reset_dotdot(rp); + + return 0; +} + +/* Count the number of parent pointers. */ +STATIC int +xrep_parent_count_pptr( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int attr_flags, + const unsigned char *name, + unsigned int namelen, + const void *value, + unsigned int valuelen, + void *priv) +{ + struct xrep_parent *rp = priv; + int error; + + if (!(attr_flags & XFS_ATTR_PARENT)) + return 0; + + error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, + valuelen, NULL, NULL); + if (error) + return error; + + rp->parents++; + return 0; +} + +/* + * After all parent pointer rebuilding and adoption activity completes, reset + * the link count of this nondirectory, having scanned the fs to rebuild all + * parent pointers. + */ +STATIC int +xrep_parent_set_nondir_nlink( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + struct xfs_inode *ip = sc->ip; + struct xfs_perag *pag; + bool joined = false; + int error; + + /* Count parent pointers so we can reset the file link count. */ + rp->parents = 0; + error = xchk_xattr_walk(sc, ip, xrep_parent_count_pptr, NULL, rp); + if (error) + return error; + + if (rp->parents > 0 && xfs_inode_on_unlinked_list(ip)) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + + /* + * The file is on the unlinked list but we found parents. + * Remove the file from the unlinked list. 
+ */ + pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, ip->i_ino)); + if (!pag) { + ASSERT(0); + return -EFSCORRUPTED; + } + + error = xfs_iunlink_remove(sc->tp, pag, ip); + xfs_perag_put(pag); + if (error) + return error; + } else if (rp->parents == 0 && !xfs_inode_on_unlinked_list(ip)) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + + /* + * The file is not on the unlinked list but we found no + * parents. Add the file to the unlinked list. + */ + error = xfs_iunlink(sc->tp, ip); + if (error) + return error; + } + + /* Set the correct link count. */ + if (VFS_I(ip)->i_nlink != rp->parents) { + if (!joined) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + } + + set_nlink(VFS_I(ip), min_t(unsigned long long, rp->parents, + XFS_NLINK_PINNED)); + } + + /* Log the inode to keep it moving forward if we dirtied anything. */ + if (joined) + xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE); + return 0; +} + +/* Set up the filesystem scan so we can look for parents. */ +STATIC int +xrep_parent_setup_scan( + struct xrep_parent *rp) +{ + struct xfs_scrub *sc = rp->sc; + char *descr; + struct xfs_da_geometry *geo = sc->mp->m_attr_geo; + int max_len; + int error; + + if (!xfs_has_parent(sc->mp)) + return xrep_findparent_scan_start(sc, &rp->pscan); + + /* Buffers for copying non-pptr attrs to the tempfile */ + rp->xattr_name = kvmalloc(XATTR_NAME_MAX + 1, XCHK_GFP_FLAGS); + if (!rp->xattr_name) + return -ENOMEM; + + /* + * Allocate enough memory to handle loading local attr values from the + * xfblob data while flushing stashed attrs to the temporary file. + * We only realloc the buffer when salvaging remote attr values, so + * TRY_HARDER means we allocate the maximal attr value size. + */ + if (sc->flags & XCHK_TRY_HARDER) + max_len = XATTR_SIZE_MAX; + else + max_len = xfs_attr_leaf_entsize_local_max(geo->blksize); + error = xrep_parent_alloc_xattr_value(rp, max_len); + if (error) + goto out_xattr_name; + + /* Set up some staging memory for logging parent pointer updates. 
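xrep_parent_set_nondir_nlink() above derives the link count from the number of parent pointers, clamping at XFS_NLINK_PINNED rather than letting the count wrap. A small worked example; the pinned value is assumed here to be ~0U, the largest storable 32-bit link count:

	#include <stdio.h>

	#define NLINK_PINNED	(~0U)	/* assumed value of XFS_NLINK_PINNED */

	/* Clamp a parent pointer count to a storable link count. */
	static unsigned int
	nlink_from_parents(unsigned long long parents)
	{
		return parents > NLINK_PINNED ?
				NLINK_PINNED : (unsigned int)parents;
	}

	int
	main(void)
	{
		printf("%u\n", nlink_from_parents(3));		/* 3 */
		printf("%u\n", nlink_from_parents(1ULL << 40));	/* pinned */
		return 0;
	}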
*/ + descr = xchk_xfile_ino_descr(sc, "parent pointer entries"); + error = xfarray_create(descr, 0, sizeof(struct xrep_pptr), + &rp->pptr_recs); + kfree(descr); + if (error) + goto out_xattr_value; + + descr = xchk_xfile_ino_descr(sc, "parent pointer names"); + error = xfblob_create(descr, &rp->pptr_names); + kfree(descr); + if (error) + goto out_recs; + + /* Set up some storage for copying attrs before the mapping exchange */ + descr = xchk_xfile_ino_descr(sc, + "parent pointer retained xattr entries"); + error = xfarray_create(descr, 0, sizeof(struct xrep_parent_xattr), + &rp->xattr_records); + kfree(descr); + if (error) + goto out_names; + + descr = xchk_xfile_ino_descr(sc, + "parent pointer retained xattr values"); + error = xfblob_create(descr, &rp->xattr_blobs); + kfree(descr); + if (error) + goto out_attr_keys; + + error = __xrep_findparent_scan_start(sc, &rp->pscan, + xrep_parent_live_update); + if (error) + goto out_attr_values; + + return 0; + +out_attr_values: + xfblob_destroy(rp->xattr_blobs); + rp->xattr_blobs = NULL; +out_attr_keys: + xfarray_destroy(rp->xattr_records); + rp->xattr_records = NULL; +out_names: + xfblob_destroy(rp->pptr_names); + rp->pptr_names = NULL; +out_recs: + xfarray_destroy(rp->pptr_recs); + rp->pptr_recs = NULL; +out_xattr_value: + kvfree(rp->xattr_value); + rp->xattr_value = NULL; +out_xattr_name: + kvfree(rp->xattr_name); + rp->xattr_name = NULL; + return error; +} + +int +xrep_parent( + struct xfs_scrub *sc) +{ + struct xrep_parent *rp = sc->buf; + int error; + + /* + * When the parent pointers feature is enabled, repairs are committed + * by atomically committing a new xattr structure and reaping the old + * attr fork. Reaping requires rmap and exchange-range to be enabled. + */ + if (xfs_has_parent(sc->mp)) { + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + if (!xfs_has_exchange_range(sc->mp)) + return -EOPNOTSUPP; + } + + error = xrep_parent_setup_scan(rp); + if (error) + return error; + + if (xfs_has_parent(sc->mp)) + error = xrep_parent_scan_dirtree(rp); + else + error = xrep_parent_find_dotdot(rp); + if (error) + goto out_teardown; + + /* Last chance to abort before we start committing dotdot fixes. */ + if (xchk_should_terminate(sc, &error)) + goto out_teardown; + + error = xrep_parent_rebuild_tree(rp); + if (error) + goto out_teardown; + if (xfs_has_parent(sc->mp) && !S_ISDIR(VFS_I(sc->ip)->i_mode)) { + error = xrep_parent_set_nondir_nlink(rp); + if (error) + goto out_teardown; + } + + error = xrep_defer_finish(sc); + +out_teardown: + xrep_parent_teardown(rp); + return error; +} diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c index 0bab4c30cb85..90cd1512bba9 100644 --- a/fs/xfs/scrub/quota_repair.c +++ b/fs/xfs/scrub/quota_repair.c @@ -77,8 +77,6 @@ xrep_quota_item_fill_bmap_hole( irec, &nmaps); if (error) return error; - if (nmaps != 1) - return -ENOSPC; dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec->br_startblock); @@ -444,10 +442,6 @@ xrep_quota_data_fork( XFS_BMAPI_CONVERT, 0, &nrec, &nmap); if (error) goto out; - if (nmap != 1) { - error = -ENOSPC; - goto out; - } ASSERT(nrec.br_startoff == irec.br_startoff); ASSERT(nrec.br_blockcount == irec.br_blockcount); diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c index dfdcb96b6c16..01c9a2dc0f2c 100644 --- a/fs/xfs/scrub/readdir.c +++ b/fs/xfs/scrub/readdir.c @@ -18,6 +18,7 @@ #include "xfs_trans.h" #include "xfs_error.h" #include "scrub/scrub.h" +#include "scrub/common.h" #include "scrub/readdir.h" /* Call a function for every entry in a shortform directory. 
*/ @@ -99,7 +100,7 @@ xchk_dir_walk_block( unsigned int off, next_off, end; int error; - error = xfs_dir3_block_read(sc->tp, dp, &bp); + error = xfs_dir3_block_read(sc->tp, dp, dp->i_ino, &bp); if (error) return error; @@ -175,7 +176,7 @@ xchk_read_leaf_dir_buf( if (new_off > *curoff) *curoff = new_off; - return xfs_dir3_data_read(tp, dp, map.br_startoff, 0, bpp); + return xfs_dir3_data_read(tp, dp, dp->i_ino, map.br_startoff, 0, bpp); } /* Call a function for every entry in a leaf directory. */ @@ -273,8 +274,8 @@ xchk_dir_walk( .dp = dp, .geo = dp->i_mount->m_dir_geo, .trans = sc->tp, + .owner = dp->i_ino, }; - bool isblock; int error; if (xfs_is_shutdown(dp->i_mount)) @@ -283,22 +284,17 @@ xchk_dir_walk( ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) + switch (xfs_dir2_format(&args, &error)) { + case XFS_DIR2_FMT_SF: return xchk_dir_walk_sf(sc, dp, dirent_fn, priv); - - /* dir2 functions require that the data fork is loaded */ - error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK); - if (error) - return error; - - error = xfs_dir2_isblock(&args, &isblock); - if (error) - return error; - - if (isblock) + case XFS_DIR2_FMT_BLOCK: return xchk_dir_walk_block(sc, dp, dirent_fn, priv); - - return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv); + case XFS_DIR2_FMT_LEAF: + case XFS_DIR2_FMT_NODE: + return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv); + default: + return error; + } } /* @@ -324,50 +320,102 @@ xchk_dir_lookup( .hashval = xfs_dir2_hashname(dp->i_mount, name), .whichfork = XFS_DATA_FORK, .op_flags = XFS_DA_OP_OKNOENT, + .owner = dp->i_ino, }; - bool isblock, isleaf; int error; if (xfs_is_shutdown(dp->i_mount)) return -EIO; + /* + * A temporary directory's block headers are written with the owner + * set to sc->ip, so we must switch the owner here for the lookup. + */ + if (dp == sc->tempip) + args.owner = sc->ip->i_ino; + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); - if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - error = xfs_dir2_sf_lookup(&args); - goto out_check_rval; - } + error = xfs_dir_lookup_args(&args); + if (!error) + *ino = args.inumber; + return error; +} - /* dir2 functions require that the data fork is loaded */ - error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK); - if (error) - return error; +/* + * Try to grab the IOLOCK and ILOCK of sc->ip and ip, returning @ip's lock + * state. The caller may have a transaction, so we must use trylock for both + * IOLOCKs. 
+ */ +static inline unsigned int +xchk_dir_trylock_both( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + if (!xchk_ilock_nowait(sc, XFS_IOLOCK_EXCL)) + return 0; - error = xfs_dir2_isblock(&args, &isblock); - if (error) - return error; + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) + goto parent_iolock; - if (isblock) { - error = xfs_dir2_block_lookup(&args); - goto out_check_rval; - } + xchk_ilock(sc, XFS_ILOCK_EXCL); + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + goto parent_ilock; - error = xfs_dir2_isleaf(&args, &isleaf); - if (error) - return error; + return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL; + +parent_ilock: + xchk_iunlock(sc, XFS_ILOCK_EXCL); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); +parent_iolock: + xchk_iunlock(sc, XFS_IOLOCK_EXCL); + return 0; +} + +/* + * Try for a limited time to grab the IOLOCK and ILOCK of both the scrub target + * (@sc->ip) and the inode at the other end (@ip) of a directory or parent + * pointer link so that we can check that link. + * + * We do not know ahead of time that the directory tree is /not/ corrupt, so we + * cannot use the "lock two inode" functions because we do not know that there + * is not a racing thread trying to take the locks in opposite order. First + * take IOLOCK_EXCL of the scrub target, and then try to take IOLOCK_SHARED + * of @ip to synchronize with the VFS. Next, take ILOCK_EXCL of the scrub + * target and @ip to synchronize with XFS. + * + * If the trylocks succeed, *lockmode will be set to the locks held for @ip; + * @sc->ilock_flags will be set for the locks held for @sc->ip; and zero will + * be returned. If not, returns -EDEADLOCK to try again; or -ETIMEDOUT if + * XCHK_TRY_HARDER was set. Returns -EINTR if the process has been killed. + */ +int +xchk_dir_trylock_for_pptrs( + struct xfs_scrub *sc, + struct xfs_inode *ip, + unsigned int *lockmode) +{ + unsigned int nr; + int error = 0; + + ASSERT(sc->ilock_flags == 0); + + for (nr = 0; nr < HZ; nr++) { + *lockmode = xchk_dir_trylock_both(sc, ip); + if (*lockmode) + return 0; - if (isleaf) { - error = xfs_dir2_leaf_lookup(&args); - goto out_check_rval; + if (xchk_should_terminate(sc, &error)) + return error; + + delay(1); } - error = xfs_dir2_node_lookup(&args); + if (sc->flags & XCHK_TRY_HARDER) { + xchk_set_incomplete(sc); + return -ETIMEDOUT; + } -out_check_rval: - if (error == -EEXIST) - error = 0; - if (!error) - *ino = args.inumber; - return error; + return -EDEADLOCK; } diff --git a/fs/xfs/scrub/readdir.h b/fs/xfs/scrub/readdir.h index 55787f4df123..da501877a64d 100644 --- a/fs/xfs/scrub/readdir.h +++ b/fs/xfs/scrub/readdir.h @@ -16,4 +16,7 @@ int xchk_dir_walk(struct xfs_scrub *sc, struct xfs_inode *dp, int xchk_dir_lookup(struct xfs_scrub *sc, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t *ino); +int xchk_dir_trylock_for_pptrs(struct xfs_scrub *sc, struct xfs_inode *ip, + unsigned int *lockmode); + #endif /* __XFS_SCRUB_READDIR_H__ */ diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 0252a3b5b65a..be283153c254 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -211,6 +211,48 @@ static inline void xreap_defer_finish_reset(struct xreap_state *rs) rs->force_roll = false; } +/* + * Compute the maximum length of a buffer cache scan (in units of sectors), + * given a quantity of fs blocks. + */ +xfs_daddr_t +xrep_bufscan_max_sectors( + struct xfs_mount *mp, + xfs_extlen_t fsblocks) +{ + int max_fsbs; + + /* Remote xattr values are the largest buffers that we support. 
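xchk_dir_trylock_for_pptrs() above avoids ABBA deadlocks by trylocking both inodes and backing off on failure, retrying for a bounded time. The same discipline in portable userspace C, using pthread mutexes in place of the inode locks:

	#include <errno.h>
	#include <pthread.h>
	#include <unistd.h>

	/*
	 * Try for a bounded time to hold both locks.  Returns 0 with both
	 * held, or EDEADLK so the caller can drop everything and retry
	 * (cf. -EDEADLOCK / -ETIMEDOUT in the patch).
	 */
	static int
	trylock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
	{
		for (int tries = 0; tries < 100; tries++) {
			if (pthread_mutex_trylock(a))
				goto backoff;
			if (!pthread_mutex_trylock(b))
				return 0;	/* got both locks */
			pthread_mutex_unlock(a);
	backoff:
			usleep(1000);		/* cf. delay(1) in the patch */
		}
		return EDEADLK;
	}

Because neither lock is ever waited on while the other is held, a racing thread that takes the locks in the opposite order cannot deadlock against this loop; the worst case is a bounded spell of retries.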
*/ + max_fsbs = xfs_attr3_max_rmt_blocks(mp); + + return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs)); +} + +/* + * Return an incore buffer from a sector scan, or NULL if there are no buffers + * left to return. + */ +struct xfs_buf * +xrep_bufscan_advance( + struct xfs_mount *mp, + struct xrep_bufscan *scan) +{ + scan->__sector_count += scan->daddr_step; + while (scan->__sector_count <= scan->max_sectors) { + struct xfs_buf *bp = NULL; + int error; + + error = xfs_buf_incore(mp->m_ddev_targp, scan->daddr, + scan->__sector_count, XBF_LIVESCAN, &bp); + if (!error) + return bp; + + scan->__sector_count += scan->daddr_step; + } + + return NULL; +} + /* Try to invalidate the incore buffers for an extent that we're freeing. */ STATIC void xreap_agextent_binval( @@ -241,28 +283,15 @@ xreap_agextent_binval( * of any plausible size. */ while (bno < agbno_next) { - xfs_agblock_t fsbcount; - xfs_agblock_t max_fsbs; - - /* - * Max buffer size is the max remote xattr buffer size, which - * is one fs block larger than 64k. - */ - max_fsbs = min_t(xfs_agblock_t, agbno_next - bno, - xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX)); - - for (fsbcount = 1; fsbcount <= max_fsbs; fsbcount++) { - struct xfs_buf *bp = NULL; - xfs_daddr_t daddr; - int error; - - daddr = XFS_AGB_TO_DADDR(mp, agno, bno); - error = xfs_buf_incore(mp->m_ddev_targp, daddr, - XFS_FSB_TO_BB(mp, fsbcount), - XBF_LIVESCAN, &bp); - if (error) - continue; - + struct xrep_bufscan scan = { + .daddr = XFS_AGB_TO_DADDR(mp, agno, bno), + .max_sectors = xrep_bufscan_max_sectors(mp, + agbno_next - bno), + .daddr_step = XFS_FSB_TO_BB(mp, 1), + }; + struct xfs_buf *bp; + + while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) { xfs_trans_bjoin(sc->tp, bp); xfs_trans_binval(sc->tp, bp); rs->invalidated++; @@ -646,3 +675,375 @@ xrep_reap_fsblocks( return 0; } + +/* + * Metadata files are not supposed to share blocks with anything else. + * If blocks are shared, we remove the reverse mapping (thus reducing the + * crosslink factor); if blocks are not shared, we also need to free them. + * + * This first step determines the longest subset of the passed-in imap + * (starting at its beginning) that is either crosslinked or not crosslinked. + * The blockcount will be adjust down as needed. + */ +STATIC int +xreap_bmapi_select( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *imap, + bool *crosslinked) +{ + struct xfs_owner_info oinfo; + struct xfs_btree_cur *cur; + xfs_filblks_t len = 1; + xfs_agblock_t bno; + xfs_agblock_t agbno; + xfs_agblock_t agbno_next; + int error; + + agbno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock); + agbno_next = agbno + imap->br_blockcount; + + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + + xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff); + error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked); + if (error) + goto out_cur; + + bno = agbno + 1; + while (bno < agbno_next) { + bool also_crosslinked; + + oinfo.oi_offset++; + error = xfs_rmap_has_other_keys(cur, bno, 1, &oinfo, + &also_crosslinked); + if (error) + goto out_cur; + + if (also_crosslinked != *crosslinked) + break; + + len++; + bno++; + } + + imap->br_blockcount = len; + trace_xreap_bmapi_select(sc->sa.pag, agbno, len, *crosslinked); +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Decide if this buffer can be joined to a transaction. 
This is true for most + * buffers, but there are two cases that we want to catch: large remote xattr + * value buffers are not logged and can overflow the buffer log item dirty + * bitmap size; and oversized cached buffers if things have really gone + * haywire. + */ +static inline bool +xreap_buf_loggable( + const struct xfs_buf *bp) +{ + int i; + + for (i = 0; i < bp->b_map_count; i++) { + int chunks; + int map_size; + + chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), + XFS_BLF_CHUNK); + map_size = DIV_ROUND_UP(chunks, NBWORD); + if (map_size > XFS_BLF_DATAMAP_SIZE) + return false; + } + + return true; +} + +/* + * Invalidate any buffers for this file mapping. The @imap blockcount may be + * adjusted downward if we need to roll the transaction. + */ +STATIC int +xreap_bmapi_binval( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *imap) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag = sc->sa.pag; + int bmap_flags = xfs_bmapi_aflag(whichfork); + xfs_fileoff_t off; + xfs_fileoff_t max_off; + xfs_extlen_t scan_blocks; + xfs_agnumber_t agno = sc->sa.pag->pag_agno; + xfs_agblock_t bno; + xfs_agblock_t agbno; + xfs_agblock_t agbno_next; + unsigned int invalidated = 0; + int error; + + /* + * Avoid invalidating AG headers and post-EOFS blocks because we never + * own those. + */ + agbno = bno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock); + agbno_next = agbno + imap->br_blockcount; + if (!xfs_verify_agbno(pag, agbno) || + !xfs_verify_agbno(pag, agbno_next - 1)) + return 0; + + /* + * Buffers for file blocks can span multiple contiguous mappings. This + * means that for each block in the mapping, there could exist an + * xfs_buf indexed by that block with any length up to the maximum + * buffer size (remote xattr values) or to the next hole in the fork. + * To set up our binval scan, first we need to figure out the location + * of the next hole. + */ + off = imap->br_startoff + imap->br_blockcount; + max_off = off + xfs_attr3_max_rmt_blocks(mp); + while (off < max_off) { + struct xfs_bmbt_irec hmap; + int nhmaps = 1; + + error = xfs_bmapi_read(ip, off, max_off - off, &hmap, + &nhmaps, bmap_flags); + if (error) + return error; + if (nhmaps != 1 || hmap.br_startblock == DELAYSTARTBLOCK) { + ASSERT(0); + return -EFSCORRUPTED; + } + + if (!xfs_bmap_is_real_extent(&hmap)) + break; + + off = hmap.br_startoff + hmap.br_blockcount; + } + scan_blocks = off - imap->br_startoff; + + trace_xreap_bmapi_binval_scan(sc, imap, scan_blocks); + + /* + * If there are incore buffers for these blocks, invalidate them. If + * we can't (try)lock the buffer we assume it's owned by someone else + * and leave it alone. The buffer cache cannot detect aliasing, so + * employ nested loops to detect incore buffers of any plausible size. + */ + while (bno < agbno_next) { + struct xrep_bufscan scan = { + .daddr = XFS_AGB_TO_DADDR(mp, agno, bno), + .max_sectors = xrep_bufscan_max_sectors(mp, + scan_blocks), + .daddr_step = XFS_FSB_TO_BB(mp, 1), + }; + struct xfs_buf *bp; + + while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) { + if (xreap_buf_loggable(bp)) { + xfs_trans_bjoin(sc->tp, bp); + xfs_trans_binval(sc->tp, bp); + } else { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + } + invalidated++; + + /* + * Stop invalidating if we've hit the limit; we should + * still have enough reservation left to free however + * much of the mapping we've seen so far. 
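The xreap_buf_loggable() test above is pure arithmetic: a buffer is loggable only if its dirty bitmap fits in the buf log item. A worked example under assumed values of the constants (XFS_BLF_CHUNK = 128 bytes, NBWORD = 32 bits, so a 16-word map covers 64k):

	#include <stdio.h>

	#define BLF_CHUNK	128
	#define NBWORD		32
	#define DATAMAP_SIZE	((65536 / BLF_CHUNK) / NBWORD)	/* 16 words */
	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

	int
	main(void)
	{
		/* A 68k remote xattr buffer (64k value plus one 4k block): */
		int	bytes = 68 * 1024;
		int	chunks = DIV_ROUND_UP(bytes, BLF_CHUNK);	/* 544 */
		int	map_size = DIV_ROUND_UP(chunks, NBWORD);	/* 17 */

		printf("%s\n", map_size > DATAMAP_SIZE ?
				"not loggable: stale and release" :
				"loggable: join and binval");
		return 0;
	}

A 64k buffer lands exactly on the 16-word boundary and can be logged; anything larger, such as the oversized remote xattr buffer in the example, must be staled and released instead.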
+ */ + if (invalidated > XREAP_MAX_BINVAL) { + imap->br_blockcount = agbno_next - bno; + goto out; + } + } + + bno++; + scan_blocks--; + } + +out: + trace_xreap_bmapi_binval(sc->sa.pag, agbno, imap->br_blockcount); + return 0; +} + +/* + * Dispose of as much of the beginning of this file fork mapping as possible. + * The number of blocks disposed of is returned in @imap->br_blockcount. + */ +STATIC int +xrep_reap_bmapi_iter( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *imap, + bool crosslinked) +{ + int error; + + if (crosslinked) { + /* + * If there are other rmappings, this block is cross linked and + * must not be freed. Remove the reverse mapping, leave the + * buffer cache in its possibly confused state, and move on. + * We don't want to risk discarding valid data buffers from + * anybody else who thinks they own the block, even though that + * runs the risk of stale buffer warnings in the future. + */ + trace_xreap_dispose_unmap_extent(sc->sa.pag, + XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock), + imap->br_blockcount); + + /* + * Schedule removal of the mapping from the fork. We use + * deferred log intents in this function to control the exact + * sequence of metadata updates. + */ + xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + -(int64_t)imap->br_blockcount); + xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap); + return 0; + } + + /* + * If the block is not crosslinked, we can invalidate all the incore + * buffers for the extent, and then free the extent. This is a bit of + * a mess since we don't detect discontiguous buffers that are indexed + * by a block starting before the first block of the extent but overlap + * anyway. + */ + trace_xreap_dispose_free_extent(sc->sa.pag, + XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock), + imap->br_blockcount); + + /* + * Invalidate as many buffers as we can, starting at the beginning of + * this mapping. If this function sets blockcount to zero, the + * transaction is full of logged buffer invalidations, so we need to + * return early so that we can roll and retry. + */ + error = xreap_bmapi_binval(sc, ip, whichfork, imap); + if (error || imap->br_blockcount == 0) + return error; + + /* + * Schedule removal of the mapping from the fork. We use deferred log + * intents in this function to control the exact sequence of metadata + * updates. + */ + xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + -(int64_t)imap->br_blockcount); + return xfs_free_extent_later(sc->tp, imap->br_startblock, + imap->br_blockcount, NULL, XFS_AG_RESV_NONE, true); +} + +/* + * Dispose of as much of this file extent as we can. Upon successful return, + * the imap will reflect the mapping that was removed from the fork. + */ +STATIC int +xreap_ifork_extent( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *imap) +{ + xfs_agnumber_t agno; + bool crosslinked; + int error; + + ASSERT(sc->sa.pag == NULL); + + trace_xreap_ifork_extent(sc, ip, whichfork, imap); + + agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock); + sc->sa.pag = xfs_perag_get(sc->mp, agno); + if (!sc->sa.pag) + return -EFSCORRUPTED; + + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp); + if (error) + goto out_pag; + + /* + * Decide the fate of the blocks at the beginning of the mapping, then + * update the mapping to use it with the unmap calls. 
+ */ + error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked); + if (error) + goto out_agf; + + error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked); + if (error) + goto out_agf; + +out_agf: + xfs_trans_brelse(sc->tp, sc->sa.agf_bp); + sc->sa.agf_bp = NULL; +out_pag: + xfs_perag_put(sc->sa.pag); + sc->sa.pag = NULL; + return error; +} + +/* + * Dispose of each block mapped to the given fork of the given file. Callers + * must hold ILOCK_EXCL, and ip can only be sc->ip or sc->tempip. The fork + * must not have any delalloc reservations. + */ +int +xrep_reap_ifork( + struct xfs_scrub *sc, + struct xfs_inode *ip, + int whichfork) +{ + xfs_fileoff_t off = 0; + int bmap_flags = xfs_bmapi_aflag(whichfork); + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(ip == sc->ip || ip == sc->tempip); + ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip)); + + while (off < XFS_MAX_FILEOFF) { + struct xfs_bmbt_irec imap; + int nimaps = 1; + + /* Read the next extent, skip past holes and delalloc. */ + error = xfs_bmapi_read(ip, off, XFS_MAX_FILEOFF - off, &imap, + &nimaps, bmap_flags); + if (error) + return error; + if (nimaps != 1 || imap.br_startblock == DELAYSTARTBLOCK) { + ASSERT(0); + return -EFSCORRUPTED; + } + + /* + * If this is a real space mapping, reap as much of it as we + * can in a single transaction. + */ + if (xfs_bmap_is_real_extent(&imap)) { + error = xreap_ifork_extent(sc, ip, whichfork, &imap); + if (error) + return error; + + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + } + + off = imap.br_startoff + imap.br_blockcount; + } + + return 0; +} diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h index 0b69f16dd98f..3f2f1775e29d 100644 --- a/fs/xfs/scrub/reap.h +++ b/fs/xfs/scrub/reap.h @@ -13,5 +13,26 @@ int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap, const struct xfs_owner_info *oinfo); +int xrep_reap_ifork(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork); + +/* Buffer cache scan context. */ +struct xrep_bufscan { + /* Disk address for the buffers we want to scan. */ + xfs_daddr_t daddr; + + /* Maximum number of sectors to scan. */ + xfs_daddr_t max_sectors; + + /* Each round, increment the search length by this number of sectors. */ + xfs_daddr_t daddr_step; + + /* Internal scan state; initialize to zero. */ + xfs_daddr_t __sector_count; +}; + +xfs_daddr_t xrep_bufscan_max_sectors(struct xfs_mount *mp, + xfs_extlen_t fsblocks); +struct xfs_buf *xrep_bufscan_advance(struct xfs_mount *mp, + struct xrep_bufscan *scan); #endif /* __XFS_SCRUB_REAP_H__ */ diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index f43dce771cdd..67478294f11a 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -32,6 +32,10 @@ #include "xfs_reflink.h" #include "xfs_health.h" #include "xfs_buf_mem.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_dir2.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -39,6 +43,7 @@ #include "scrub/bitmap.h" #include "scrub/stats.h" #include "scrub/xfile.h" +#include "scrub/attr_repair.h" /* * Attempt to repair some metadata, if the metadata is corrupt and userspace @@ -290,7 +295,7 @@ xrep_calc_ag_resblks( icount = pag->pagi_count; } else { /* Try to get the actual counters from disk. 
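For context on how the repair functions in this file get invoked: userspace asks for a repair through the scrub ioctl with the repair flag set. A hedged sketch follows; it assumes the xfsprogs uapi header location and that a zero sm_ino targets the open file itself (xfs_scrub normally fills sm_ino/sm_gen from bulkstat).

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <xfs/xfs_fs.h>

	int
	main(int argc, char *argv[])
	{
		struct xfs_scrub_metadata	sm;
		int				fd;

		if (argc != 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0)
			return 1;

		memset(&sm, 0, sizeof(sm));
		sm.sm_type = XFS_SCRUB_TYPE_PARENT;
		sm.sm_flags = XFS_SCRUB_IFLAG_REPAIR;

		if (ioctl(fd, XFS_IOC_SCRUB_METADATA, &sm))
			perror("scrub");
		else if (sm.sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
			printf("still corrupt\n");
		return 0;
	}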
*/ - error = xfs_ialloc_read_agi(pag, NULL, &bp); + error = xfs_ialloc_read_agi(pag, NULL, 0, &bp); if (!error) { icount = pag->pagi_count; xfs_buf_relse(bp); @@ -724,7 +729,7 @@ xrep_update_qflags( xfs_trans_log_buf(sc->tp, bp, 0, sizeof(struct xfs_dsb) - 1); no_update: - mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); + mutex_unlock(&mp->m_quotainfo->qi_quotaofflock); } /* Force a quotacheck the next time we mount. */ @@ -908,7 +913,7 @@ xrep_reinit_pagi( ASSERT(xfs_perag_initialised_agi(pag)); clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); - error = xfs_ialloc_read_agi(pag, sc->tp, &bp); + error = xfs_ialloc_read_agi(pag, sc->tp, 0, &bp); if (error) return error; @@ -934,7 +939,7 @@ xrep_ag_init( ASSERT(!sa->pag); - error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp); + error = xfs_ialloc_read_agi(pag, sc->tp, 0, &sa->agi_bp); if (error) return error; @@ -963,9 +968,7 @@ xrep_reset_perag_resv( ASSERT(sc->tp); sc->flags &= ~XREP_RESET_PERAG_RESV; - error = xfs_ag_resv_free(sc->sa.pag); - if (error) - goto out; + xfs_ag_resv_free(sc->sa.pag); error = xfs_ag_resv_init(sc->sa.pag, sc->tp); if (error == -ENOSPC) { xfs_err(sc->mp, @@ -974,7 +977,6 @@ xrep_reset_perag_resv( error = 0; } -out: return error; } @@ -1004,55 +1006,27 @@ xrep_metadata_inode_subtype( struct xfs_scrub *sc, unsigned int scrub_type) { - __u32 smtype = sc->sm->sm_type; - __u32 smflags = sc->sm->sm_flags; - unsigned int sick_mask = sc->sick_mask; + struct xfs_scrub_subord *sub; int error; /* - * Let's see if the inode needs repair. We're going to open-code calls - * to the scrub and repair functions so that we can hang on to the + * Let's see if the inode needs repair. Use a subordinate scrub context + * to call the scrub and repair functions so that we can hang on to the * resources that we already acquired instead of using the standard * setup/teardown routines. */ - sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; - sc->sm->sm_type = scrub_type; - - switch (scrub_type) { - case XFS_SCRUB_TYPE_INODE: - error = xchk_inode(sc); - break; - case XFS_SCRUB_TYPE_BMBTD: - error = xchk_bmap_data(sc); - break; - case XFS_SCRUB_TYPE_BMBTA: - error = xchk_bmap_attr(sc); - break; - default: - ASSERT(0); - error = -EFSCORRUPTED; - } + sub = xchk_scrub_create_subord(sc, scrub_type); + error = sub->sc.ops->scrub(&sub->sc); if (error) goto out; - - if (!xrep_will_attempt(sc)) + if (!xrep_will_attempt(&sub->sc)) goto out; /* * Repair some part of the inode. This will potentially join the inode * to the transaction. */ - switch (scrub_type) { - case XFS_SCRUB_TYPE_INODE: - error = xrep_inode(sc); - break; - case XFS_SCRUB_TYPE_BMBTD: - error = xrep_bmap(sc, XFS_DATA_FORK, false); - break; - case XFS_SCRUB_TYPE_BMBTA: - error = xrep_bmap(sc, XFS_ATTR_FORK, false); - break; - } + error = sub->sc.ops->repair(&sub->sc); if (error) goto out; @@ -1061,10 +1035,10 @@ xrep_metadata_inode_subtype( * that the inode will not be joined to the transaction when we exit * the function. */ - error = xfs_defer_finish(&sc->tp); + error = xfs_defer_finish(&sub->sc.tp); if (error) goto out; - error = xfs_trans_roll(&sc->tp); + error = xfs_trans_roll(&sub->sc.tp); if (error) goto out; @@ -1072,31 +1046,18 @@ xrep_metadata_inode_subtype( * Clear the corruption flags and re-check the metadata that we just * repaired. 
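Aside: the xrep_metadata_inode_subtype() rework above replaces two open-coded switch statements with calls through the scrub type's ops vector. The table-driven dispatch pattern, reduced to a self-contained sketch (types and handlers invented for illustration):

#include <stdio.h>
#include <errno.h>

struct scrub_ops {
	int (*scrub)(void);
	int (*repair)(void);
};

static int scrub_bmap(void)  { puts("checking");  return 0; }
static int repair_bmap(void) { puts("repairing"); return 0; }
static int repair_notsupported(void) { return -EOPNOTSUPP; }

/* Table indexed by scrub subtype, like meta_scrub_ops[]. */
static const struct scrub_ops ops_table[] = {
	{ .scrub = scrub_bmap, .repair = repair_bmap },
	{ .scrub = scrub_bmap, .repair = repair_notsupported },
};

int main(void)
{
	const struct scrub_ops *ops = &ops_table[0];
	int error = ops->scrub();

	/* One indirect call per phase replaces a switch over subtypes. */
	if (!error)
		error = ops->repair();
	printf("result: %d\n", error);
	return 0;
}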
*/ - sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; - - switch (scrub_type) { - case XFS_SCRUB_TYPE_INODE: - error = xchk_inode(sc); - break; - case XFS_SCRUB_TYPE_BMBTD: - error = xchk_bmap_data(sc); - break; - case XFS_SCRUB_TYPE_BMBTA: - error = xchk_bmap_attr(sc); - break; - } + sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + error = sub->sc.ops->scrub(&sub->sc); if (error) goto out; /* If corruption persists, the repair has failed. */ - if (xchk_needs_repair(sc->sm)) { + if (xchk_needs_repair(sub->sc.sm)) { error = -EFSCORRUPTED; goto out; } out: - sc->sick_mask = sick_mask; - sc->sm->sm_type = smtype; - sc->sm->sm_flags = smflags; + xchk_scrub_free_subord(sub); return error; } @@ -1136,6 +1097,17 @@ xrep_metadata_inode_forks( return error; } + /* Clear the attr forks since metadata shouldn't have that. */ + if (xfs_inode_hasattr(sc->ip)) { + if (!dirty) { + dirty = true; + xfs_trans_ijoin(sc->tp, sc->ip, 0); + } + error = xrep_xattr_reset_fork(sc); + if (error) + return error; + } + /* * If we modified the inode, roll the transaction but don't rejoin the * inode to the new transaction because xrep_bmap_data can do that. @@ -1201,3 +1173,34 @@ xrep_trans_cancel_hook_dummy( current->journal_info = *cookiep; *cookiep = NULL; } + +/* + * See if this buffer can pass the given ->verify_struct() function. + * + * If the buffer already has ops attached and they're not the ones that were + * passed in, we reject the buffer. Otherwise, we perform the structure test + * (note that we do not check CRCs) and return the outcome of the test. The + * buffer ops and error state are left unchanged. + */ +bool +xrep_buf_verify_struct( + struct xfs_buf *bp, + const struct xfs_buf_ops *ops) +{ + const struct xfs_buf_ops *old_ops = bp->b_ops; + xfs_failaddr_t fa; + int old_error; + + if (old_ops) { + if (old_ops != ops) + return false; + } + + old_error = bp->b_error; + bp->b_ops = ops; + fa = bp->b_ops->verify_struct(bp); + bp->b_ops = old_ops; + bp->b_error = old_error; + + return fa == NULL; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index ce082d941459..0e0dc2bf985c 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -90,6 +90,12 @@ int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten); int xrep_metadata_inode_forks(struct xfs_scrub *sc); int xrep_setup_ag_rmapbt(struct xfs_scrub *sc); int xrep_setup_ag_refcountbt(struct xfs_scrub *sc); +int xrep_setup_xattr(struct xfs_scrub *sc); +int xrep_setup_directory(struct xfs_scrub *sc); +int xrep_setup_parent(struct xfs_scrub *sc); +int xrep_setup_nlinks(struct xfs_scrub *sc); +int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *resblks); +int xrep_setup_dirtree(struct xfs_scrub *sc); /* Repair setup functions */ int xrep_setup_ag_allocbt(struct xfs_scrub *sc); @@ -123,11 +129,18 @@ int xrep_bmap_attr(struct xfs_scrub *sc); int xrep_bmap_cow(struct xfs_scrub *sc); int xrep_nlinks(struct xfs_scrub *sc); int xrep_fscounters(struct xfs_scrub *sc); +int xrep_xattr(struct xfs_scrub *sc); +int xrep_directory(struct xfs_scrub *sc); +int xrep_parent(struct xfs_scrub *sc); +int xrep_symlink(struct xfs_scrub *sc); +int xrep_dirtree(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xrep_rtbitmap(struct xfs_scrub *sc); +int xrep_rtsummary(struct xfs_scrub *sc); #else # define xrep_rtbitmap xrep_notsupported +# define xrep_rtsummary xrep_notsupported #endif /* CONFIG_XFS_RT */ #ifdef CONFIG_XFS_QUOTA @@ -145,6 +158,8 @@ int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep, struct xfs_trans **tpp); void 
xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp); +bool xrep_buf_verify_struct(struct xfs_buf *bp, const struct xfs_buf_ops *ops); + #else #define xrep_ino_dqattach(sc) (0) @@ -188,9 +203,19 @@ xrep_setup_nothing( #define xrep_setup_ag_allocbt xrep_setup_nothing #define xrep_setup_ag_rmapbt xrep_setup_nothing #define xrep_setup_ag_refcountbt xrep_setup_nothing +#define xrep_setup_xattr xrep_setup_nothing +#define xrep_setup_directory xrep_setup_nothing +#define xrep_setup_parent xrep_setup_nothing +#define xrep_setup_nlinks xrep_setup_nothing +#define xrep_setup_dirtree xrep_setup_nothing #define xrep_setup_inode(sc, imap) ((void)0) +static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x) +{ + return 0; +} + #define xrep_revalidate_allocbt (NULL) #define xrep_revalidate_iallocbt (NULL) @@ -212,6 +237,12 @@ xrep_setup_nothing( #define xrep_quotacheck xrep_notsupported #define xrep_nlinks xrep_notsupported #define xrep_fscounters xrep_notsupported +#define xrep_rtsummary xrep_notsupported +#define xrep_xattr xrep_notsupported +#define xrep_directory xrep_notsupported +#define xrep_parent xrep_notsupported +#define xrep_symlink xrep_notsupported +#define xrep_dirtree xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c index e8e07b683eab..e8080eba37d2 100644 --- a/fs/xfs/scrub/rmap_repair.c +++ b/fs/xfs/scrub/rmap_repair.c @@ -432,14 +432,6 @@ out: return error; } -static inline bool -is_rt_data_fork( - struct xfs_inode *ip, - int whichfork) -{ - return XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK; -} - /* * Iterate the block mapping btree to collect rmap records for anything in this * fork that matches the AG. Sets @mappings_done to true if we've scanned the @@ -578,23 +570,9 @@ xrep_rmap_scan_inode( struct xrep_rmap *rr, struct xfs_inode *ip) { - unsigned int lock_mode = 0; + unsigned int lock_mode = xrep_rmap_scan_ilock(ip); int error; - /* - * Directory updates (create/link/unlink/rename) drop the directory's - * ILOCK before finishing any rmapbt updates associated with directory - * shape changes. For this scan to coordinate correctly with the live - * update hook, we must take the only lock (i_rwsem) that is held all - * the way to dir op completion. This will get fixed by the parent - * pointer patchset. - */ - if (S_ISDIR(VFS_I(ip)->i_mode)) { - lock_mode = XFS_IOLOCK_SHARED; - xfs_ilock(ip, lock_mode); - } - lock_mode |= xrep_rmap_scan_ilock(ip); - /* Check the data fork. */ error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK); if (error) diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c index 46f5d5f605c9..0fef98e9f834 100644 --- a/fs/xfs/scrub/rtbitmap_repair.c +++ b/fs/xfs/scrub/rtbitmap_repair.c @@ -108,8 +108,6 @@ xrep_rtbitmap_data_mappings( 0, &map, &nmaps); if (error) return error; - if (nmaps != 1) - return -EFSCORRUPTED; /* Commit new extent and all deferred work. 
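Aside: when CONFIG_XFS_ONLINE_REPAIR is disabled, every new repair entry point above collapses to xrep_notsupported via the preprocessor, so callers dispatch unconditionally and need no runtime feature checks. The same gating pattern in miniature (the feature flag and names here are illustrative):

#include <stdio.h>
#include <errno.h>

#define HAVE_ONLINE_REPAIR 0	/* flip to 1 to build the real paths */

#if HAVE_ONLINE_REPAIR
static int repair_symlink(void) { puts("repairing"); return 0; }
#else
static int repair_notsupported(void) { return -EOPNOTSUPP; }
# define repair_symlink repair_notsupported
#endif

int main(void)
{
	/* Callers stay unchanged; the stub supplies the errno. */
	printf("repair returned %d\n", repair_symlink());
	return 0;
}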
*/ error = xrep_defer_finish(sc); diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index 5055092bd9e8..3fee603f5244 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -17,10 +17,14 @@ #include "xfs_bit.h" #include "xfs_bmap.h" #include "xfs_sb.h" +#include "xfs_exchmaps.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/xfile.h" +#include "scrub/repair.h" +#include "scrub/tempexch.h" +#include "scrub/rtsummary.h" /* * Realtime Summary @@ -32,18 +36,6 @@ * (potentially large) amount of data in pageable memory. */ -struct xchk_rtsummary { - struct xfs_rtalloc_args args; - - uint64_t rextents; - uint64_t rbmblocks; - uint64_t rsumsize; - unsigned int rsumlevels; - - /* Memory buffer for the summary comparison. */ - union xfs_suminfo_raw words[]; -}; - /* Set us up to check the rtsummary file. */ int xchk_setup_rtsummary( @@ -60,6 +52,12 @@ xchk_setup_rtsummary( return -ENOMEM; sc->buf = rts; + if (xchk_could_repair(sc)) { + error = xrep_setup_rtsummary(sc, rts); + if (error) + return error; + } + /* * Create an xfile to construct a new rtsummary file. The xfile allows * us to avoid pinning kernel memory for this purpose. @@ -70,7 +68,7 @@ xchk_setup_rtsummary( if (error) return error; - error = xchk_trans_alloc(sc, 0); + error = xchk_trans_alloc(sc, rts->resblks); if (error) return error; @@ -135,7 +133,7 @@ xfsum_store( sumoff << XFS_WORDLOG); } -static inline int +inline int xfsum_copyout( struct xfs_scrub *sc, xfs_rtsumoff_t sumoff, @@ -362,7 +360,12 @@ xchk_rtsummary( error = xchk_rtsum_compare(sc); out_rbm: - /* Unlock the rtbitmap since we're done with it. */ + /* + * Unlock the rtbitmap since we're done with it. All other writers of + * the rt free space metadata grab the bitmap and summary ILOCKs in + * that order, so we're still protected against allocation activities + * even if we continue on to the repair function. + */ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); return error; } diff --git a/fs/xfs/scrub/rtsummary.h b/fs/xfs/scrub/rtsummary.h new file mode 100644 index 000000000000..e1d50304d8d4 --- /dev/null +++ b/fs/xfs/scrub/rtsummary.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RTSUMMARY_H__ +#define __XFS_SCRUB_RTSUMMARY_H__ + +struct xchk_rtsummary { +#ifdef CONFIG_XFS_ONLINE_REPAIR + struct xrep_tempexch tempexch; +#endif + struct xfs_rtalloc_args args; + + uint64_t rextents; + uint64_t rbmblocks; + uint64_t rsumsize; + unsigned int rsumlevels; + unsigned int resblks; + + /* suminfo position of xfile as we write buffers to disk. */ + xfs_rtsumoff_t prep_wordoff; + + /* Memory buffer for the summary comparison. */ + union xfs_suminfo_raw words[]; +}; + +int xfsum_copyout(struct xfs_scrub *sc, xfs_rtsumoff_t sumoff, + union xfs_suminfo_raw *rawinfo, unsigned int nr_words); + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_setup_rtsummary(struct xfs_scrub *sc, struct xchk_rtsummary *rts); +#else +# define xrep_setup_rtsummary(sc, rts) (0) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_RTSUMMARY_H__ */ diff --git a/fs/xfs/scrub/rtsummary_repair.c b/fs/xfs/scrub/rtsummary_repair.c new file mode 100644 index 000000000000..d9e971c4c79f --- /dev/null +++ b/fs/xfs/scrub/rtsummary_repair.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. 
+ * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_rtalloc.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_exchmaps.h" +#include "xfs_rtbitmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/reap.h" +#include "scrub/xfile.h" +#include "scrub/rtsummary.h" + +/* Set us up to repair the rtsummary file. */ +int +xrep_setup_rtsummary( + struct xfs_scrub *sc, + struct xchk_rtsummary *rts) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long blocks; + int error; + + error = xrep_tempfile_create(sc, S_IFREG); + if (error) + return error; + + /* + * If we're doing a repair, we reserve enough blocks to write out a + * completely new summary file, plus twice as many blocks as we would + * need if we can only allocate one block per data fork mapping. This + * should cover the preallocation of the temporary file and exchanging + * the extent mappings. + * + * We cannot use xfs_exchmaps_estimate because we have not yet + * constructed the replacement rtsummary and therefore do not know how + * many extents it will use. By the time we do, we will have a dirty + * transaction (which we cannot drop because we cannot drop the + * rtsummary ILOCK) and cannot ask for more reservation. + */ + blocks = XFS_B_TO_FSB(mp, mp->m_rsumsize); + blocks += xfs_bmbt_calc_size(mp, blocks) * 2; + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + + rts->resblks += blocks; + return 0; +} + +static int +xrep_rtsummary_prep_buf( + struct xfs_scrub *sc, + struct xfs_buf *bp, + void *data) +{ + struct xchk_rtsummary *rts = data; + struct xfs_mount *mp = sc->mp; + union xfs_suminfo_raw *ondisk; + int error; + + rts->args.mp = sc->mp; + rts->args.tp = sc->tp; + rts->args.sumbp = bp; + ondisk = xfs_rsumblock_infoptr(&rts->args, 0); + rts->args.sumbp = NULL; + + bp->b_ops = &xfs_rtbuf_ops; + + error = xfsum_copyout(sc, rts->prep_wordoff, ondisk, mp->m_blockwsize); + if (error) + return error; + + rts->prep_wordoff += mp->m_blockwsize; + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTSUMMARY_BUF); + return 0; +} + +/* Repair the realtime summary. */ +int +xrep_rtsummary( + struct xfs_scrub *sc) +{ + struct xchk_rtsummary *rts = sc->buf; + struct xfs_mount *mp = sc->mp; + xfs_filblks_t rsumblocks; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + /* We require atomic file exchange range to rebuild anything. */ + if (!xfs_has_exchange_range(mp)) + return -EOPNOTSUPP; + + /* Walk away if we disagree on the size of the rt bitmap. */ + if (rts->rbmblocks != mp->m_sb.sb_rbmblocks) + return 0; + + /* Make sure any problems with the fork are fixed. */ + error = xrep_metadata_inode_forks(sc); + if (error) + return error; + + /* + * Try to take ILOCK_EXCL of the temporary file. We had better be the + * only ones holding onto this inode, but we can't block while holding + * the rtsummary file's ILOCK_EXCL. + */ + while (!xrep_tempfile_ilock_nowait(sc)) { + if (xchk_should_terminate(sc, &error)) + return error; + delay(1); + } + + /* Make sure we have space allocated for the entire summary file. 
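Aside: the reservation math in xrep_setup_rtsummary() has to run before the replacement file exists, so it assumes the worst case: the whole file plus twice the mapping-btree overhead as if every mapping took one block. A rough sketch of that shape; bmbt_worst_case_blocks() is a made-up stand-in for xfs_bmbt_calc_size(), with an assumed leaf fanout:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-in for xfs_bmbt_calc_size(): worst-case btree
 * blocks if a fork carries @len single-block mappings, assuming a
 * leaf fanout of 125 records per block. */
static uint64_t bmbt_worst_case_blocks(uint64_t len)
{
	const uint64_t recs_per_block = 125;
	uint64_t total = 0;

	do {
		len = (len + recs_per_block - 1) / recs_per_block;
		total += len;
	} while (len > 1);
	return total;
}

int main(void)
{
	uint64_t data_blocks = 2048;	/* size of the rebuilt file */

	/* New contents plus 2x btree overhead: once to preallocate the
	 * tempfile, once to exchange the mappings. */
	uint64_t resblks = data_blocks +
			bmbt_worst_case_blocks(data_blocks) * 2;

	printf("reserve %llu blocks\n", (unsigned long long)resblks);
	return 0;
}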
*/ + rsumblocks = XFS_B_TO_FSB(mp, rts->rsumsize); + xfs_trans_ijoin(sc->tp, sc->ip, 0); + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + error = xrep_tempfile_prealloc(sc, 0, rsumblocks); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* Copy the rtsummary file that we generated. */ + error = xrep_tempfile_copyin(sc, 0, rsumblocks, + xrep_rtsummary_prep_buf, rts); + if (error) + return error; + error = xrep_tempfile_set_isize(sc, rts->rsumsize); + if (error) + return error; + + /* + * Now exchange the contents. Nothing in repair uses the temporary + * buffer, so we can reuse it for the tempfile exchrange information. + */ + error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, &rts->tempexch); + if (error) + return error; + + error = xrep_tempexch_contents(sc, &rts->tempexch); + if (error) + return error; + + /* Reset incore state and blow out the summary cache. */ + if (mp->m_rsum_cache) + memset(mp->m_rsum_cache, 0xFF, mp->m_sb.sb_rbmblocks); + + mp->m_rsumlevels = rts->rsumlevels; + mp->m_rsumsize = rts->rsumsize; + + /* Free the old rtsummary blocks if they're not in use. */ + return xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 20fac9723c08..c013f0ba4f36 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -17,6 +17,11 @@ #include "xfs_scrub.h" #include "xfs_buf_mem.h" #include "xfs_rmap.h" +#include "xfs_exchrange.h" +#include "xfs_exchmaps.h" +#include "xfs_dir2.h" +#include "xfs_parent.h" +#include "xfs_icache.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -24,6 +29,8 @@ #include "scrub/health.h" #include "scrub/stats.h" #include "scrub/xfile.h" +#include "scrub/tempfile.h" +#include "scrub/orphanage.h" /* * Online Scrub and Repair @@ -171,6 +178,39 @@ xchk_fsgates_disable( sc->flags &= ~XCHK_FSGATES_ALL; } +/* Free the resources associated with a scrub subtype. */ +void +xchk_scrub_free_subord( + struct xfs_scrub_subord *sub) +{ + struct xfs_scrub *sc = sub->parent_sc; + + ASSERT(sc->ip == sub->sc.ip); + ASSERT(sc->orphanage == sub->sc.orphanage); + ASSERT(sc->tempip == sub->sc.tempip); + + sc->sm->sm_type = sub->old_smtype; + sc->sm->sm_flags = sub->old_smflags | + (sc->sm->sm_flags & XFS_SCRUB_FLAGS_OUT); + sc->tp = sub->sc.tp; + + if (sub->sc.buf) { + if (sub->sc.buf_cleanup) + sub->sc.buf_cleanup(sub->sc.buf); + kvfree(sub->sc.buf); + } + if (sub->sc.xmbtp) + xmbuf_free(sub->sc.xmbtp); + if (sub->sc.xfile) + xfile_destroy(sub->sc.xfile); + + sc->ilock_flags = sub->sc.ilock_flags; + sc->orphanage_ilock_flags = sub->sc.orphanage_ilock_flags; + sc->temp_ilock_flags = sub->sc.temp_ilock_flags; + + kfree(sub); +} + /* Free all the resources and finish the transactions. 
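Aside: xchk_scrub_free_subord() above restores the parent context's command fields on teardown while keeping any output flags the child set. A simplified model of the clone/restore lifecycle (the real code shares the sm buffer by pointer and preserves XFS_SCRUB_FLAGS_OUT; this sketch only shows the save/restore skeleton):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct scrub_ctx {
	unsigned int sm_type;
	unsigned int sm_flags;
};

struct subord {
	struct scrub_ctx sc;		/* the working clone */
	struct scrub_ctx *parent;
	unsigned int old_type;
	unsigned int old_flags;
};

static struct subord *subord_create(struct scrub_ctx *sc, unsigned int type)
{
	struct subord *sub = calloc(1, sizeof(*sub));

	if (!sub)
		return NULL;
	sub->old_type = sc->sm_type;	/* save the parent's command */
	sub->old_flags = sc->sm_flags;
	sub->parent = sc;
	memcpy(&sub->sc, sc, sizeof(*sc));
	sub->sc.sm_type = type;		/* child runs as another subtype */
	return sub;
}

static void subord_free(struct subord *sub)
{
	/* Restore the parent's command fields on teardown. */
	sub->parent->sm_type = sub->old_type;
	sub->parent->sm_flags = sub->old_flags;
	free(sub);
}

int main(void)
{
	struct scrub_ctx sc = { .sm_type = 7 };
	struct subord *sub = subord_create(&sc, 3);

	if (!sub)
		return 1;
	printf("child runs as type %u\n", sub->sc.sm_type);
	subord_free(sub);
	printf("parent restored to type %u\n", sc.sm_type);
	return 0;
}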
*/ STATIC int xchk_teardown( @@ -211,6 +251,8 @@ xchk_teardown( sc->buf = NULL; } + xrep_tempfile_rele(sc); + xrep_orphanage_rele(sc); xchk_fsgates_disable(sc); return error; } @@ -319,25 +361,25 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_INODE, .setup = xchk_setup_directory, .scrub = xchk_directory, - .repair = xrep_notsupported, + .repair = xrep_directory, }, [XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */ .type = ST_INODE, .setup = xchk_setup_xattr, .scrub = xchk_xattr, - .repair = xrep_notsupported, + .repair = xrep_xattr, }, [XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */ .type = ST_INODE, .setup = xchk_setup_symlink, .scrub = xchk_symlink, - .repair = xrep_notsupported, + .repair = xrep_symlink, }, [XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */ .type = ST_INODE, .setup = xchk_setup_parent, .scrub = xchk_parent, - .repair = xrep_notsupported, + .repair = xrep_parent, }, [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ .type = ST_FS, @@ -349,7 +391,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_FS, .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, - .repair = xrep_notsupported, + .repair = xrep_rtsummary, }, [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ .type = ST_FS, @@ -393,6 +435,13 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .scrub = xchk_health_record, .repair = xrep_notsupported, }, + [XFS_SCRUB_TYPE_DIRTREE] = { /* directory tree structure */ + .type = ST_INODE, + .setup = xchk_setup_dirtree, + .scrub = xchk_dirtree, + .has = xfs_has_parent, + .repair = xrep_dirtree, + }, }; static int @@ -497,8 +546,38 @@ static inline void xchk_postmortem(struct xfs_scrub *sc) } #endif /* CONFIG_XFS_ONLINE_REPAIR */ +/* + * Create a new scrub context from an existing one, but with a different scrub + * type. + */ +struct xfs_scrub_subord * +xchk_scrub_create_subord( + struct xfs_scrub *sc, + unsigned int subtype) +{ + struct xfs_scrub_subord *sub; + + sub = kzalloc(sizeof(*sub), XCHK_GFP_FLAGS); + if (!sub) + return ERR_PTR(-ENOMEM); + + sub->old_smtype = sc->sm->sm_type; + sub->old_smflags = sc->sm->sm_flags; + sub->parent_sc = sc; + memcpy(&sub->sc, sc, sizeof(struct xfs_scrub)); + sub->sc.ops = &meta_scrub_ops[subtype]; + sub->sc.sm->sm_type = subtype; + sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + sub->sc.buf = NULL; + sub->sc.buf_cleanup = NULL; + sub->sc.xfile = NULL; + sub->sc.xmbtp = NULL; + + return sub; +} + /* Dispatch metadata scrubbing. */ -int +STATIC int xfs_scrub_metadata( struct file *file, struct xfs_scrub_metadata *sm) @@ -540,6 +619,7 @@ xfs_scrub_metadata( sc->sm = sm; sc->ops = &meta_scrub_ops[sm->sm_type]; sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type); + sc->relax = INIT_XCHK_RELAX; retry_op: /* * When repairs are allowed, prevent freezing or readonly remount while @@ -643,3 +723,221 @@ try_harder: run.retries++; goto retry_op; } + +/* Scrub one aspect of one piece of metadata. */ +int +xfs_ioc_scrub_metadata( + struct file *file, + void __user *arg) +{ + struct xfs_scrub_metadata scrub; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&scrub, arg, sizeof(scrub))) + return -EFAULT; + + error = xfs_scrub_metadata(file, &scrub); + if (error) + return error; + + if (copy_to_user(arg, &scrub, sizeof(scrub))) + return -EFAULT; + + return 0; +} + +/* Decide if there have been any scrub failures up to this point. 
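Aside: xfs_ioc_scrub_metadata() above is the classic ioctl shape: copy the argument struct in, operate on the kernel copy, copy results back, with any copy failure mapped to -EFAULT. The same flow as a self-contained model, with memcpy() standing in for copy_from_user()/copy_to_user():

#include <stdio.h>
#include <string.h>

struct scrub_req {
	unsigned int type;	/* in: what to check */
	unsigned int oflags;	/* out: what was found */
};

static int do_scrub(struct scrub_req *req)
{
	req->oflags = 0x1;	/* pretend we found corruption */
	return 0;
}

/* memcpy() stands in for copy_from_user()/copy_to_user(). */
static int scrub_ioctl(void *uarg)
{
	struct scrub_req req;
	int error;

	memcpy(&req, uarg, sizeof(req));	/* copy in */
	error = do_scrub(&req);
	if (error)
		return error;
	memcpy(uarg, &req, sizeof(req));	/* copy results out */
	return 0;
}

int main(void)
{
	struct scrub_req req = { .type = 2 };

	if (!scrub_ioctl(&req))
		printf("out flags: %#x\n", req.oflags);
	return 0;
}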
*/ +static inline int +xfs_scrubv_check_barrier( + struct xfs_mount *mp, + const struct xfs_scrub_vec *vectors, + const struct xfs_scrub_vec *stop_vec) +{ + const struct xfs_scrub_vec *v; + __u32 failmask; + + failmask = stop_vec->sv_flags & XFS_SCRUB_FLAGS_OUT; + + for (v = vectors; v < stop_vec; v++) { + if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) + continue; + + /* + * Runtime errors count as a previous failure, except the ones + * used to ask userspace to retry. + */ + switch (v->sv_ret) { + case -EBUSY: + case -ENOENT: + case -EUSERS: + case 0: + break; + default: + return -ECANCELED; + } + + /* + * If any of the out-flags on the scrub vector match the mask + * that was set on the barrier vector, that's a previous fail. + */ + if (v->sv_flags & failmask) + return -ECANCELED; + } + + return 0; +} + +/* + * If the caller provided us with a nonzero inode number that isn't the ioctl + * file, try to grab a reference to it to eliminate all further untrusted inode + * lookups. If we can't get the inode, let each scrub function try again. + */ +STATIC struct xfs_inode * +xchk_scrubv_open_by_handle( + struct xfs_mount *mp, + const struct xfs_scrub_vec_head *head) +{ + struct xfs_trans *tp; + struct xfs_inode *ip; + int error; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return NULL; + + error = xfs_iget(mp, tp, head->svh_ino, XCHK_IGET_FLAGS, 0, &ip); + xfs_trans_cancel(tp); + if (error) + return NULL; + + if (VFS_I(ip)->i_generation != head->svh_gen) { + xfs_irele(ip); + return NULL; + } + + return ip; +} + +/* Vectored scrub implementation to reduce ioctl calls. */ +int +xfs_ioc_scrubv_metadata( + struct file *file, + void __user *arg) +{ + struct xfs_scrub_vec_head head; + struct xfs_scrub_vec_head __user *uhead = arg; + struct xfs_scrub_vec *vectors; + struct xfs_scrub_vec __user *uvectors; + struct xfs_inode *ip_in = XFS_I(file_inode(file)); + struct xfs_mount *mp = ip_in->i_mount; + struct xfs_inode *handle_ip = NULL; + struct xfs_scrub_vec *v; + size_t vec_bytes; + unsigned int i; + int error = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&head, uhead, sizeof(head))) + return -EFAULT; + + if (head.svh_reserved) + return -EINVAL; + if (head.svh_flags & ~XFS_SCRUB_VEC_FLAGS_ALL) + return -EINVAL; + if (head.svh_nr == 0) + return 0; + + vec_bytes = array_size(head.svh_nr, sizeof(struct xfs_scrub_vec)); + if (vec_bytes > PAGE_SIZE) + return -ENOMEM; + + uvectors = (void __user *)(uintptr_t)head.svh_vectors; + vectors = memdup_user(uvectors, vec_bytes); + if (IS_ERR(vectors)) + return PTR_ERR(vectors); + + trace_xchk_scrubv_start(ip_in, &head); + + for (i = 0, v = vectors; i < head.svh_nr; i++, v++) { + if (v->sv_reserved) { + error = -EINVAL; + goto out_free; + } + + if (v->sv_type == XFS_SCRUB_TYPE_BARRIER && + (v->sv_flags & ~XFS_SCRUB_FLAGS_OUT)) { + error = -EINVAL; + goto out_free; + } + + trace_xchk_scrubv_item(mp, &head, i, v); + } + + /* + * If the caller wants us to do a scrub-by-handle and the file used to + * call the ioctl is not the same file, load the incore inode and pin + * it across all the scrubv actions to avoid repeated UNTRUSTED + * lookups. The reference is not passed to deeper layers of scrub + * because each scrubber gets to decide its own strategy and return + * values for getting an inode. + */ + if (head.svh_ino && head.svh_ino != ip_in->i_ino) + handle_ip = xchk_scrubv_open_by_handle(mp, &head); + + /* Run all the scrubbers. 
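Aside: xfs_scrubv_check_barrier() treats a barrier as failed if any earlier vector hit a runtime error or set an out-flag matching the barrier's mask. A minimal model of that rule (the real code also lets the retry errnos -EBUSY, -ENOENT, and -EUSERS pass; this sketch treats any nonzero return as a failure):

#include <stdio.h>

#define TYPE_BARRIER 0xffffu

struct vec {
	unsigned int type;
	unsigned int flags;	/* out-flags; for barriers, the fail mask */
	int ret;
};

/* Fail the barrier if any earlier vector errored out or set an
 * out-flag matching the barrier's mask. */
static int check_barrier(const struct vec *v, const struct vec *stop)
{
	unsigned int failmask = stop->flags;

	for (; v < stop; v++) {
		if (v->type == TYPE_BARRIER)
			continue;
		if (v->ret != 0)
			return -1;
		if (v->flags & failmask)
			return -1;
	}
	return 0;
}

int main(void)
{
	struct vec vecs[] = {
		{ .type = 1, .flags = 0x1 },		/* found corruption */
		{ .type = TYPE_BARRIER, .flags = 0x1 },	/* stop if so */
		{ .type = 2 },
	};

	if (check_barrier(&vecs[0], &vecs[1]))
		puts("barrier tripped; skipping later vectors");
	return 0;
}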
*/ + for (i = 0, v = vectors; i < head.svh_nr; i++, v++) { + struct xfs_scrub_metadata sm = { + .sm_type = v->sv_type, + .sm_flags = v->sv_flags, + .sm_ino = head.svh_ino, + .sm_gen = head.svh_gen, + .sm_agno = head.svh_agno, + }; + + if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) { + v->sv_ret = xfs_scrubv_check_barrier(mp, vectors, v); + if (v->sv_ret) { + trace_xchk_scrubv_barrier_fail(mp, &head, i, v); + break; + } + + continue; + } + + v->sv_ret = xfs_scrub_metadata(file, &sm); + v->sv_flags = sm.sm_flags; + + trace_xchk_scrubv_outcome(mp, &head, i, v); + + if (head.svh_rest_us) { + ktime_t expires; + + expires = ktime_add_ns(ktime_get(), + head.svh_rest_us * 1000); + set_current_state(TASK_KILLABLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } + + if (fatal_signal_pending(current)) { + error = -EINTR; + goto out_free; + } + } + + if (copy_to_user(uvectors, vectors, vec_bytes) || + copy_to_user(uhead, &head, sizeof(head))) { + error = -EFAULT; + goto out_free; + } + +out_free: + if (handle_ip) + xfs_irele(handle_ip); + kfree(vectors); + return error; +} diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 9ad65b604fe1..1bc33f010d0e 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -8,6 +8,49 @@ struct xfs_scrub; +struct xchk_relax { + unsigned long next_resched; + unsigned int resched_nr; + bool interruptible; +}; + +/* Yield to the scheduler at most 10x per second. */ +#define XCHK_RELAX_NEXT (jiffies + (HZ / 10)) + +#define INIT_XCHK_RELAX \ + (struct xchk_relax){ \ + .next_resched = XCHK_RELAX_NEXT, \ + .resched_nr = 0, \ + .interruptible = true, \ + } + +/* + * Relax during a scrub operation and exit if there's a fatal signal pending. + * + * If preemption is disabled, we need to yield to the scheduler every now and + * then so that we don't run afoul of the soft lockup watchdog or RCU stall + * detector. cond_resched calls are somewhat expensive (~5ns) so we want to + * ratelimit this to 10x per second. Amortize the cost of the other checks by + * only doing it once every 100 calls. + */ +static inline int xchk_maybe_relax(struct xchk_relax *widget) +{ + /* Amortize the cost of scheduling and checking signals. */ + if (likely(++widget->resched_nr < 100)) + return 0; + widget->resched_nr = 0; + + if (unlikely(widget->next_resched <= jiffies)) { + cond_resched(); + widget->next_resched = XCHK_RELAX_NEXT; + } + + if (widget->interruptible && fatal_signal_pending(current)) + return -EINTR; + + return 0; +} + /* * Standard flags for allocating memory within scrub. NOFS context is * configured by the process allocation scope. Scrub and repair must be able @@ -17,6 +60,13 @@ struct xfs_scrub; #define XCHK_GFP_FLAGS ((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \ __GFP_RETRY_MAYFAIL)) +/* + * For opening files by handle for fsck operations, we don't trust the inumber + * or the allocation state; therefore, perform an untrusted lookup. We don't + * want these inodes to pollute the cache, so mark them for immediate removal. + */ +#define XCHK_IGET_FLAGS (XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE) + /* Type info and names for the scrub types. */ enum xchk_type { ST_NONE = 1, /* disabled */ @@ -105,6 +155,14 @@ struct xfs_scrub { /* Lock flags for @ip. */ uint ilock_flags; + /* The orphanage, for stashing files that have lost their parent. */ + uint orphanage_ilock_flags; + struct xfs_inode *orphanage; + + /* A temporary file on this filesystem, for staging new metadata. 
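Aside: the xchk_relax machinery above amortizes the cost of rescheduling: a cheap counter gates the expensive checks, and the yield itself is ratelimited to ten per second. A userspace approximation using CLOCK_MONOTONIC and sched_yield() in place of jiffies and cond_resched():

#include <stdio.h>
#include <time.h>
#include <sched.h>

static double now_sec(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec + ts.tv_nsec / 1e9;
}

struct relax {
	double next;		/* soonest time we may yield again */
	unsigned int nr;	/* calls since the last clock check */
};

static void maybe_relax(struct relax *r)
{
	/* Amortize the clock read over 100 calls. */
	if (++r->nr < 100)
		return;
	r->nr = 0;

	/* Yield at most ten times per second. */
	if (now_sec() >= r->next) {
		sched_yield();	/* stands in for cond_resched() */
		r->next = now_sec() + 0.1;
	}
}

int main(void)
{
	struct relax r = { .next = now_sec() + 0.1 };
	long i;

	for (i = 0; i < 5000000; i++)
		maybe_relax(&r);
	puts("done");
	return 0;
}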
*/ + struct xfs_inode *tempip; + uint temp_ilock_flags; + /* See the XCHK/XREP state flags below. */ unsigned int flags; @@ -115,6 +173,9 @@ struct xfs_scrub { */ unsigned int sick_mask; + /* next time we want to cond_resched() */ + struct xchk_relax relax; + /* State tracking for single-AG operations. */ struct xchk_ag sa; }; @@ -141,6 +202,35 @@ struct xfs_scrub { XCHK_FSGATES_DIRENTS | \ XCHK_FSGATES_RMAP) +struct xfs_scrub_subord { + struct xfs_scrub sc; + struct xfs_scrub *parent_sc; + unsigned int old_smtype; + unsigned int old_smflags; +}; + +struct xfs_scrub_subord *xchk_scrub_create_subord(struct xfs_scrub *sc, + unsigned int subtype); +void xchk_scrub_free_subord(struct xfs_scrub_subord *sub); + +/* + * We /could/ terminate a scrub/repair operation early. If we're not + * in a good place to continue (fatal signal, etc.) then bail out. + * Note that we're careful not to make any judgements about *error. + */ +static inline bool +xchk_should_terminate( + struct xfs_scrub *sc, + int *error) +{ + if (xchk_maybe_relax(&sc->relax)) { + if (*error == 0) + *error = -EINTR; + return true; + } + return false; +} + /* Metadata scrubbers */ int xchk_tester(struct xfs_scrub *sc); int xchk_superblock(struct xfs_scrub *sc); @@ -159,6 +249,7 @@ int xchk_directory(struct xfs_scrub *sc); int xchk_xattr(struct xfs_scrub *sc); int xchk_symlink(struct xfs_scrub *sc); int xchk_parent(struct xfs_scrub *sc); +int xchk_dirtree(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xchk_rtbitmap(struct xfs_scrub *sc); int xchk_rtsummary(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c index 42cafbed94ac..7996c2335476 100644 --- a/fs/xfs/scrub/stats.c +++ b/fs/xfs/scrub/stats.c @@ -79,6 +79,7 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters", [XFS_SCRUB_TYPE_QUOTACHECK] = "quotacheck", [XFS_SCRUB_TYPE_NLINKS] = "nlinks", + [XFS_SCRUB_TYPE_DIRTREE] = "dirtree", }; /* Format the scrub stats into a text buffer, similar to pcp style. */ diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index d77d8a9598f6..c848bcc07cd5 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -10,6 +10,7 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_inode.h" #include "xfs_symlink.h" #include "xfs_health.h" @@ -17,18 +18,28 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/health.h" +#include "scrub/repair.h" /* Set us up to scrub a symbolic link. */ int xchk_setup_symlink( struct xfs_scrub *sc) { + unsigned int resblks = 0; + int error; + /* Allocate the buffer without the inode lock held. */ sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; - return xchk_setup_inode_contents(sc, 0); + if (xchk_could_repair(sc)) { + error = xrep_setup_symlink(sc, &resblks); + if (error) + return error; + } + + return xchk_setup_inode_contents(sc, resblks); } /* Symbolic links. */ diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c new file mode 100644 index 000000000000..d015a86ef460 --- /dev/null +++ b/fs/xfs/scrub/symlink_repair.c @@ -0,0 +1,509 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_symlink.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_symlink_remote.h" +#include "xfs_exchmaps.h" +#include "xfs_exchrange.h" +#include "xfs_health.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/reap.h" + +/* + * Symbolic Link Repair + * ==================== + * + * We repair symbolic links by reading whatever target data we can find, up to + * the first NULL byte. If the recovered target strlen matches i_size, then + * we rewrite the target. In all other cases, we replace the target with an + * overly long string that cannot possibly resolve. The new target is written + * into a private hidden temporary file, and then a file contents exchange + * commits the new symlink target to the file being repaired. + */ + +/* Set us up to repair the symlink file. */ +int +xrep_setup_symlink( + struct xfs_scrub *sc, + unsigned int *resblks) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long blocks; + int error; + + error = xrep_tempfile_create(sc, S_IFLNK); + if (error) + return error; + + /* + * If we're doing a repair, we reserve enough blocks to write out a + * completely new symlink file, plus twice as many blocks as we would + * need if we can only allocate one block per data fork mapping. This + * should cover the preallocation of the temporary file and exchanging + * the extent mappings. + * + * We cannot use xfs_exchmaps_estimate because we have not yet + * constructed the replacement symlink and therefore do not know how + * many extents it will use. By the time we do, we will have a dirty + * transaction (which we cannot drop because we cannot drop the + * symlink ILOCK) and cannot ask for more reservation. + */ + blocks = xfs_symlink_blocks(sc->mp, XFS_SYMLINK_MAXLEN); + blocks += xfs_bmbt_calc_size(mp, blocks) * 2; + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + + *resblks += blocks; + return 0; +} + +/* + * Try to salvage the pathname from remote blocks. Returns the number of bytes + * salvaged or a negative errno. + */ +STATIC ssize_t +xrep_symlink_salvage_remote( + struct xfs_scrub *sc) +{ + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_inode *ip = sc->ip; + struct xfs_buf *bp; + char *target_buf = sc->buf; + xfs_failaddr_t fa; + xfs_filblks_t fsblocks; + xfs_daddr_t d; + loff_t len; + loff_t offset = 0; + unsigned int byte_cnt; + bool magic_ok; + bool hdr_ok; + int n; + int nmaps = XFS_SYMLINK_MAPS; + int error; + + /* We'll only read until the buffer is full. */ + len = min_t(loff_t, ip->i_disk_size, XFS_SYMLINK_MAXLEN); + fsblocks = xfs_symlink_blocks(sc->mp, len); + error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0); + if (error) + return error; + + for (n = 0; n < nmaps; n++) { + struct xfs_dsymlink_hdr *dsl; + + d = XFS_FSB_TO_DADDR(sc->mp, mval[n].br_startblock); + + /* Read the rmt block. We'll run the verifiers manually. 
*/ + error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, + d, XFS_FSB_TO_BB(sc->mp, mval[n].br_blockcount), + 0, &bp, NULL); + if (error) + return error; + bp->b_ops = &xfs_symlink_buf_ops; + + /* How many bytes do we expect to get out of this buffer? */ + byte_cnt = XFS_FSB_TO_B(sc->mp, mval[n].br_blockcount); + byte_cnt = XFS_SYMLINK_BUF_SPACE(sc->mp, byte_cnt); + byte_cnt = min_t(unsigned int, byte_cnt, len); + + /* + * See if the verifiers accept this block. We're willing to + * salvage if the offset/byte/ino are ok and either the + * verifier passed or the magic is ok. Anything else and we + * stop dead in our tracks. + */ + fa = bp->b_ops->verify_struct(bp); + dsl = bp->b_addr; + magic_ok = dsl->sl_magic == cpu_to_be32(XFS_SYMLINK_MAGIC); + hdr_ok = xfs_symlink_hdr_ok(ip->i_ino, offset, byte_cnt, bp); + if (!hdr_ok || (fa != NULL && !magic_ok)) + break; + + memcpy(target_buf + offset, dsl + 1, byte_cnt); + + len -= byte_cnt; + offset += byte_cnt; + } + return offset; +} + +/* + * Try to salvage an inline symlink's contents. Returns the number of bytes + * salvaged or a negative errno. + */ +STATIC ssize_t +xrep_symlink_salvage_inline( + struct xfs_scrub *sc) +{ + struct xfs_inode *ip = sc->ip; + char *target_buf = sc->buf; + char *old_target; + struct xfs_ifork *ifp; + unsigned int nr; + + ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + if (!ifp->if_data) + return 0; + + /* + * If inode repair zapped the link target, pretend that we didn't find + * any bytes at all so that we can replace the (now totally lost) link + * target with a warning message. + */ + old_target = ifp->if_data; + if (xfs_inode_has_sickness(sc->ip, XFS_SICK_INO_SYMLINK_ZAPPED) && + sc->ip->i_disk_size == 1 && old_target[0] == '?') + return 0; + + nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip)); + strncpy(target_buf, ifp->if_data, nr); + return nr; +} + +#define DUMMY_TARGET \ + "The target of this symbolic link could not be recovered at all and " \ + "has been replaced with this explanatory message. To avoid " \ + "accidentally pointing to an existing file path, this message is " \ + "longer than the maximum supported file name length. That is an " \ + "acceptable length for a symlink target on XFS but will produce " \ + "File Name Too Long errors if resolved." + +/* Salvage whatever we can of the target. */ +STATIC int +xrep_symlink_salvage( + struct xfs_scrub *sc) +{ + char *target_buf = sc->buf; + ssize_t buflen = 0; + + BUILD_BUG_ON(sizeof(DUMMY_TARGET) - 1 <= NAME_MAX); + + /* + * Salvage the target if there weren't any corruption problems observed + * while scanning it. + */ + if (!(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { + if (sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) + buflen = xrep_symlink_salvage_inline(sc); + else + buflen = xrep_symlink_salvage_remote(sc); + if (buflen < 0) + return buflen; + + /* + * NULL-terminate the buffer because the ondisk target does not + * do that for us. If salvage didn't find the exact amount of + * data that we expected to find, don't salvage anything. + */ + target_buf[buflen] = 0; + if (strlen(target_buf) != sc->ip->i_disk_size) + buflen = 0; + } + + /* + * Change an empty target into a dummy target and clear the symlink + * target zapped flag.
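Aside: the salvage policy above accepts recovered bytes only when the NUL-terminated length matches the ondisk i_size; anything else falls back to an unresolvable placeholder target. The decision in isolation (buffer contents, sizes, and the fallback string are invented for the demo):

#include <stdio.h>
#include <string.h>

#define FALLBACK_TARGET "symlink-target-could-not-be-recovered"

/* Accept the salvaged bytes only if the NUL-terminated length matches
 * what the inode claims; otherwise substitute the fallback. @buf must
 * have room for the terminator. */
static const char *choose_target(char *buf, size_t salvaged,
		size_t disk_size)
{
	buf[salvaged] = '\0';	/* the ondisk target has no NUL */
	if (salvaged > 0 && strlen(buf) == disk_size)
		return buf;
	return FALLBACK_TARGET;
}

int main(void)
{
	char good[8] = "abc";
	char bad[8] = { 'a', '\0', 'c' };	/* embedded NUL */

	printf("%s\n", choose_target(good, 3, 3));	/* "abc" */
	printf("%s\n", choose_target(bad, 3, 3));	/* fallback */
	return 0;
}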
+ */ + if (buflen == 0) { + sc->sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; + sprintf(target_buf, DUMMY_TARGET); + } + + trace_xrep_symlink_salvage_target(sc->ip, target_buf, + strlen(target_buf)); + return 0; +} + +STATIC void +xrep_symlink_local_to_remote( + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp, + void *priv) +{ + struct xfs_scrub *sc = priv; + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + xfs_symlink_local_to_remote(tp, bp, ip, ifp, NULL); + + if (!xfs_has_crc(sc->mp)) + return; + + dsl->sl_owner = cpu_to_be64(sc->ip->i_ino); + xfs_trans_log_buf(tp, bp, 0, + sizeof(struct xfs_dsymlink_hdr) + ifp->if_bytes - 1); +} + +/* + * Prepare both links' data forks for an exchange. Promote the tempfile from + * local format to extents format, and if the file being repaired has a + * shortform data fork, turn it into an empty extent list. + */ +STATIC int +xrep_symlink_swap_prep( + struct xfs_scrub *sc, + bool temp_local, + bool ip_local) +{ + int error; + + /* + * If the temp link is in shortform format, convert that to a remote + * target so that we can use the atomic mapping exchange. + */ + if (temp_local) { + int logflags = XFS_ILOG_CORE; + + error = xfs_bmap_local_to_extents(sc->tp, sc->tempip, 1, + &logflags, XFS_DATA_FORK, + xrep_symlink_local_to_remote, + sc); + if (error) + return error; + + xfs_trans_log_inode(sc->tp, sc->ip, 0); + + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + } + + /* + * If the file being repaired had a shortform data fork, convert that + * to an empty extent list in preparation for the atomic mapping + * exchange. + */ + if (ip_local) { + struct xfs_ifork *ifp; + + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + xfs_idestroy_fork(ifp); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; + ifp->if_nextents = 0; + ifp->if_bytes = 0; + ifp->if_data = NULL; + ifp->if_height = 0; + + xfs_trans_log_inode(sc->tp, sc->ip, + XFS_ILOG_CORE | XFS_ILOG_DDATA); + } + + return 0; +} + +/* Exchange the temporary symlink's data fork with the one being repaired. */ +STATIC int +xrep_symlink_swap( + struct xfs_scrub *sc) +{ + struct xrep_tempexch *tx = sc->buf; + bool ip_local, temp_local; + int error; + + ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL; + temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL; + + /* + * If both links have local format data forks and the rebuilt + * remote data would fit in the repaired file's data fork, copy the + * contents from the tempfile and declare ourselves done. + */ + if (ip_local && temp_local && + sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) { + xrep_tempfile_copyout_local(sc, XFS_DATA_FORK); + return 0; + } + + /* Otherwise, make sure both data forks are in block-mapping mode. */ + error = xrep_symlink_swap_prep(sc, temp_local, ip_local); + if (error) + return error; + + return xrep_tempexch_contents(sc, tx); +} + +/* + * Free all the remote blocks and reset the data fork. The caller must join + * the inode to the transaction. This function returns with the inode joined + * to a clean scrub transaction. + */ +STATIC int +xrep_symlink_reset_fork( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK); + int error; + + /* Unmap all the remote target buffers. */ + if (xfs_ifork_has_extents(ifp)) { + error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); + if (error) + return error; + } + + trace_xrep_symlink_reset_fork(sc->tempip); + + /* Reset the temp symlink target to dummy content.
*/ + xfs_idestroy_fork(ifp); + return xfs_symlink_write_target(sc->tp, sc->tempip, sc->tempip->i_ino, + "?", 1, 0, 0); +} + +/* + * Reinitialize a link target. Caller must ensure the inode is joined to + * the transaction. + */ +STATIC int +xrep_symlink_rebuild( + struct xfs_scrub *sc) +{ + struct xrep_tempexch *tx; + char *target_buf = sc->buf; + xfs_fsblock_t fs_blocks; + unsigned int target_len; + unsigned int resblks; + int error; + + /* How many blocks do we need? */ + target_len = strlen(target_buf); + ASSERT(target_len != 0); + if (target_len == 0 || target_len > XFS_SYMLINK_MAXLEN) + return -EFSCORRUPTED; + + trace_xrep_symlink_rebuild(sc->ip); + + /* + * In preparation to write the new symlink target to the temporary + * file, drop the ILOCK of the file being repaired (it shouldn't be + * joined) and take the ILOCK of the temporary file. + * + * The VFS does not take the IOLOCK while reading a symlink (and new + * symlinks are hidden with INEW until they've been written) so it's + * possible that a readlink() could see the old corrupted contents + * while we're doing this. + */ + xchk_iunlock(sc, XFS_ILOCK_EXCL); + xrep_tempfile_ilock(sc); + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + + /* + * Reserve resources to reinitialize the target. We're allowed to + * exceed file quota to repair inconsistent metadata, though this is + * unlikely. + */ + fs_blocks = xfs_symlink_blocks(sc->mp, target_len); + resblks = xfs_symlink_space_res(sc->mp, target_len, fs_blocks); + error = xfs_trans_reserve_quota_nblks(sc->tp, sc->tempip, resblks, 0, + true); + if (error) + return error; + + /* Erase the dummy target set up by the tempfile initialization. */ + xfs_idestroy_fork(&sc->tempip->i_df); + sc->tempip->i_df.if_bytes = 0; + sc->tempip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; + + /* Write the salvaged target to the temporary link. */ + error = xfs_symlink_write_target(sc->tp, sc->tempip, sc->ip->i_ino, + target_buf, target_len, fs_blocks, resblks); + if (error) + return error; + + /* + * Commit the repair transaction so that we can use the atomic mapping + * exchange functions to compute the correct block reservations and + * re-lock the inodes. + */ + target_buf = NULL; + error = xrep_trans_commit(sc); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + xrep_tempfile_iunlock(sc); + + /* + * We're done with the temporary buffer, so we can reuse it for the + * tempfile contents exchange information. + */ + tx = sc->buf; + error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, tx); + if (error) + return error; + + /* + * Exchange the temp link's data fork with the file being repaired. + * This recreates the transaction and takes the ILOCKs of the file + * being repaired and the temporary file. + */ + error = xrep_symlink_swap(sc); + if (error) + return error; + + /* + * Release the old symlink blocks and reset the data fork of the temp + * link to an empty shortform link. This is the last repair action we + * perform on the symlink, so we don't need to clean the transaction. + */ + return xrep_symlink_reset_fork(sc); +} + +/* Repair a symbolic link. */ +int +xrep_symlink( + struct xfs_scrub *sc) +{ + int error; + + /* The rmapbt is required to reap the old data fork. */ + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + /* We require atomic file exchange range to rebuild anything. 
*/ + if (!xfs_has_exchange_range(sc->mp)) + return -EOPNOTSUPP; + + ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); + + error = xrep_symlink_salvage(sc); + if (error) + return error; + + /* Now reset the target. */ + error = xrep_symlink_rebuild(sc); + if (error) + return error; + + return xrep_trans_commit(sc); +} diff --git a/fs/xfs/scrub/tempexch.h b/fs/xfs/scrub/tempexch.h new file mode 100644 index 000000000000..995ba187c5aa --- /dev/null +++ b/fs/xfs/scrub/tempexch.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_TEMPEXCH_H__ +#define __XFS_SCRUB_TEMPEXCH_H__ + +#ifdef CONFIG_XFS_ONLINE_REPAIR +struct xrep_tempexch { + struct xfs_exchmaps_req req; +}; + +int xrep_tempexch_trans_reserve(struct xfs_scrub *sc, int whichfork, + struct xrep_tempexch *ti); +int xrep_tempexch_trans_alloc(struct xfs_scrub *sc, int whichfork, + struct xrep_tempexch *ti); + +int xrep_tempexch_contents(struct xfs_scrub *sc, struct xrep_tempexch *ti); +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_TEMPEXCH_H__ */ diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c new file mode 100644 index 000000000000..b747b625c5ee --- /dev/null +++ b/fs/xfs/scrub/tempfile.c @@ -0,0 +1,851 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_quota.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_dir2.h" +#include "xfs_exchrange.h" +#include "xfs_exchmaps.h" +#include "xfs_defer.h" +#include "xfs_symlink_remote.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/trace.h" +#include "scrub/tempfile.h" +#include "scrub/tempexch.h" +#include "scrub/xfile.h" + +/* + * Create a temporary file for reconstructing metadata, with the intention of + * atomically exchanging the temporary file's contents with the file that's + * being repaired. + */ +int +xrep_tempfile_create( + struct xfs_scrub *sc, + uint16_t mode) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = NULL; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + struct xfs_trans_res *tres; + struct xfs_inode *dp = mp->m_rootip; + xfs_ino_t ino; + unsigned int resblks; + bool is_dir = S_ISDIR(mode); + int error; + + if (xfs_is_shutdown(mp)) + return -EIO; + if (xfs_is_readonly(mp)) + return -EROFS; + + ASSERT(sc->tp == NULL); + ASSERT(sc->tempip == NULL); + + /* + * Make sure that we have allocated dquot(s) on disk. The temporary + * inode should be completely root owned so that we don't fail due to + * quota limits. 
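Aside: the temporary file is born on the unlinked list so that a crash or log recovery reclaims it automatically. The nearest userspace analogue is O_TMPFILE, which likewise yields a file with no directory entry that vanishes on last close:

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* An anonymous file: allocated in /tmp's filesystem but never
	 * linked into any directory, freed automatically on close. */
	int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "staging data", 12) != 12)
		perror("write");
	close(fd);	/* storage reclaimed, like the unlinked list */
	return 0;
}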
+ */ + error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, + XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp); + if (error) + return error; + + if (is_dir) { + resblks = xfs_mkdir_space_res(mp, 0); + tres = &M_RES(mp)->tr_mkdir; + } else { + resblks = XFS_IALLOC_SPACE_RES(mp); + tres = &M_RES(mp)->tr_create_tmpfile; + } + + error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks, + &tp); + if (error) + goto out_release_dquots; + + /* Allocate inode, set up directory. */ + error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); + if (error) + goto out_trans_cancel; + error = xfs_init_new_inode(&nop_mnt_idmap, tp, dp, ino, mode, 0, 0, + 0, false, &sc->tempip); + if (error) + goto out_trans_cancel; + + /* Change the ownership of the inode to root. */ + VFS_I(sc->tempip)->i_uid = GLOBAL_ROOT_UID; + VFS_I(sc->tempip)->i_gid = GLOBAL_ROOT_GID; + sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT); + xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE); + + /* + * Mark our temporary file as private so that LSMs and the ACL code + * don't try to add their own metadata or reason about these files. + * The file should never be exposed to userspace. + */ + VFS_I(sc->tempip)->i_flags |= S_PRIVATE; + VFS_I(sc->tempip)->i_opflags &= ~IOP_XATTR; + + if (is_dir) { + error = xfs_dir_init(tp, sc->tempip, dp); + if (error) + goto out_trans_cancel; + } else if (S_ISLNK(VFS_I(sc->tempip)->i_mode)) { + /* + * Initialize the temporary symlink with a meaningless target + * that won't trip the verifiers. Repair must rewrite the + * target with meaningful content before swapping with the file + * being repaired. A single-byte target will not write a + * remote target block, so the owner is irrelevant. + */ + error = xfs_symlink_write_target(tp, sc->tempip, + sc->tempip->i_ino, ".", 1, 0, 0); + if (error) + goto out_trans_cancel; + } + + /* + * Attach the dquot(s) to the inodes and modify them incore. + * These ids of the inode couldn't have changed since the new + * inode has been locked ever since it was created. + */ + xfs_qm_vop_create_dqattach(tp, sc->tempip, udqp, gdqp, pdqp); + + /* + * Put our temp file on the unlinked list so it's purged automatically. + * All file-based metadata being reconstructed using this file must be + * atomically exchanged with the original file because the contents + * here will be purged when the inode is dropped or log recovery cleans + * out the unlinked list. + */ + error = xfs_iunlink(tp, sc->tempip); + if (error) + goto out_trans_cancel; + + error = xfs_trans_commit(tp); + if (error) + goto out_release_inode; + + trace_xrep_tempfile_create(sc); + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + /* Finish setting up the incore / vfs context. */ + xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL); + xfs_setup_iops(sc->tempip); + xfs_finish_inode_setup(sc->tempip); + + sc->temp_ilock_flags = 0; + return error; + +out_trans_cancel: + xfs_trans_cancel(tp); +out_release_inode: + /* + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. + */ + if (sc->tempip) { + xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL); + xfs_finish_inode_setup(sc->tempip); + xchk_irele(sc, sc->tempip); + } +out_release_dquots: + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + return error; +} + +/* Take IOLOCK_EXCL on the temporary file, maybe. 
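Aside: xrep_tempfile_iolock_polled() below spins on a trylock with a short delay rather than blocking, because the caller already holds another inode's lock. The same pattern with pthreads (minus the fatal-signal check a real caller would keep):

#include <stdio.h>
#include <pthread.h>
#include <time.h>

static pthread_mutex_t temp_lock = PTHREAD_MUTEX_INITIALIZER;

/* Poll a trylock with a ~1ms sleep between attempts, like the
 * xrep_tempfile_iolock_nowait()/delay(1) loop. */
static void lock_polled(pthread_mutex_t *m)
{
	struct timespec tick = { .tv_sec = 0, .tv_nsec = 1000000 };

	while (pthread_mutex_trylock(m) != 0)
		nanosleep(&tick, NULL);
}

int main(void)
{
	lock_polled(&temp_lock);
	puts("got the lock");
	pthread_mutex_unlock(&temp_lock);
	return 0;
}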
*/ +bool +xrep_tempfile_iolock_nowait( + struct xfs_scrub *sc) +{ + if (xfs_ilock_nowait(sc->tempip, XFS_IOLOCK_EXCL)) { + sc->temp_ilock_flags |= XFS_IOLOCK_EXCL; + return true; + } + + return false; +} + +/* + * Take the temporary file's IOLOCK while holding a different inode's IOLOCK. + * In theory nobody else should hold the tempfile's IOLOCK, but we use trylock + * to avoid deadlocks and lockdep complaints. + */ +int +xrep_tempfile_iolock_polled( + struct xfs_scrub *sc) +{ + int error = 0; + + while (!xrep_tempfile_iolock_nowait(sc)) { + if (xchk_should_terminate(sc, &error)) + return error; + delay(1); + } + + return 0; +} + +/* Release IOLOCK_EXCL on the temporary file. */ +void +xrep_tempfile_iounlock( + struct xfs_scrub *sc) +{ + xfs_iunlock(sc->tempip, XFS_IOLOCK_EXCL); + sc->temp_ilock_flags &= ~XFS_IOLOCK_EXCL; +} + +/* Prepare the temporary file for metadata updates by grabbing ILOCK_EXCL. */ +void +xrep_tempfile_ilock( + struct xfs_scrub *sc) +{ + sc->temp_ilock_flags |= XFS_ILOCK_EXCL; + xfs_ilock(sc->tempip, XFS_ILOCK_EXCL); +} + +/* Try to grab ILOCK_EXCL on the temporary file. */ +bool +xrep_tempfile_ilock_nowait( + struct xfs_scrub *sc) +{ + if (xfs_ilock_nowait(sc->tempip, XFS_ILOCK_EXCL)) { + sc->temp_ilock_flags |= XFS_ILOCK_EXCL; + return true; + } + + return false; +} + +/* Unlock ILOCK_EXCL on the temporary file after an update. */ +void +xrep_tempfile_iunlock( + struct xfs_scrub *sc) +{ + xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL); + sc->temp_ilock_flags &= ~XFS_ILOCK_EXCL; +} + +/* + * Begin the process of making changes to both the file being scrubbed and + * the temporary file by taking ILOCK_EXCL on both. + */ +void +xrep_tempfile_ilock_both( + struct xfs_scrub *sc) +{ + xfs_lock_two_inodes(sc->ip, XFS_ILOCK_EXCL, sc->tempip, XFS_ILOCK_EXCL); + sc->ilock_flags |= XFS_ILOCK_EXCL; + sc->temp_ilock_flags |= XFS_ILOCK_EXCL; +} + +/* Unlock ILOCK_EXCL on both files. */ +void +xrep_tempfile_iunlock_both( + struct xfs_scrub *sc) +{ + xrep_tempfile_iunlock(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL); +} + +/* Release the temporary file. */ +void +xrep_tempfile_rele( + struct xfs_scrub *sc) +{ + if (!sc->tempip) + return; + + if (sc->temp_ilock_flags) { + xfs_iunlock(sc->tempip, sc->temp_ilock_flags); + sc->temp_ilock_flags = 0; + } + + xchk_irele(sc, sc->tempip); + sc->tempip = NULL; +} + +/* + * Make sure that the given range of the data fork of the temporary file is + * mapped to written blocks. The caller must ensure that both inodes are + * joined to the transaction. + */ +int +xrep_tempfile_prealloc( + struct xfs_scrub *sc, + xfs_fileoff_t off, + xfs_filblks_t len) +{ + struct xfs_bmbt_irec map; + xfs_fileoff_t end = off + len; + int error; + + ASSERT(sc->tempip != NULL); + ASSERT(!XFS_NOT_DQATTACHED(sc->mp, sc->tempip)); + + for (; off < end; off = map.br_startoff + map.br_blockcount) { + int nmaps = 1; + + /* + * If we have a real extent mapping this block then we're + * in ok shape. + */ + error = xfs_bmapi_read(sc->tempip, off, end - off, &map, &nmaps, + XFS_DATA_FORK); + if (error) + return error; + if (nmaps == 0) { + ASSERT(nmaps != 0); + return -EFSCORRUPTED; + } + + if (xfs_bmap_is_written_extent(&map)) + continue; + + /* + * If we find a delalloc reservation then something is very + * very wrong. Bail out. + */ + if (map.br_startblock == DELAYSTARTBLOCK) + return -EFSCORRUPTED; + + /* + * Make sure this block has a real zeroed extent allocated to + * it. 
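Aside: xrep_tempfile_prealloc() above walks the range and converts anything that is not already a written extent into real zeroed space, so later copy-in cannot fail for lack of blocks. In userspace terms this is roughly the guarantee posix_fallocate() makes for a byte range:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/prealloc-demo", O_CREAT | O_TRUNC | O_RDWR, 0600);
	int err;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Guarantee real backing store for the first 1MiB before any
	 * writes land, so later I/O cannot fail with ENOSPC. */
	err = posix_fallocate(fd, 0, 1 << 20);
	if (err)
		fprintf(stderr, "fallocate failed: %d\n", err);

	close(fd);
	unlink("/tmp/prealloc-demo");
	return err ? 1 : 0;
}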
+/*
+ * Make sure that the given range of the data fork of the temporary file is
+ * mapped to written blocks.  The caller must ensure that both inodes are
+ * joined to the transaction.
+ */
+int
+xrep_tempfile_prealloc(
+	struct xfs_scrub	*sc,
+	xfs_fileoff_t		off,
+	xfs_filblks_t		len)
+{
+	struct xfs_bmbt_irec	map;
+	xfs_fileoff_t		end = off + len;
+	int			error;
+
+	ASSERT(sc->tempip != NULL);
+	ASSERT(!XFS_NOT_DQATTACHED(sc->mp, sc->tempip));
+
+	for (; off < end; off = map.br_startoff + map.br_blockcount) {
+		int		nmaps = 1;
+
+		/*
+		 * If we have a real extent mapping this block then we're
+		 * in ok shape.
+		 */
+		error = xfs_bmapi_read(sc->tempip, off, end - off, &map, &nmaps,
+				XFS_DATA_FORK);
+		if (error)
+			return error;
+		if (nmaps == 0) {
+			ASSERT(nmaps != 0);
+			return -EFSCORRUPTED;
+		}
+
+		if (xfs_bmap_is_written_extent(&map))
+			continue;
+
+		/*
+		 * If we find a delalloc reservation then something is very
+		 * very wrong.  Bail out.
+		 */
+		if (map.br_startblock == DELAYSTARTBLOCK)
+			return -EFSCORRUPTED;
+
+		/*
+		 * Make sure this block has a real zeroed extent allocated to
+		 * it.
+		 */
+		nmaps = 1;
+		error = xfs_bmapi_write(sc->tp, sc->tempip, off, end - off,
+				XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &map,
+				&nmaps);
+		if (error)
+			return error;
+		if (nmaps != 1)
+			return -EFSCORRUPTED;
+
+		trace_xrep_tempfile_prealloc(sc, XFS_DATA_FORK, &map);
+
+		/* Commit new extent and all deferred work. */
+		error = xfs_defer_finish(&sc->tp);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
+
+/*
+ * Write data to each block of a file.  The given range of the tempfile's data
+ * fork must already be populated with written extents.
+ */
+int
+xrep_tempfile_copyin(
+	struct xfs_scrub	*sc,
+	xfs_fileoff_t		off,
+	xfs_filblks_t		len,
+	xrep_tempfile_copyin_fn	prep_fn,
+	void			*data)
+{
+	LIST_HEAD(buffers_list);
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_buf		*bp;
+	xfs_fileoff_t		flush_mask;
+	xfs_fileoff_t		end = off + len;
+	loff_t			pos = XFS_FSB_TO_B(mp, off);
+	int			error = 0;
+
+	ASSERT(S_ISREG(VFS_I(sc->tempip)->i_mode));
+
+	/* Flush buffers to disk every 512K */
+	flush_mask = XFS_B_TO_FSBT(mp, (1U << 19)) - 1;
+
+	for (; off < end; off++, pos += mp->m_sb.sb_blocksize) {
+		struct xfs_bmbt_irec	map;
+		int			nmaps = 1;
+
+		/* Read block mapping for this file block. */
+		error = xfs_bmapi_read(sc->tempip, off, 1, &map, &nmaps, 0);
+		if (error)
+			goto out_err;
+		if (nmaps == 0 || !xfs_bmap_is_written_extent(&map)) {
+			error = -EFSCORRUPTED;
+			goto out_err;
+		}
+
+		/* Get the metadata buffer for this offset in the file. */
+		error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp,
+				XFS_FSB_TO_DADDR(mp, map.br_startblock),
+				mp->m_bsize, 0, &bp);
+		if (error)
+			goto out_err;
+
+		trace_xrep_tempfile_copyin(sc, XFS_DATA_FORK, &map);
+
+		/* Read in a block's worth of data from the xfile. */
+		error = prep_fn(sc, bp, data);
+		if (error) {
+			xfs_trans_brelse(sc->tp, bp);
+			goto out_err;
+		}
+
+		/* Queue buffer, and flush if we have too much dirty data. */
+		xfs_buf_delwri_queue_here(bp, &buffers_list);
+		xfs_trans_brelse(sc->tp, bp);
+
+		if (!(off & flush_mask)) {
+			error = xfs_buf_delwri_submit(&buffers_list);
+			if (error)
+				goto out_err;
+		}
+	}
+
+	/*
+	 * Write the new blocks to disk.  If the ordered list isn't empty after
+	 * that, then something went wrong and we have to fail.  This should
+	 * never happen, but we'll check anyway.
+	 */
+	error = xfs_buf_delwri_submit(&buffers_list);
+	if (error)
+		goto out_err;
+
+	if (!list_empty(&buffers_list)) {
+		ASSERT(list_empty(&buffers_list));
+		error = -EIO;
+		goto out_err;
+	}
+
+	return 0;
+
+out_err:
+	xfs_buf_delwri_cancel(&buffers_list);
+	return error;
+}
+
+/*
+ * Set the temporary file's size.  Caller must join the tempfile to the scrub
+ * transaction and is responsible for adjusting block mappings as needed.
+ */
+int
+xrep_tempfile_set_isize(
+	struct xfs_scrub	*sc,
+	unsigned long long	isize)
+{
+	if (sc->tempip->i_disk_size == isize)
+		return 0;
+
+	sc->tempip->i_disk_size = isize;
+	i_size_write(VFS_I(sc->tempip), isize);
+	return xrep_tempfile_roll_trans(sc);
+}
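The prep_fn callback passed to xrep_tempfile_copyin only has to fill the buffer it is handed; queueing and delwri flushing happen in the loop above. A minimal sketch, assuming the opaque data pointer is a caller-supplied buffer holding at least one filesystem block (the helper name and that layout are assumptions, not part of this patch):

	STATIC int
	xrep_example_prep_block(
		struct xfs_scrub	*sc,
		struct xfs_buf		*bp,
		void			*data)
	{
		/* Copy one block's worth of staged bytes into the buffer. */
		memcpy(bp->b_addr, data, BBTOB(bp->b_length));
		return 0;
	}

A caller would then write that block at file offset zero with xrep_tempfile_copyin(sc, 0, 1, xrep_example_prep_block, buf).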
+/*
+ * Roll a repair transaction involving the temporary file.  Caller must join
+ * both the temporary file and the file being scrubbed to the transaction.
+ * This function returns with both inodes joined to a new scrub transaction,
+ * or the usual negative errno.
+ */
+int
+xrep_tempfile_roll_trans(
+	struct xfs_scrub	*sc)
+{
+	int			error;
+
+	xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE);
+	error = xrep_roll_trans(sc);
+	if (error)
+		return error;
+
+	xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+	return 0;
+}
+
+/*
+ * Fill out the mapping exchange request in preparation for atomically
+ * committing the contents of a metadata file that we've rebuilt in the temp
+ * file.
+ */
+STATIC int
+xrep_tempexch_prep_request(
+	struct xfs_scrub	*sc,
+	int			whichfork,
+	struct xrep_tempexch	*tx)
+{
+	struct xfs_exchmaps_req	*req = &tx->req;
+
+	memset(tx, 0, sizeof(struct xrep_tempexch));
+
+	/* COW forks don't exist on disk. */
+	if (whichfork == XFS_COW_FORK) {
+		ASSERT(0);
+		return -EINVAL;
+	}
+
+	/* Both files should have the relevant forks. */
+	if (!xfs_ifork_ptr(sc->ip, whichfork) ||
+	    !xfs_ifork_ptr(sc->tempip, whichfork)) {
+		ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL);
+		ASSERT(xfs_ifork_ptr(sc->tempip, whichfork) != NULL);
+		return -EINVAL;
+	}
+
+	/* Exchange all mappings in both forks. */
+	req->ip1 = sc->tempip;
+	req->ip2 = sc->ip;
+	req->startoff1 = 0;
+	req->startoff2 = 0;
+	switch (whichfork) {
+	case XFS_ATTR_FORK:
+		req->flags |= XFS_EXCHMAPS_ATTR_FORK;
+		break;
+	case XFS_DATA_FORK:
+		/* Always exchange sizes when exchanging data fork mappings. */
+		req->flags |= XFS_EXCHMAPS_SET_SIZES;
+		break;
+	}
+	req->blockcount = XFS_MAX_FILEOFF;
+
+	return 0;
+}
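For a data-fork repair, the request produced by the helper above therefore ends up in this state (an illustrative initializer; every other field remains zero from the memset):

	struct xfs_exchmaps_req	req = {
		.ip1		= sc->tempip,		/* rebuilt contents */
		.ip2		= sc->ip,		/* file being repaired */
		.startoff1	= 0,
		.startoff2	= 0,
		.blockcount	= XFS_MAX_FILEOFF,	/* the whole fork */
		.flags		= XFS_EXCHMAPS_SET_SIZES,
	};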
+/*
+ * Fill out the mapping exchange resource estimation structures in preparation
+ * for exchanging the contents of a metadata file that we've rebuilt in the
+ * temp file.  Caller must hold IOLOCK_EXCL but not ILOCK_EXCL on both files.
+ */
+STATIC int
+xrep_tempexch_estimate(
+	struct xfs_scrub	*sc,
+	struct xrep_tempexch	*tx)
+{
+	struct xfs_exchmaps_req	*req = &tx->req;
+	struct xfs_ifork	*ifp;
+	struct xfs_ifork	*tifp;
+	int			whichfork = xfs_exchmaps_reqfork(req);
+	int			state = 0;
+
+	/*
+	 * The exchmaps code only knows how to exchange file fork space
+	 * mappings.  Any fork data in local format must be promoted to a
+	 * single block before the exchange can take place.
+	 */
+	ifp = xfs_ifork_ptr(sc->ip, whichfork);
+	if (ifp->if_format == XFS_DINODE_FMT_LOCAL)
+		state |= 1;
+
+	tifp = xfs_ifork_ptr(sc->tempip, whichfork);
+	if (tifp->if_format == XFS_DINODE_FMT_LOCAL)
+		state |= 2;
+
+	switch (state) {
+	case 0:
+		/* Both files have mapped extents; use the regular estimate. */
+		return xfs_exchrange_estimate(req);
+	case 1:
+		/*
+		 * The file being repaired is in local format, but the temp
+		 * file has mapped extents.  To perform the exchange, the file
+		 * being repaired must have its shortform data converted to an
+		 * ondisk block so that the forks will be in extents format.
+		 * We need one resblk for the conversion; the number of
+		 * exchanges is (worst case) the temporary file's extent count
+		 * plus the block we converted.
+		 */
+		req->ip1_bcount = sc->tempip->i_nblocks;
+		req->ip2_bcount = 1;
+		req->nr_exchanges = 1 + tifp->if_nextents;
+		req->resblks = 1;
+		break;
+	case 2:
+		/*
+		 * The temporary file is in local format, but the file being
+		 * repaired has mapped extents.  To perform the exchange, the
+		 * temp file must have its shortform data converted to an
+		 * ondisk block, and the fork changed to extents format.  We
+		 * need one resblk for the conversion; the number of exchanges
+		 * is (worst case) the extent count of the file being repaired
+		 * plus the block we converted.
+		 */
+		req->ip1_bcount = 1;
+		req->ip2_bcount = sc->ip->i_nblocks;
+		req->nr_exchanges = 1 + ifp->if_nextents;
+		req->resblks = 1;
+		break;
+	case 3:
+		/*
+		 * Both forks are in local format.  To perform the exchange,
+		 * both files must have their shortform data converted to
+		 * fsblocks, and both forks must be converted to extents
+		 * format.  We need two resblks for the two conversions, and
+		 * the number of exchanges is 1 since there's only one block at
+		 * fileoff 0.  Presumably, the caller could not exchange the
+		 * two inode fork areas directly.
+		 */
+		req->ip1_bcount = 1;
+		req->ip2_bcount = 1;
+		req->nr_exchanges = 1;
+		req->resblks = 2;
+		break;
+	}
+
+	return xfs_exchmaps_estimate_overhead(req);
+}
+
+/*
+ * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
+ * this if quota enforcement is disabled or if both inodes' dquots are the
+ * same.
+ */
+STATIC int
+xrep_tempexch_reserve_quota(
+	struct xfs_scrub	*sc,
+	const struct xrep_tempexch	*tx)
+{
+	struct xfs_trans	*tp = sc->tp;
+	const struct xfs_exchmaps_req *req = &tx->req;
+	int64_t			ddelta, rdelta;
+	int			error;
+
+	/*
+	 * Don't bother with a quota reservation if we're not enforcing them
+	 * or the two inodes have the same dquots.
+	 */
+	if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
+	    (req->ip1->i_udquot == req->ip2->i_udquot &&
+	     req->ip1->i_gdquot == req->ip2->i_gdquot &&
+	     req->ip1->i_pdquot == req->ip2->i_pdquot))
+		return 0;
+
+	/*
+	 * Quota reservation for each file comes from two sources.  First, we
+	 * need to account for any net gain in mapped blocks during the
+	 * exchange.  Second, we need reservation for the gross gain in mapped
+	 * blocks so that we don't trip over any quota block reservation
+	 * assertions.  We must reserve the gross gain because the quota code
+	 * subtracts from bcount the number of blocks that we unmap; it does
+	 * not add that quantity back to the quota block reservation.
+	 */
+	ddelta = max_t(int64_t, 0, req->ip2_bcount - req->ip1_bcount);
+	rdelta = max_t(int64_t, 0, req->ip2_rtbcount - req->ip1_rtbcount);
+	error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
+			ddelta + req->ip1_bcount, rdelta + req->ip1_rtbcount,
+			true);
+	if (error)
+		return error;
+
+	ddelta = max_t(int64_t, 0, req->ip1_bcount - req->ip2_bcount);
+	rdelta = max_t(int64_t, 0, req->ip1_rtbcount - req->ip2_rtbcount);
+	return xfs_trans_reserve_quota_nblks(tp, req->ip2,
+			ddelta + req->ip2_bcount, rdelta + req->ip2_rtbcount,
+			true);
+}
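As a worked example of the reservation math above, suppose the temp file (ip1) maps 100 blocks and the file being repaired (ip2) maps 40. The reservation against ip1's dquots is max(0, 40 - 100) + 100 = 100 blocks, and against ip2's it is max(0, 100 - 40) + 40 = 100 blocks, covering both the net and the gross gain. The same computation, written out with illustrative constants:

	int64_t	ip1_bcount = 100, ip2_bcount = 40;	/* illustrative only */
	int64_t	res1 = max_t(int64_t, 0, ip2_bcount - ip1_bcount) + ip1_bcount;
	int64_t	res2 = max_t(int64_t, 0, ip1_bcount - ip2_bcount) + ip2_bcount;
	/* res1 == 100, res2 == 100 */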
+/*
+ * Prepare an existing transaction for an atomic file contents exchange.
+ *
+ * This function fills out the mapping exchange request and resource estimation
+ * structures in preparation for exchanging the contents of a metadata file
+ * that has been rebuilt in the temp file.  Next, it reserves space and quota
+ * for the transaction.
+ *
+ * The caller must hold ILOCK_EXCL of the scrub target file and the temporary
+ * file.  The caller must join both inodes to the transaction with no unlock
+ * flags, and is responsible for dropping both ILOCKs when appropriate.  Only
+ * use this when those ILOCKs cannot be dropped.
+ */
+int
+xrep_tempexch_trans_reserve(
+	struct xfs_scrub	*sc,
+	int			whichfork,
+	struct xrep_tempexch	*tx)
+{
+	int			error;
+
+	ASSERT(sc->tp != NULL);
+	xfs_assert_ilocked(sc->ip, XFS_ILOCK_EXCL);
+	xfs_assert_ilocked(sc->tempip, XFS_ILOCK_EXCL);
+
+	error = xrep_tempexch_prep_request(sc, whichfork, tx);
+	if (error)
+		return error;
+
+	error = xfs_exchmaps_estimate(&tx->req);
+	if (error)
+		return error;
+
+	error = xfs_trans_reserve_more(sc->tp, tx->req.resblks, 0);
+	if (error)
+		return error;
+
+	return xrep_tempexch_reserve_quota(sc, tx);
+}
+
+/*
+ * Create a new transaction for a file contents exchange.
+ *
+ * This function fills out the mapping exchange request and resource
+ * estimation structures in preparation for exchanging the contents of a
+ * metadata file that has been rebuilt in the temp file.  Next, it reserves
+ * space, takes ILOCK_EXCL of both inodes, joins them to the transaction and
+ * reserves quota for the transaction.
+ *
+ * The caller is responsible for dropping both ILOCKs when appropriate.
+ */
+int
+xrep_tempexch_trans_alloc(
+	struct xfs_scrub	*sc,
+	int			whichfork,
+	struct xrep_tempexch	*tx)
+{
+	unsigned int		flags = 0;
+	int			error;
+
+	ASSERT(sc->tp == NULL);
+	ASSERT(xfs_has_exchange_range(sc->mp));
+
+	error = xrep_tempexch_prep_request(sc, whichfork, tx);
+	if (error)
+		return error;
+
+	error = xrep_tempexch_estimate(sc, tx);
+	if (error)
+		return error;
+
+	if (xfs_has_lazysbcount(sc->mp))
+		flags |= XFS_TRANS_RES_FDBLKS;
+
+	error = xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
+			tx->req.resblks, 0, flags, &sc->tp);
+	if (error)
+		return error;
+
+	sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
+	sc->ilock_flags |= XFS_ILOCK_EXCL;
+	xfs_exchrange_ilock(sc->tp, sc->ip, sc->tempip);
+
+	return xrep_tempexch_reserve_quota(sc, tx);
+}
+
+/*
+ * Exchange file mappings (and hence file contents) between the file being
+ * repaired and the temporary file.  Returns with both inodes locked and joined
+ * to a clean scrub transaction.
+ */
+int
+xrep_tempexch_contents(
+	struct xfs_scrub	*sc,
+	struct xrep_tempexch	*tx)
+{
+	int			error;
+
+	ASSERT(xfs_has_exchange_range(sc->mp));
+
+	xfs_exchange_mappings(sc->tp, &tx->req);
+	error = xfs_defer_finish(&sc->tp);
+	if (error)
+		return error;
+
+	/*
+	 * If we exchanged the ondisk sizes of two metadata files, we must
+	 * exchange the incore sizes as well.
+	 */
+	if (tx->req.flags & XFS_EXCHMAPS_SET_SIZES) {
+		loff_t	temp;
+
+		temp = i_size_read(VFS_I(sc->ip));
+		i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip)));
+		i_size_write(VFS_I(sc->tempip), temp);
+	}
+
+	return 0;
+}
+
+/*
+ * Write local format data from one of the temporary file's forks into the same
+ * fork of the file being repaired, and exchange the file sizes, if
+ * appropriate.  Caller must ensure that the file being repaired has enough
+ * fork space to hold all the bytes.
+ */ +void +xrep_tempfile_copyout_local( + struct xfs_scrub *sc, + int whichfork) +{ + struct xfs_ifork *temp_ifp; + struct xfs_ifork *ifp; + unsigned int ilog_flags = XFS_ILOG_CORE; + + temp_ifp = xfs_ifork_ptr(sc->tempip, whichfork); + ifp = xfs_ifork_ptr(sc->ip, whichfork); + + ASSERT(temp_ifp != NULL); + ASSERT(ifp != NULL); + ASSERT(temp_ifp->if_format == XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); + + switch (whichfork) { + case XFS_DATA_FORK: + ASSERT(sc->tempip->i_disk_size <= + xfs_inode_data_fork_size(sc->ip)); + break; + case XFS_ATTR_FORK: + ASSERT(sc->tempip->i_forkoff >= sc->ip->i_forkoff); + break; + default: + ASSERT(0); + return; + } + + /* Recreate @sc->ip's incore fork (ifp) with data from temp_ifp. */ + xfs_idestroy_fork(ifp); + xfs_init_local_fork(sc->ip, whichfork, temp_ifp->if_data, + temp_ifp->if_bytes); + + if (whichfork == XFS_DATA_FORK) { + i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip))); + sc->ip->i_disk_size = sc->tempip->i_disk_size; + } + + ilog_flags |= xfs_ilog_fdata(whichfork); + xfs_trans_log_inode(sc->tp, sc->ip, ilog_flags); +} + +/* Decide if a given XFS inode is a temporary file for a repair. */ +bool +xrep_is_tempfile( + const struct xfs_inode *ip) +{ + const struct inode *inode = &ip->i_vnode; + + if (IS_PRIVATE(inode) && !(inode->i_opflags & IOP_XATTR)) + return true; + + return false; +} diff --git a/fs/xfs/scrub/tempfile.h b/fs/xfs/scrub/tempfile.h new file mode 100644 index 000000000000..e51399f595fe --- /dev/null +++ b/fs/xfs/scrub/tempfile.h @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_TEMPFILE_H__ +#define __XFS_SCRUB_TEMPFILE_H__ + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_tempfile_create(struct xfs_scrub *sc, uint16_t mode); +void xrep_tempfile_rele(struct xfs_scrub *sc); + +bool xrep_tempfile_iolock_nowait(struct xfs_scrub *sc); +int xrep_tempfile_iolock_polled(struct xfs_scrub *sc); +void xrep_tempfile_iounlock(struct xfs_scrub *sc); + +void xrep_tempfile_ilock(struct xfs_scrub *sc); +bool xrep_tempfile_ilock_nowait(struct xfs_scrub *sc); +void xrep_tempfile_iunlock(struct xfs_scrub *sc); +void xrep_tempfile_iunlock_both(struct xfs_scrub *sc); +void xrep_tempfile_ilock_both(struct xfs_scrub *sc); + +int xrep_tempfile_prealloc(struct xfs_scrub *sc, xfs_fileoff_t off, + xfs_filblks_t len); + +enum xfs_blft; + +typedef int (*xrep_tempfile_copyin_fn)(struct xfs_scrub *sc, + struct xfs_buf *bp, void *data); + +int xrep_tempfile_copyin(struct xfs_scrub *sc, xfs_fileoff_t off, + xfs_filblks_t len, xrep_tempfile_copyin_fn fn, void *data); + +int xrep_tempfile_set_isize(struct xfs_scrub *sc, unsigned long long isize); + +int xrep_tempfile_roll_trans(struct xfs_scrub *sc); +void xrep_tempfile_copyout_local(struct xfs_scrub *sc, int whichfork); +bool xrep_is_tempfile(const struct xfs_inode *ip); +#else +static inline void xrep_tempfile_iolock_both(struct xfs_scrub *sc) +{ + xchk_ilock(sc, XFS_IOLOCK_EXCL); +} +# define xrep_is_tempfile(ip) (false) +# define xrep_tempfile_rele(sc) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_TEMPFILE_H__ */ diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 3dd281d6d185..4470ad0533b8 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -19,13 +19,19 @@ #include "xfs_da_format.h" #include "xfs_dir2.h" #include "xfs_rmap.h" +#include "xfs_parent.h" #include "scrub/scrub.h" #include 
"scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/quota.h" #include "scrub/iscan.h" +#include "scrub/orphanage.h" #include "scrub/nlinks.h" #include "scrub/fscounters.h" +#include "scrub/bitmap.h" +#include "scrub/ino_bitmap.h" +#include "scrub/xfblob.h" +#include "scrub/dirtree.h" /* Figure out which block the btree cursor was pointing to. */ static inline xfs_fsblock_t diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 5b294be52c55..e27daa51cab6 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -26,6 +26,10 @@ struct xchk_iscan; struct xchk_nlink; struct xchk_fscounters; struct xfs_rmap_update_params; +struct xfs_parent_rec; +enum xchk_dirpath_outcome; +struct xchk_dirtree; +struct xchk_dirtree_outcomes; /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -64,6 +68,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIRTREE); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER); #define XFS_SCRUB_TYPE_STRINGS \ { XFS_SCRUB_TYPE_PROBE, "probe" }, \ @@ -93,7 +99,9 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY); { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }, \ { XFS_SCRUB_TYPE_QUOTACHECK, "quotacheck" }, \ { XFS_SCRUB_TYPE_NLINKS, "nlinks" }, \ - { XFS_SCRUB_TYPE_HEALTHY, "healthy" } + { XFS_SCRUB_TYPE_HEALTHY, "healthy" }, \ + { XFS_SCRUB_TYPE_DIRTREE, "dirtree" }, \ + { XFS_SCRUB_TYPE_BARRIER, "barrier" } #define XFS_SCRUB_FLAG_STRINGS \ { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \ @@ -169,6 +177,8 @@ DEFINE_EVENT(xchk_class, name, \ DEFINE_SCRUB_EVENT(xchk_start); DEFINE_SCRUB_EVENT(xchk_done); DEFINE_SCRUB_EVENT(xchk_deadlock_retry); +DEFINE_SCRUB_EVENT(xchk_dirtree_start); +DEFINE_SCRUB_EVENT(xchk_dirtree_done); DEFINE_SCRUB_EVENT(xrep_attempt); DEFINE_SCRUB_EVENT(xrep_done); @@ -199,6 +209,81 @@ DEFINE_EVENT(xchk_fsgate_class, name, \ DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_enable); DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_disable); +DECLARE_EVENT_CLASS(xchk_vector_head_class, + TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_vec_head *vhead), + TP_ARGS(ip, vhead), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, inum) + __field(unsigned int, gen) + __field(unsigned int, flags) + __field(unsigned short, rest_us) + __field(unsigned short, nr_vecs) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->agno = vhead->svh_agno; + __entry->inum = vhead->svh_ino; + __entry->gen = vhead->svh_gen; + __entry->flags = vhead->svh_flags; + __entry->rest_us = vhead->svh_rest_us; + __entry->nr_vecs = vhead->svh_nr; + ), + TP_printk("dev %d:%d ino 0x%llx agno 0x%x inum 0x%llx gen 0x%x flags 0x%x rest_us %u nr_vecs %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->agno, + __entry->inum, + __entry->gen, + __entry->flags, + __entry->rest_us, + __entry->nr_vecs) +) +#define DEFINE_SCRUBV_HEAD_EVENT(name) \ +DEFINE_EVENT(xchk_vector_head_class, name, \ + TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_vec_head *vhead), \ + TP_ARGS(ip, vhead)) + +DEFINE_SCRUBV_HEAD_EVENT(xchk_scrubv_start); + +DECLARE_EVENT_CLASS(xchk_vector_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_scrub_vec_head *vhead, + unsigned int vec_nr, struct xfs_scrub_vec *v), + TP_ARGS(mp, vhead, vec_nr, v), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, vec_nr) + 
__field(unsigned int, vec_type) + __field(unsigned int, vec_flags) + __field(int, vec_ret) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->vec_nr = vec_nr; + __entry->vec_type = v->sv_type; + __entry->vec_flags = v->sv_flags; + __entry->vec_ret = v->sv_ret; + ), + TP_printk("dev %d:%d vec[%u] type %s flags %s ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->vec_nr, + __print_symbolic(__entry->vec_type, XFS_SCRUB_TYPE_STRINGS), + __print_flags(__entry->vec_flags, "|", XFS_SCRUB_FLAG_STRINGS), + __entry->vec_ret) +) +#define DEFINE_SCRUBV_EVENT(name) \ +DEFINE_EVENT(xchk_vector_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_scrub_vec_head *vhead, \ + unsigned int vec_nr, struct xfs_scrub_vec *v), \ + TP_ARGS(mp, vhead, vec_nr, v)) + +DEFINE_SCRUBV_EVENT(xchk_scrubv_barrier_fail); +DEFINE_SCRUBV_EVENT(xchk_scrubv_item); +DEFINE_SCRUBV_EVENT(xchk_scrubv_outcome); + TRACE_EVENT(xchk_op_error, TP_PROTO(struct xfs_scrub *sc, xfs_agnumber_t agno, xfs_agblock_t bno, int error, void *ret_ip), @@ -364,6 +449,7 @@ DEFINE_EVENT(xchk_fblock_error_class, name, \ DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_error); DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_warning); +DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_preen); #ifdef CONFIG_XFS_QUOTA DECLARE_EVENT_CLASS(xchk_dqiter_class, @@ -947,6 +1033,7 @@ DEFINE_XFILE_EVENT(xfile_store); DEFINE_XFILE_EVENT(xfile_seek_data); DEFINE_XFILE_EVENT(xfile_get_folio); DEFINE_XFILE_EVENT(xfile_put_folio); +DEFINE_XFILE_EVENT(xfile_discard); TRACE_EVENT(xfarray_create, TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity), @@ -1300,7 +1387,7 @@ TRACE_EVENT(xchk_iscan_iget_batch, __entry->unavail) ); -TRACE_EVENT(xchk_iscan_iget_retry_wait, +DECLARE_EVENT_CLASS(xchk_iscan_retry_wait_class, TP_PROTO(struct xchk_iscan *iscan), TP_ARGS(iscan), TP_STRUCT__entry( @@ -1326,7 +1413,13 @@ TRACE_EVENT(xchk_iscan_iget_retry_wait, __entry->remaining, __entry->iget_timeout, __entry->retry_delay) -); +) +#define DEFINE_ISCAN_RETRY_WAIT_EVENT(name) \ +DEFINE_EVENT(xchk_iscan_retry_wait_class, name, \ + TP_PROTO(struct xchk_iscan *iscan), \ + TP_ARGS(iscan)) +DEFINE_ISCAN_RETRY_WAIT_EVENT(xchk_iscan_iget_retry_wait); +DEFINE_ISCAN_RETRY_WAIT_EVENT(xchk_iscan_agi_retry_wait); TRACE_EVENT(xchk_nlinks_collect_dirent, TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp, @@ -1354,6 +1447,33 @@ TRACE_EVENT(xchk_nlinks_collect_dirent, __get_str(name)) ); +TRACE_EVENT(xchk_nlinks_collect_pptr, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp, + const struct xfs_name *name, + const struct xfs_parent_rec *pptr), + TP_ARGS(mp, dp, name, pptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir) + __field(xfs_ino_t, ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->dir = dp->i_ino; + __entry->ino = be64_to_cpu(pptr->p_ino); + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d dir 0x%llx -> ino 0x%llx name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir, + __entry->ino, + __entry->namelen, + __get_str(name)) +); + TRACE_EVENT(xchk_nlinks_collect_metafile, TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino), TP_ARGS(mp, ino), @@ -1502,6 +1622,300 @@ DEFINE_EVENT(xchk_nlinks_diff_class, name, \ TP_ARGS(mp, ip, live)) DEFINE_SCRUB_NLINKS_DIFF_EVENT(xchk_nlinks_compare_inode); +DECLARE_EVENT_CLASS(xchk_pptr_class, + TP_PROTO(struct xfs_inode *ip, 
const struct xfs_name *name,
+		 xfs_ino_t far_ino),
+	TP_ARGS(ip, name, far_ino),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(unsigned int, namelen)
+		__dynamic_array(char, name, name->len)
+		__field(xfs_ino_t, far_ino)
+	),
+	TP_fast_assign(
+		__entry->dev = ip->i_mount->m_super->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->namelen = name->len;
+		memcpy(__get_str(name), name->name, name->len);
+		__entry->far_ino = far_ino;
+	),
+	TP_printk("dev %d:%d ino 0x%llx name '%.*s' far_ino 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->namelen,
+		  __get_str(name),
+		  __entry->far_ino)
+)
+#define DEFINE_XCHK_PPTR_EVENT(name) \
+DEFINE_EVENT(xchk_pptr_class, name, \
+	TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, \
+		 xfs_ino_t far_ino), \
+	TP_ARGS(ip, name, far_ino))
+DEFINE_XCHK_PPTR_EVENT(xchk_dir_defer);
+DEFINE_XCHK_PPTR_EVENT(xchk_dir_slowpath);
+DEFINE_XCHK_PPTR_EVENT(xchk_dir_ultraslowpath);
+DEFINE_XCHK_PPTR_EVENT(xchk_parent_defer);
+DEFINE_XCHK_PPTR_EVENT(xchk_parent_slowpath);
+DEFINE_XCHK_PPTR_EVENT(xchk_parent_ultraslowpath);
+
+DECLARE_EVENT_CLASS(xchk_dirtree_class,
+	TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip,
+		 unsigned int path_nr, const struct xfs_name *name,
+		 const struct xfs_parent_rec *pptr),
+	TP_ARGS(sc, ip, path_nr, name, pptr),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, path_nr)
+		__field(xfs_ino_t, child_ino)
+		__field(unsigned int, child_gen)
+		__field(xfs_ino_t, parent_ino)
+		__field(unsigned int, parent_gen)
+		__field(unsigned int, namelen)
+		__dynamic_array(char, name, name->len)
+	),
+	TP_fast_assign(
+		__entry->dev = sc->mp->m_super->s_dev;
+		__entry->path_nr = path_nr;
+		__entry->child_ino = ip->i_ino;
+		__entry->child_gen = VFS_I(ip)->i_generation;
+		__entry->parent_ino = be64_to_cpu(pptr->p_ino);
+		__entry->parent_gen = be32_to_cpu(pptr->p_gen);
+		__entry->namelen = name->len;
+		memcpy(__get_str(name), name->name, name->len);
+	),
+	TP_printk("dev %d:%d path %u child_ino 0x%llx child_gen 0x%x parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->path_nr,
+		  __entry->child_ino,
+		  __entry->child_gen,
+		  __entry->parent_ino,
+		  __entry->parent_gen,
+		  __entry->namelen,
+		  __get_str(name))
+);
+#define DEFINE_XCHK_DIRTREE_EVENT(name) \
+DEFINE_EVENT(xchk_dirtree_class, name, \
+	TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, \
+		 unsigned int path_nr, const struct xfs_name *name, \
+		 const struct xfs_parent_rec *pptr), \
+	TP_ARGS(sc, ip, path_nr, name, pptr))
+DEFINE_XCHK_DIRTREE_EVENT(xchk_dirtree_create_path);
+DEFINE_XCHK_DIRTREE_EVENT(xchk_dirpath_walk_upwards);
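For reference, a record emitted through one of the xchk_dirtree_class events above would render in the trace buffer roughly as follows; the numeric values are invented for illustration, but the layout follows the TP_printk format string exactly:

	xchk_dirtree_create_path: dev 8:16 path 0 child_ino 0x89 child_gen 0x2 parent_ino 0x80 parent_gen 0x1 name 'foo'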
+DECLARE_EVENT_CLASS(xchk_dirpath_class,
+	TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip,
+		 unsigned int path_nr, unsigned int step_nr,
+		 const struct xfs_name *name,
+		 const struct xfs_parent_rec *pptr),
+	TP_ARGS(sc, ip, path_nr, step_nr, name, pptr),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, path_nr)
+		__field(unsigned int, step_nr)
+		__field(xfs_ino_t, child_ino)
+		__field(unsigned int, child_gen)
+		__field(xfs_ino_t, parent_ino)
+		__field(unsigned int, parent_gen)
+		__field(unsigned int, namelen)
+		__dynamic_array(char, name, name->len)
+	),
+	TP_fast_assign(
+		__entry->dev = sc->mp->m_super->s_dev;
+		__entry->path_nr = path_nr;
+		__entry->step_nr = step_nr;
+		__entry->child_ino = ip->i_ino;
+		__entry->child_gen = VFS_I(ip)->i_generation;
+		__entry->parent_ino = be64_to_cpu(pptr->p_ino);
+		__entry->parent_gen = be32_to_cpu(pptr->p_gen);
+		__entry->namelen = name->len;
+		memcpy(__get_str(name), name->name, name->len);
+	),
+	TP_printk("dev %d:%d path %u step %u child_ino 0x%llx child_gen 0x%x parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->path_nr,
+		  __entry->step_nr,
+		  __entry->child_ino,
+		  __entry->child_gen,
+		  __entry->parent_ino,
+		  __entry->parent_gen,
+		  __entry->namelen,
+		  __get_str(name))
+);
+#define DEFINE_XCHK_DIRPATH_EVENT(name) \
+DEFINE_EVENT(xchk_dirpath_class, name, \
+	TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, \
+		 unsigned int path_nr, unsigned int step_nr, \
+		 const struct xfs_name *name, \
+		 const struct xfs_parent_rec *pptr), \
+	TP_ARGS(sc, ip, path_nr, step_nr, name, pptr))
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_disappeared);
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_badgen);
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_nondir_parent);
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_unlinked_parent);
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_found_next_step);
+
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_SCANNING);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_DELETE);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_CORRUPT);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_LOOP);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_STALE);
+TRACE_DEFINE_ENUM(XCHK_DIRPATH_OK);
+TRACE_DEFINE_ENUM(XREP_DIRPATH_DELETING);
+TRACE_DEFINE_ENUM(XREP_DIRPATH_DELETED);
+TRACE_DEFINE_ENUM(XREP_DIRPATH_ADOPTING);
+TRACE_DEFINE_ENUM(XREP_DIRPATH_ADOPTED);
+
+#define XCHK_DIRPATH_OUTCOME_STRINGS \
+	{ XCHK_DIRPATH_SCANNING,	"scanning" }, \
+	{ XCHK_DIRPATH_DELETE,		"delete" }, \
+	{ XCHK_DIRPATH_CORRUPT,		"corrupt" }, \
+	{ XCHK_DIRPATH_LOOP,		"loop" }, \
+	{ XCHK_DIRPATH_STALE,		"stale" }, \
+	{ XCHK_DIRPATH_OK,		"ok" }, \
+	{ XREP_DIRPATH_DELETING,	"deleting" }, \
+	{ XREP_DIRPATH_DELETED,		"deleted" }, \
+	{ XREP_DIRPATH_ADOPTING,	"adopting" }, \
+	{ XREP_DIRPATH_ADOPTED,		"adopted" }
+
+DECLARE_EVENT_CLASS(xchk_dirpath_outcome_class,
+	TP_PROTO(struct xfs_scrub *sc, unsigned long long path_nr,
+		 unsigned int nr_steps,
+		 unsigned int outcome),
+	TP_ARGS(sc, path_nr, nr_steps, outcome),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned long long, path_nr)
+		__field(unsigned int, nr_steps)
+		__field(unsigned int, outcome)
+	),
+	TP_fast_assign(
+		__entry->dev = sc->mp->m_super->s_dev;
+		__entry->path_nr = path_nr;
+		__entry->nr_steps = nr_steps;
+		__entry->outcome = outcome;
+	),
+	TP_printk("dev %d:%d path %llu steps %u outcome %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->path_nr,
+		  __entry->nr_steps,
+		  __print_symbolic(__entry->outcome, XCHK_DIRPATH_OUTCOME_STRINGS))
+);
+#define DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(name) \
+DEFINE_EVENT(xchk_dirpath_outcome_class, name, \
+	TP_PROTO(struct xfs_scrub *sc, unsigned long long path_nr, \
+		 unsigned int nr_steps, \
+		 unsigned int outcome), \
+	TP_ARGS(sc, path_nr, nr_steps, outcome))
+DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xchk_dirpath_set_outcome);
+DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xchk_dirpath_evaluate_path);
+
+DECLARE_EVENT_CLASS(xchk_dirtree_evaluate_class,
+	TP_PROTO(const struct xchk_dirtree *dl,
+		 const struct xchk_dirtree_outcomes *oc),
+	TP_ARGS(dl, oc),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_ino_t, rootino)
+		__field(unsigned int, nr_paths)
+		__field(unsigned int, bad)
+		__field(unsigned int, suspect)
+		__field(unsigned int, good)
+		__field(bool, needs_adoption)
+	),
+	TP_fast_assign(
+		__entry->dev = dl->sc->mp->m_super->s_dev;
+		__entry->ino = dl->sc->ip->i_ino;
+		__entry->rootino = dl->root_ino;
+		
__entry->nr_paths = dl->nr_paths; + __entry->bad = oc->bad; + __entry->suspect = oc->suspect; + __entry->good = oc->good; + __entry->needs_adoption = oc->needs_adoption ? 1 : 0; + ), + TP_printk("dev %d:%d ino 0x%llx rootino 0x%llx nr_paths %u bad %u suspect %u good %u adopt? %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->rootino, + __entry->nr_paths, + __entry->bad, + __entry->suspect, + __entry->good, + __entry->needs_adoption) +); +#define DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(name) \ +DEFINE_EVENT(xchk_dirtree_evaluate_class, name, \ + TP_PROTO(const struct xchk_dirtree *dl, \ + const struct xchk_dirtree_outcomes *oc), \ + TP_ARGS(dl, oc)) +DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(xchk_dirtree_evaluate); + +TRACE_EVENT(xchk_dirpath_changed, + TP_PROTO(struct xfs_scrub *sc, unsigned int path_nr, + unsigned int step_nr, const struct xfs_inode *dp, + const struct xfs_inode *ip, const struct xfs_name *xname), + TP_ARGS(sc, path_nr, step_nr, dp, ip, xname), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, path_nr) + __field(unsigned int, step_nr) + __field(xfs_ino_t, child_ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, xname->len) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->path_nr = path_nr; + __entry->step_nr = step_nr; + __entry->child_ino = ip->i_ino; + __entry->parent_ino = dp->i_ino; + __entry->namelen = xname->len; + memcpy(__get_str(name), xname->name, xname->len); + ), + TP_printk("dev %d:%d path %u step %u child_ino 0x%llx parent_ino 0x%llx name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->path_nr, + __entry->step_nr, + __entry->child_ino, + __entry->parent_ino, + __entry->namelen, + __get_str(name)) +); + +TRACE_EVENT(xchk_dirtree_live_update, + TP_PROTO(struct xfs_scrub *sc, const struct xfs_inode *dp, + int action, const struct xfs_inode *ip, int delta, + const struct xfs_name *xname), + TP_ARGS(sc, dp, action, ip, delta, xname), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, parent_ino) + __field(int, action) + __field(xfs_ino_t, child_ino) + __field(int, delta) + __field(unsigned int, namelen) + __dynamic_array(char, name, xname->len) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->parent_ino = dp->i_ino; + __entry->action = action; + __entry->child_ino = ip->i_ino; + __entry->delta = delta; + __entry->namelen = xname->len; + memcpy(__get_str(name), xname->name, xname->len); + ), + TP_printk("dev %d:%d parent_ino 0x%llx child_ino 0x%llx nlink_delta %d name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->parent_ino, + __entry->child_ino, + __entry->delta, + __entry->namelen, + __get_str(name)) +); + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) @@ -1533,6 +1947,7 @@ DEFINE_EVENT(xrep_extent_class, name, \ DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent); DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent); DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval); +DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval); DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert); DECLARE_EVENT_CLASS(xrep_reap_find_class, @@ -1566,6 +1981,7 @@ DEFINE_EVENT(xrep_reap_find_class, name, \ bool crosslinked), \ TP_ARGS(pag, agbno, len, crosslinked)) DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select); +DEFINE_REPAIR_REAP_FIND_EVENT(xreap_bmapi_select); DECLARE_EVENT_CLASS(xrep_rmap_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, @@ -2273,6 +2689,891 @@ 
TRACE_EVENT(xrep_rmap_live_update, __entry->flags) ); +TRACE_EVENT(xrep_tempfile_create, + TP_PROTO(struct xfs_scrub *sc), + TP_ARGS(sc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, type) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, inum) + __field(unsigned int, gen) + __field(unsigned int, flags) + __field(xfs_ino_t, temp_inum) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->file ? XFS_I(file_inode(sc->file))->i_ino : 0; + __entry->type = sc->sm->sm_type; + __entry->agno = sc->sm->sm_agno; + __entry->inum = sc->sm->sm_ino; + __entry->gen = sc->sm->sm_gen; + __entry->flags = sc->sm->sm_flags; + __entry->temp_inum = sc->tempip->i_ino; + ), + TP_printk("dev %d:%d ino 0x%llx type %s inum 0x%llx gen 0x%x flags 0x%x temp_inum 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __entry->inum, + __entry->gen, + __entry->flags, + __entry->temp_inum) +); + +DECLARE_EVENT_CLASS(xrep_tempfile_class, + TP_PROTO(struct xfs_scrub *sc, int whichfork, + struct xfs_bmbt_irec *irec), + TP_ARGS(sc, whichfork, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(xfs_fileoff_t, lblk) + __field(xfs_filblks_t, len) + __field(xfs_fsblock_t, pblk) + __field(int, state) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->tempip->i_ino; + __entry->whichfork = whichfork; + __entry->lblk = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->pblk = irec->br_startblock; + __entry->state = irec->br_state; + ), + TP_printk("dev %d:%d ino 0x%llx whichfork %s fileoff 0x%llx fsbcount 0x%llx startblock 0x%llx state %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __entry->lblk, + __entry->len, + __entry->pblk, + __entry->state) +); +#define DEFINE_XREP_TEMPFILE_EVENT(name) \ +DEFINE_EVENT(xrep_tempfile_class, name, \ + TP_PROTO(struct xfs_scrub *sc, int whichfork, \ + struct xfs_bmbt_irec *irec), \ + TP_ARGS(sc, whichfork, irec)) +DEFINE_XREP_TEMPFILE_EVENT(xrep_tempfile_prealloc); +DEFINE_XREP_TEMPFILE_EVENT(xrep_tempfile_copyin); + +TRACE_EVENT(xreap_ifork_extent, + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork, + const struct xfs_bmbt_irec *irec), + TP_ARGS(sc, ip, whichfork, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(xfs_fileoff_t, fileoff) + __field(xfs_filblks_t, len) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(int, state) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->whichfork = whichfork; + __entry->fileoff = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->agno = XFS_FSB_TO_AGNO(sc->mp, irec->br_startblock); + __entry->agbno = XFS_FSB_TO_AGBNO(sc->mp, irec->br_startblock); + __entry->state = irec->br_state; + ), + TP_printk("dev %d:%d ip 0x%llx whichfork %s agno 0x%x agbno 0x%x fileoff 0x%llx fsbcount 0x%llx state 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __entry->agno, + __entry->agbno, + __entry->fileoff, + __entry->len, + __entry->state) +); + +TRACE_EVENT(xreap_bmapi_binval_scan, + TP_PROTO(struct xfs_scrub *sc, const struct xfs_bmbt_irec *irec, + xfs_extlen_t scan_blocks), + 
TP_ARGS(sc, irec, scan_blocks), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_filblks_t, len) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, scan_blocks) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->len = irec->br_blockcount; + __entry->agno = XFS_FSB_TO_AGNO(sc->mp, irec->br_startblock); + __entry->agbno = XFS_FSB_TO_AGBNO(sc->mp, irec->br_startblock); + __entry->scan_blocks = scan_blocks; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%llx scan_blocks 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->scan_blocks) +); + +TRACE_EVENT(xrep_xattr_recover_leafblock, + TP_PROTO(struct xfs_inode *ip, xfs_dablk_t dabno, uint16_t magic), + TP_ARGS(ip, dabno, magic), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_dablk_t, dabno) + __field(uint16_t, magic) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->dabno = dabno; + __entry->magic = magic; + ), + TP_printk("dev %d:%d ino 0x%llx dablk 0x%x magic 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->dabno, + __entry->magic) +); + +DECLARE_EVENT_CLASS(xrep_xattr_salvage_class, + TP_PROTO(struct xfs_inode *ip, unsigned int flags, char *name, + unsigned int namelen, unsigned int valuelen), + TP_ARGS(ip, flags, name, namelen, valuelen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, flags) + __field(unsigned int, namelen) + __dynamic_array(char, name, namelen) + __field(unsigned int, valuelen) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->flags = flags; + __entry->namelen = namelen; + memcpy(__get_str(name), name, namelen); + __entry->valuelen = valuelen; + ), + TP_printk("dev %d:%d ino 0x%llx flags %s name '%.*s' valuelen 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->flags, "|", XFS_ATTR_NAMESPACE_STR), + __entry->namelen, + __get_str(name), + __entry->valuelen) +); +#define DEFINE_XREP_XATTR_SALVAGE_EVENT(name) \ +DEFINE_EVENT(xrep_xattr_salvage_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned int flags, char *name, \ + unsigned int namelen, unsigned int valuelen), \ + TP_ARGS(ip, flags, name, namelen, valuelen)) +DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_xattr_salvage_rec); +DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_xattr_insert_rec); +DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_parent_stash_xattr); +DEFINE_XREP_XATTR_SALVAGE_EVENT(xrep_parent_insert_xattr); + +DECLARE_EVENT_CLASS(xrep_pptr_salvage_class, + TP_PROTO(struct xfs_inode *ip, unsigned int flags, const void *name, + unsigned int namelen, const void *value, unsigned int valuelen), + TP_ARGS(ip, flags, name, namelen, value, valuelen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __field(unsigned int, namelen) + __dynamic_array(char, name, namelen) + ), + TP_fast_assign( + const struct xfs_parent_rec *rec = value; + + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->parent_ino = be64_to_cpu(rec->p_ino); + __entry->parent_gen = be32_to_cpu(rec->p_gen); + __entry->namelen = namelen; + memcpy(__get_str(name), name, namelen); + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + 
__entry->ino,
+		  __entry->parent_ino,
+		  __entry->parent_gen,
+		  __entry->namelen,
+		  __get_str(name))
+)
+#define DEFINE_XREP_PPTR_SALVAGE_EVENT(name) \
+DEFINE_EVENT(xrep_pptr_salvage_class, name, \
+	TP_PROTO(struct xfs_inode *ip, unsigned int flags, const void *name, \
+		 unsigned int namelen, const void *value, unsigned int valuelen), \
+	TP_ARGS(ip, flags, name, namelen, value, valuelen))
+DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_salvage_pptr);
+DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_insert_pptr);
+
+DECLARE_EVENT_CLASS(xrep_xattr_class,
+	TP_PROTO(struct xfs_inode *ip, struct xfs_inode *arg_ip),
+	TP_ARGS(ip, arg_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_ino_t, src_ino)
+	),
+	TP_fast_assign(
+		__entry->dev = ip->i_mount->m_super->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->src_ino = arg_ip->i_ino;
+	),
+	TP_printk("dev %d:%d ino 0x%llx src 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->src_ino)
+)
+#define DEFINE_XREP_XATTR_EVENT(name) \
+DEFINE_EVENT(xrep_xattr_class, name, \
+	TP_PROTO(struct xfs_inode *ip, struct xfs_inode *arg_ip), \
+	TP_ARGS(ip, arg_ip))
+DEFINE_XREP_XATTR_EVENT(xrep_xattr_rebuild_tree);
+DEFINE_XREP_XATTR_EVENT(xrep_xattr_reset_fork);
+DEFINE_XREP_XATTR_EVENT(xrep_xattr_full_reset);
+
+DECLARE_EVENT_CLASS(xrep_xattr_pptr_scan_class,
+	TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp,
+		 const struct xfs_name *name),
+	TP_ARGS(ip, dp, name),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_ino_t, parent_ino)
+		__field(unsigned int, parent_gen)
+		__field(unsigned int, namelen)
+		__dynamic_array(char, name, name->len)
+	),
+	TP_fast_assign(
+		__entry->dev = ip->i_mount->m_super->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->parent_ino = dp->i_ino;
+		__entry->parent_gen = VFS_IC(dp)->i_generation;
+		__entry->namelen = name->len;
+		memcpy(__get_str(name), name->name, name->len);
+	),
+	TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->parent_ino,
+		  __entry->parent_gen,
+		  __entry->namelen,
+		  __get_str(name))
+)
+#define DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(name) \
+DEFINE_EVENT(xrep_xattr_pptr_scan_class, name, \
+	TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, \
+		 const struct xfs_name *name), \
+	TP_ARGS(ip, dp, name))
+DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(xrep_xattr_stash_parentadd);
+DEFINE_XREP_XATTR_PPTR_SCAN_EVENT(xrep_xattr_stash_parentremove);
+
+TRACE_EVENT(xrep_dir_recover_dirblock,
+	TP_PROTO(struct xfs_inode *dp, xfs_dablk_t dabno, uint32_t magic,
+		 uint32_t magic_guess),
+	TP_ARGS(dp, dabno, magic, magic_guess),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, dir_ino)
+		__field(xfs_dablk_t, dabno)
+		__field(uint32_t, magic)
+		__field(uint32_t, magic_guess)
+	),
+	TP_fast_assign(
+		__entry->dev = dp->i_mount->m_super->s_dev;
+		__entry->dir_ino = dp->i_ino;
+		__entry->dabno = dabno;
+		__entry->magic = magic;
+		__entry->magic_guess = magic_guess;
+	),
+	TP_printk("dev %d:%d dir 0x%llx dablk 0x%x magic 0x%x magic_guess 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->dir_ino,
+		  __entry->dabno,
+		  __entry->magic,
+		  __entry->magic_guess)
+);
+
+DECLARE_EVENT_CLASS(xrep_dir_class,
+	TP_PROTO(struct xfs_inode *dp, xfs_ino_t parent_ino),
+	TP_ARGS(dp, parent_ino),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, dir_ino)
+		__field(xfs_ino_t, parent_ino)
+	),
+	TP_fast_assign(
+		__entry->dev =
dp->i_mount->m_super->s_dev; + __entry->dir_ino = dp->i_ino; + __entry->parent_ino = parent_ino; + ), + TP_printk("dev %d:%d dir 0x%llx parent 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir_ino, + __entry->parent_ino) +) +#define DEFINE_XREP_DIR_EVENT(name) \ +DEFINE_EVENT(xrep_dir_class, name, \ + TP_PROTO(struct xfs_inode *dp, xfs_ino_t parent_ino), \ + TP_ARGS(dp, parent_ino)) +DEFINE_XREP_DIR_EVENT(xrep_dir_rebuild_tree); +DEFINE_XREP_DIR_EVENT(xrep_dir_reset_fork); +DEFINE_XREP_DIR_EVENT(xrep_parent_reset_dotdot); + +DECLARE_EVENT_CLASS(xrep_dirent_class, + TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name, + xfs_ino_t ino), + TP_ARGS(dp, name, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir_ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + __field(xfs_ino_t, ino) + __field(uint8_t, ftype) + ), + TP_fast_assign( + __entry->dev = dp->i_mount->m_super->s_dev; + __entry->dir_ino = dp->i_ino; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + __entry->ino = ino; + __entry->ftype = name->type; + ), + TP_printk("dev %d:%d dir 0x%llx ftype %s name '%.*s' ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir_ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR), + __entry->namelen, + __get_str(name), + __entry->ino) +) +#define DEFINE_XREP_DIRENT_EVENT(name) \ +DEFINE_EVENT(xrep_dirent_class, name, \ + TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name, \ + xfs_ino_t ino), \ + TP_ARGS(dp, name, ino)) +DEFINE_XREP_DIRENT_EVENT(xrep_dir_salvage_entry); +DEFINE_XREP_DIRENT_EVENT(xrep_dir_stash_createname); +DEFINE_XREP_DIRENT_EVENT(xrep_dir_replay_createname); +DEFINE_XREP_DIRENT_EVENT(xrep_adoption_reparent); +DEFINE_XREP_DIRENT_EVENT(xrep_dir_stash_removename); +DEFINE_XREP_DIRENT_EVENT(xrep_dir_replay_removename); + +DECLARE_EVENT_CLASS(xrep_adoption_class, + TP_PROTO(struct xfs_inode *dp, struct xfs_inode *ip, bool moved), + TP_ARGS(dp, ip, moved), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir_ino) + __field(xfs_ino_t, child_ino) + __field(bool, moved) + ), + TP_fast_assign( + __entry->dev = dp->i_mount->m_super->s_dev; + __entry->dir_ino = dp->i_ino; + __entry->child_ino = ip->i_ino; + __entry->moved = moved; + ), + TP_printk("dev %d:%d dir 0x%llx child 0x%llx moved? 
%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir_ino, + __entry->child_ino, + __entry->moved) +); +#define DEFINE_XREP_ADOPTION_EVENT(name) \ +DEFINE_EVENT(xrep_adoption_class, name, \ + TP_PROTO(struct xfs_inode *dp, struct xfs_inode *ip, bool moved), \ + TP_ARGS(dp, ip, moved)) +DEFINE_XREP_ADOPTION_EVENT(xrep_adoption_trans_roll); + +DECLARE_EVENT_CLASS(xrep_parent_salvage_class, + TP_PROTO(struct xfs_inode *dp, xfs_ino_t ino), + TP_ARGS(dp, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir_ino) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = dp->i_mount->m_super->s_dev; + __entry->dir_ino = dp->i_ino; + __entry->ino = ino; + ), + TP_printk("dev %d:%d dir 0x%llx parent 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir_ino, + __entry->ino) +) +#define DEFINE_XREP_PARENT_SALVAGE_EVENT(name) \ +DEFINE_EVENT(xrep_parent_salvage_class, name, \ + TP_PROTO(struct xfs_inode *dp, xfs_ino_t ino), \ + TP_ARGS(dp, ino)) +DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_dir_salvaged_parent); +DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_findparent_dirent); +DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_findparent_from_dcache); + +DECLARE_EVENT_CLASS(xrep_pptr_class, + TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, + const struct xfs_parent_rec *pptr), + TP_ARGS(ip, name, pptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->parent_ino = be64_to_cpu(pptr->p_ino); + __entry->parent_gen = be32_to_cpu(pptr->p_gen); + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __entry->parent_gen, + __entry->namelen, + __get_str(name)) +) +#define DEFINE_XREP_PPTR_EVENT(name) \ +DEFINE_EVENT(xrep_pptr_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct xfs_name *name, \ + const struct xfs_parent_rec *pptr), \ + TP_ARGS(ip, name, pptr)) +DEFINE_XREP_PPTR_EVENT(xrep_xattr_replay_parentadd); +DEFINE_XREP_PPTR_EVENT(xrep_xattr_replay_parentremove); +DEFINE_XREP_PPTR_EVENT(xrep_parent_replay_parentadd); +DEFINE_XREP_PPTR_EVENT(xrep_parent_replay_parentremove); + +DECLARE_EVENT_CLASS(xrep_pptr_scan_class, + TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, + const struct xfs_name *name), + TP_ARGS(ip, dp, name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->parent_ino = dp->i_ino; + __entry->parent_gen = VFS_IC(dp)->i_generation; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx parent_gen 0x%x name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __entry->parent_gen, + __entry->namelen, + __get_str(name)) +) +#define DEFINE_XREP_PPTR_SCAN_EVENT(name) \ +DEFINE_EVENT(xrep_pptr_scan_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct xfs_inode *dp, \ + const struct 
xfs_name *name), \ + TP_ARGS(ip, dp, name)) +DEFINE_XREP_PPTR_SCAN_EVENT(xrep_parent_stash_parentadd); +DEFINE_XREP_PPTR_SCAN_EVENT(xrep_parent_stash_parentremove); + +TRACE_EVENT(xrep_nlinks_set_record, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, + const struct xchk_nlink *obs), + TP_ARGS(mp, ino, obs), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_nlink_t, parents) + __field(xfs_nlink_t, backrefs) + __field(xfs_nlink_t, children) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + __entry->parents = obs->parents; + __entry->backrefs = obs->backrefs; + __entry->children = obs->children; + ), + TP_printk("dev %d:%d ino 0x%llx parents %u backrefs %u children %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parents, + __entry->backrefs, + __entry->children) +); + +DECLARE_EVENT_CLASS(xrep_dentry_class, + TP_PROTO(struct xfs_mount *mp, const struct dentry *dentry), + TP_ARGS(mp, dentry), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, flags) + __field(unsigned long, ino) + __field(bool, positive) + __field(unsigned long, parent_ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, dentry->d_name.len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->flags = dentry->d_flags; + __entry->positive = d_is_positive(dentry); + if (dentry->d_parent && d_inode(dentry->d_parent)) + __entry->parent_ino = d_inode(dentry->d_parent)->i_ino; + else + __entry->parent_ino = -1UL; + __entry->ino = d_inode(dentry) ? d_inode(dentry)->i_ino : 0; + __entry->namelen = dentry->d_name.len; + memcpy(__get_str(name), dentry->d_name.name, dentry->d_name.len); + ), + TP_printk("dev %d:%d flags 0x%x positive? %d parent_ino 0x%lx ino 0x%lx name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->flags, + __entry->positive, + __entry->parent_ino, + __entry->ino, + __entry->namelen, + __get_str(name)) +); +#define DEFINE_REPAIR_DENTRY_EVENT(name) \ +DEFINE_EVENT(xrep_dentry_class, name, \ + TP_PROTO(struct xfs_mount *mp, const struct dentry *dentry), \ + TP_ARGS(mp, dentry)) +DEFINE_REPAIR_DENTRY_EVENT(xrep_adoption_check_child); +DEFINE_REPAIR_DENTRY_EVENT(xrep_adoption_invalidate_child); +DEFINE_REPAIR_DENTRY_EVENT(xrep_dirtree_delete_child); + +TRACE_EVENT(xrep_symlink_salvage_target, + TP_PROTO(struct xfs_inode *ip, char *target, unsigned int targetlen), + TP_ARGS(ip, target, targetlen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, targetlen) + __dynamic_array(char, target, targetlen + 1) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->targetlen = targetlen; + memcpy(__get_str(target), target, targetlen); + __get_str(target)[targetlen] = 0; + ), + TP_printk("dev %d:%d ip 0x%llx target '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->targetlen, + __get_str(target)) +); + +DECLARE_EVENT_CLASS(xrep_symlink_class, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + ), + TP_printk("dev %d:%d ip 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino) +); + +#define DEFINE_XREP_SYMLINK_EVENT(name) \ +DEFINE_EVENT(xrep_symlink_class, name, \ + TP_PROTO(struct xfs_inode *ip), \ + TP_ARGS(ip)) +DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_rebuild); 
+DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_reset_fork); + +TRACE_EVENT(xrep_iunlink_visit, + TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + xfs_agino_t bucket_agino, struct xfs_inode *ip), + TP_ARGS(pag, bucket, bucket_agino, ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(unsigned int, bucket) + __field(xfs_agino_t, bucket_agino) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->agino = XFS_INO_TO_AGINO(pag->pag_mount, ip->i_ino); + __entry->bucket = bucket; + __entry->bucket_agino = bucket_agino; + __entry->prev_agino = ip->i_prev_unlinked; + __entry->next_agino = ip->i_next_unlinked; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x bucket_agino 0x%x prev_agino 0x%x next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->agino, + __entry->bucket_agino, + __entry->prev_agino, + __entry->next_agino) +); + +TRACE_EVENT(xrep_iunlink_reload_next, + TP_PROTO(struct xfs_inode *ip, xfs_agino_t prev_agino), + TP_ARGS(ip, prev_agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, old_prev_agino) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, next_agino) + __field(unsigned int, nlink) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + __entry->old_prev_agino = ip->i_prev_unlinked; + __entry->prev_agino = prev_agino; + __entry->next_agino = ip->i_next_unlinked; + __entry->nlink = VFS_I(ip)->i_nlink; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x nlink %u old_prev_agino %u prev_agino 0x%x next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino % XFS_AGI_UNLINKED_BUCKETS, + __entry->agino, + __entry->nlink, + __entry->old_prev_agino, + __entry->prev_agino, + __entry->next_agino) +); + +TRACE_EVENT(xrep_iunlink_reload_ondisk, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(unsigned int, nlink) + __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + __entry->nlink = VFS_I(ip)->i_nlink; + __entry->next_agino = ip->i_next_unlinked; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x nlink %u next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino % XFS_AGI_UNLINKED_BUCKETS, + __entry->agino, + __entry->nlink, + __entry->next_agino) +); + +TRACE_EVENT(xrep_iunlink_walk_ondisk_bucket, + TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + xfs_agino_t prev_agino, xfs_agino_t next_agino), + TP_ARGS(pag, bucket, prev_agino, next_agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->bucket = bucket; + __entry->prev_agino = prev_agino; + __entry->next_agino = next_agino; + ), + TP_printk("dev 
%d:%d agno 0x%x bucket %u prev_agino 0x%x next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->prev_agino, + __entry->next_agino) +); + +DECLARE_EVENT_CLASS(xrep_iunlink_resolve_class, + TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + xfs_agino_t prev_agino, xfs_agino_t next_agino), + TP_ARGS(pag, bucket, prev_agino, next_agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->bucket = bucket; + __entry->prev_agino = prev_agino; + __entry->next_agino = next_agino; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u prev_agino 0x%x next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->prev_agino, + __entry->next_agino) +); +#define DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(name) \ +DEFINE_EVENT(xrep_iunlink_resolve_class, name, \ + TP_PROTO(struct xfs_perag *pag, unsigned int bucket, \ + xfs_agino_t prev_agino, xfs_agino_t next_agino), \ + TP_ARGS(pag, bucket, prev_agino, next_agino)) +DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_uncached); +DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_wronglist); +DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_nolist); +DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_ok); + +TRACE_EVENT(xrep_iunlink_relink_next, + TP_PROTO(struct xfs_inode *ip, xfs_agino_t next_agino), + TP_ARGS(ip, next_agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, next_agino) + __field(xfs_agino_t, new_next_agino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + __entry->next_agino = ip->i_next_unlinked; + __entry->new_next_agino = next_agino; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x next_agino 0x%x -> 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino % XFS_AGI_UNLINKED_BUCKETS, + __entry->agino, + __entry->next_agino, + __entry->new_next_agino) +); + +TRACE_EVENT(xrep_iunlink_relink_prev, + TP_PROTO(struct xfs_inode *ip, xfs_agino_t prev_agino), + TP_ARGS(ip, prev_agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, new_prev_agino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + __entry->prev_agino = ip->i_prev_unlinked; + __entry->new_prev_agino = prev_agino; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x prev_agino 0x%x -> 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino % XFS_AGI_UNLINKED_BUCKETS, + __entry->agino, + __entry->prev_agino, + __entry->new_prev_agino) +); + +TRACE_EVENT(xrep_iunlink_add_to_bucket, + TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + xfs_agino_t agino, xfs_agino_t curr_head), + TP_ARGS(pag, bucket, agino, curr_head), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, 
next_agino) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->bucket = bucket; + __entry->agino = agino; + __entry->next_agino = curr_head; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x next_agino 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->agino, + __entry->next_agino) +); + +TRACE_EVENT(xrep_iunlink_commit_bucket, + TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + xfs_agino_t old_agino, xfs_agino_t agino), + TP_ARGS(pag, bucket, old_agino, agino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, old_agino) + __field(xfs_agino_t, agino) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->bucket = bucket; + __entry->old_agino = old_agino; + __entry->agino = agino; + ), + TP_printk("dev %d:%d agno 0x%x bucket %u agino 0x%x -> 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->old_agino, + __entry->agino) +); + +DEFINE_XCHK_DIRPATH_OUTCOME_EVENT(xrep_dirpath_set_outcome); +DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_delete_path); +DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_create_adoption); +DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(xrep_dirtree_decided_fate); + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c index 17c982a4821d..9185ae7088d4 100644 --- a/fs/xfs/scrub/xfarray.c +++ b/fs/xfs/scrub/xfarray.c @@ -7,9 +7,9 @@ #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" -#include "scrub/scrub.h" #include "scrub/trace.h" /* @@ -486,6 +486,9 @@ xfarray_sortinfo_alloc( xfarray_sortinfo_lo(si)[0] = 0; xfarray_sortinfo_hi(si)[0] = array->nr - 1; + si->relax = INIT_XCHK_RELAX; + if (flags & XFARRAY_SORT_KILLABLE) + si->relax.interruptible = false; trace_xfarray_sort(si, nr_bytes); *infop = si; @@ -503,10 +506,7 @@ xfarray_sort_terminated( * few seconds so that we don't run afoul of the soft lockup watchdog * or RCU stall detector. */ - cond_resched(); - - if ((si->flags & XFARRAY_SORT_KILLABLE) && - fatal_signal_pending(current)) { + if (xchk_maybe_relax(&si->relax)) { if (*error == 0) *error = -EINTR; return true; @@ -1051,3 +1051,20 @@ out_free: kvfree(si); return error; } + +/* How many bytes is this array consuming? */ +unsigned long long +xfarray_bytes( + struct xfarray *array) +{ + return xfile_bytes(array->xfile); +} + +/* Empty the entire array. */ +void +xfarray_truncate( + struct xfarray *array) +{ + xfile_discard(array->xfile, 0, MAX_LFS_FILESIZE); + array->nr = 0; +} diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h index acb2f94c56c1..5eeeeed13ae2 100644 --- a/fs/xfs/scrub/xfarray.h +++ b/fs/xfs/scrub/xfarray.h @@ -8,6 +8,7 @@ /* xfile array index type, along with cursor initialization */ typedef uint64_t xfarray_idx_t; +#define XFARRAY_NULLIDX ((__force xfarray_idx_t)-1ULL) #define XFARRAY_CURSOR_INIT ((__force xfarray_idx_t)0) /* Iterate each index of an xfile array. 
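Editor's note: xfarray_bytes() and xfarray_truncate() above let a long-running scanner report and cap its memory footprint. A minimal sketch, assuming a caller that stages records between scan passes (the function name and the pr_debug call are illustrative, not from the patch):

static void example_reset_staging(struct xfarray *array)
{
	/* bytes the shmem backing store is currently pinning */
	pr_debug("staging array pinned %llu bytes\n", xfarray_bytes(array));

	/* drop every staged record and the pages backing them; nr = 0 */
	xfarray_truncate(array);
}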
*/ @@ -44,6 +45,8 @@ int xfarray_unset(struct xfarray *array, xfarray_idx_t idx); int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr); int xfarray_store_anywhere(struct xfarray *array, const void *ptr); bool xfarray_element_is_null(struct xfarray *array, const void *ptr); +void xfarray_truncate(struct xfarray *array); +unsigned long long xfarray_bytes(struct xfarray *array); /* * Load an array element, but zero the buffer if there's no data because we @@ -124,6 +127,9 @@ struct xfarray_sortinfo { /* XFARRAY_SORT_* flags; see below. */ unsigned int flags; + /* next time we want to cond_resched() */ + struct xchk_relax relax; + /* Cache a folio here for faster scanning for pivots */ struct folio *folio; diff --git a/fs/xfs/scrub/xfblob.c b/fs/xfs/scrub/xfblob.c new file mode 100644 index 000000000000..6ef2a9637f16 --- /dev/null +++ b/fs/xfs/scrub/xfblob.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "scrub/scrub.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/xfblob.h" + +/* + * XFS Blob Storage + * ================ + * Stores and retrieves blobs using an xfile. Objects are appended to the file + * and the offset is returned as a magic cookie for retrieval. + */ + +#define XB_KEY_MAGIC 0xABAADDAD +struct xb_key { + uint32_t xb_magic; /* XB_KEY_MAGIC */ + uint32_t xb_size; /* size of the blob, in bytes */ + loff_t xb_offset; /* byte offset of this key */ + /* blob comes after here */ +} __packed; + +/* Initialize a blob storage object. */ +int +xfblob_create( + const char *description, + struct xfblob **blobp) +{ + struct xfblob *blob; + struct xfile *xfile; + int error; + + error = xfile_create(description, 0, &xfile); + if (error) + return error; + + blob = kmalloc(sizeof(struct xfblob), XCHK_GFP_FLAGS); + if (!blob) { + error = -ENOMEM; + goto out_xfile; + } + + blob->xfile = xfile; + blob->last_offset = PAGE_SIZE; + + *blobp = blob; + return 0; + +out_xfile: + xfile_destroy(xfile); + return error; +} + +/* Destroy a blob storage object. */ +void +xfblob_destroy( + struct xfblob *blob) +{ + xfile_destroy(blob->xfile); + kfree(blob); +} + +/* Retrieve a blob. */ +int +xfblob_load( + struct xfblob *blob, + xfblob_cookie cookie, + void *ptr, + uint32_t size) +{ + struct xb_key key; + int error; + + error = xfile_load(blob->xfile, &key, sizeof(key), cookie); + if (error) + return error; + + if (key.xb_magic != XB_KEY_MAGIC || key.xb_offset != cookie) { + ASSERT(0); + return -ENODATA; + } + if (size < key.xb_size) { + ASSERT(0); + return -EFBIG; + } + + return xfile_load(blob->xfile, ptr, key.xb_size, + cookie + sizeof(key)); +} + +/* Store a blob. */ +int +xfblob_store( + struct xfblob *blob, + xfblob_cookie *cookie, + const void *ptr, + uint32_t size) +{ + struct xb_key key = { + .xb_offset = blob->last_offset, + .xb_magic = XB_KEY_MAGIC, + .xb_size = size, + }; + loff_t pos = blob->last_offset; + int error; + + error = xfile_store(blob->xfile, &key, sizeof(key), pos); + if (error) + return error; + + pos += sizeof(key); + error = xfile_store(blob->xfile, ptr, size, pos); + if (error) + goto out_err; + + *cookie = blob->last_offset; + blob->last_offset += sizeof(key) + size; + return 0; +out_err: + xfile_discard(blob->xfile, blob->last_offset, sizeof(key)); + return error; +} + +/* Free a blob. 
*/ +int +xfblob_free( + struct xfblob *blob, + xfblob_cookie cookie) +{ + struct xb_key key; + int error; + + error = xfile_load(blob->xfile, &key, sizeof(key), cookie); + if (error) + return error; + + if (key.xb_magic != XB_KEY_MAGIC || key.xb_offset != cookie) { + ASSERT(0); + return -ENODATA; + } + + xfile_discard(blob->xfile, cookie, sizeof(key) + key.xb_size); + return 0; +} + +/* How many bytes is this blob storage object consuming? */ +unsigned long long +xfblob_bytes( + struct xfblob *blob) +{ + return xfile_bytes(blob->xfile); +} + +/* Drop all the blobs. */ +void +xfblob_truncate( + struct xfblob *blob) +{ + xfile_discard(blob->xfile, PAGE_SIZE, MAX_LFS_FILESIZE - PAGE_SIZE); + blob->last_offset = PAGE_SIZE; +} diff --git a/fs/xfs/scrub/xfblob.h b/fs/xfs/scrub/xfblob.h new file mode 100644 index 000000000000..ae78322613ca --- /dev/null +++ b/fs/xfs/scrub/xfblob.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_XFBLOB_H__ +#define __XFS_SCRUB_XFBLOB_H__ + +struct xfblob { + struct xfile *xfile; + loff_t last_offset; +}; + +typedef loff_t xfblob_cookie; + +int xfblob_create(const char *descr, struct xfblob **blobp); +void xfblob_destroy(struct xfblob *blob); +int xfblob_load(struct xfblob *blob, xfblob_cookie cookie, void *ptr, + uint32_t size); +int xfblob_store(struct xfblob *blob, xfblob_cookie *cookie, const void *ptr, + uint32_t size); +int xfblob_free(struct xfblob *blob, xfblob_cookie cookie); +unsigned long long xfblob_bytes(struct xfblob *blob); +void xfblob_truncate(struct xfblob *blob); + +static inline int +xfblob_storename( + struct xfblob *blob, + xfblob_cookie *cookie, + const struct xfs_name *xname) +{ + return xfblob_store(blob, cookie, xname->name, xname->len); +} + +static inline int +xfblob_loadname( + struct xfblob *blob, + xfblob_cookie cookie, + struct xfs_name *xname, + uint32_t size) +{ + int ret = xfblob_load(blob, cookie, (void *)xname->name, size); + if (ret) + return ret; + + xname->len = size; + return 0; +} + +#endif /* __XFS_SCRUB_XFBLOB_H__ */ diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index 8cdd863db585..d848222f802b 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -10,9 +10,9 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" -#include "scrub/scrub.h" #include "scrub/trace.h" #include <linux/shmem_fs.h> @@ -310,3 +310,15 @@ xfile_put_folio( folio_unlock(folio); folio_put(folio); } + +/* Discard the page cache that's backing a range of the xfile. 
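Editor's sketch of the xfblob API introduced above (error handling abbreviated; the payload and function name are illustrative). The cookie returned by xfblob_store() is simply the byte offset of the record's header in the backing xfile:

static int example_roundtrip_name(const char *name, unsigned int len)
{
	struct xfblob	*blob;
	xfblob_cookie	cookie;
	char		buf[XATTR_NAME_MAX + 1];
	int		error;

	error = xfblob_create("example blobs", &blob);
	if (error)
		return error;

	/* append the payload; the cookie is its offset in the xfile */
	error = xfblob_store(blob, &cookie, name, len);
	if (error)
		goto out;

	/* read it back later using only the cookie */
	error = xfblob_load(blob, cookie, buf, sizeof(buf));
out:
	xfblob_destroy(blob);
	return error;
}

Note that xfblob_bytes() and xfarray_bytes() both reduce to xfile_bytes() below, which converts the shmem inode's i_blocks (counted in 512-byte sectors) into bytes; for example, ten 4 KiB pages report 80 << SECTOR_SHIFT == 40960 bytes.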
*/ +void +xfile_discard( + struct xfile *xf, + loff_t pos, + u64 count) +{ + trace_xfile_discard(xf, pos, count); + + shmem_truncate_range(file_inode(xf->file), pos, pos + count - 1); +} diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h index 76d78dba7e34..cc2cc1714cd4 100644 --- a/fs/xfs/scrub/xfile.h +++ b/fs/xfs/scrub/xfile.h @@ -17,6 +17,7 @@ int xfile_load(struct xfile *xf, void *buf, size_t count, loff_t pos); int xfile_store(struct xfile *xf, const void *buf, size_t count, loff_t pos); +void xfile_discard(struct xfile *xf, loff_t pos, u64 count); loff_t xfile_seek_data(struct xfile *xf, loff_t pos); #define XFILE_MAX_FOLIO_SIZE (PAGE_SIZE << MAX_PAGECACHE_ORDER) @@ -26,4 +27,9 @@ struct folio *xfile_get_folio(struct xfile *xf, loff_t offset, size_t len, unsigned int flags); void xfile_put_folio(struct xfile *xf, struct folio *folio); +static inline unsigned long long xfile_bytes(struct xfile *xf) +{ + return file_inode(xf->file)->i_blocks << SECTOR_SHIFT; +} + #endif /* __XFS_SCRUB_XFILE_H__ */ diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h index a39befa743ce..f17173b83e6f 100644 --- a/fs/xfs/scrub/xfs_scrub.h +++ b/fs/xfs/scrub/xfs_scrub.h @@ -7,9 +7,11 @@ #define __XFS_SCRUB_H__ #ifndef CONFIG_XFS_ONLINE_SCRUB -# define xfs_scrub_metadata(file, sm) (-ENOTTY) +# define xfs_ioc_scrub_metadata(f, a) (-ENOTTY) +# define xfs_ioc_scrubv_metadata(f, a) (-ENOTTY) #else -int xfs_scrub_metadata(struct file *file, struct xfs_scrub_metadata *sm); +int xfs_ioc_scrub_metadata(struct file *file, void __user *arg); +int xfs_ioc_scrubv_metadata(struct file *file, void __user *arg); #endif /* CONFIG_XFS_ONLINE_SCRUB */ #endif /* __XFS_SCRUB_H__ */ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 4bf69c9c088e..c7c3dcfa2718 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -201,16 +201,17 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (!args.value) return -ENOMEM; xfs_acl_to_disk(args.value, acl); + error = xfs_attr_change(&args, XFS_ATTRUPDATE_UPSERT); + kvfree(args.value); + } else { + error = xfs_attr_change(&args, XFS_ATTRUPDATE_REMOVE); + /* + * If the attribute didn't exist to start with that's fine. + */ + if (error == -ENOATTR) + error = 0; } - error = xfs_attr_change(&args); - kvfree(args.value); - - /* - * If the attribute didn't exist to start with that's fine. - */ - if (!acl && error == -ENOATTR) - error = 0; if (!error) set_cached_acl(inode, type, acl); return error; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3f428620ebf2..6dead20338e2 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -233,45 +233,6 @@ xfs_imap_valid( return true; } -/* - * Pass in a dellalloc extent and convert it to real extents, return the real - * extent that maps offset_fsb in wpc->iomap. - * - * The current page is held locked so nothing could have removed the block - * backing offset_fsb, although it could have moved from the COW to the data - * fork by another thread. - */ -static int -xfs_convert_blocks( - struct iomap_writepage_ctx *wpc, - struct xfs_inode *ip, - int whichfork, - loff_t offset) -{ - int error; - unsigned *seq; - - if (whichfork == XFS_COW_FORK) - seq = &XFS_WPC(wpc)->cow_seq; - else - seq = &XFS_WPC(wpc)->data_seq; - - /* - * Attempt to allocate whatever delalloc extent currently backs offset - * and put the result into wpc->iomap. Allocate in a loop because it - * may take several attempts to allocate real blocks for a contiguous - * delalloc extent if free space is sufficiently fragmented. 
- */ - do { - error = xfs_bmapi_convert_delalloc(ip, whichfork, offset, - &wpc->iomap, seq); - if (error) - return error; - } while (wpc->iomap.offset + wpc->iomap.length <= offset); - - return 0; -} - static int xfs_map_blocks( struct iomap_writepage_ctx *wpc, @@ -290,6 +251,7 @@ xfs_map_blocks( struct xfs_iext_cursor icur; int retries = 0; int error = 0; + unsigned int *seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -387,7 +349,19 @@ retry: trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); return 0; allocate_blocks: - error = xfs_convert_blocks(wpc, ip, whichfork, offset); + /* + * Convert a delalloc extent to a real one. The current page is held + * locked so nothing could have removed the block backing offset_fsb, + * although it could have moved from the COW to the data fork by another + * thread. + */ + if (whichfork == XFS_COW_FORK) + seq = &XFS_WPC(wpc)->cow_seq; + else + seq = &XFS_WPC(wpc)->data_seq; + + error = xfs_bmapi_convert_delalloc(ip, whichfork, offset, + &wpc->iomap, seq); if (error) { /* * If we failed to find the extent in the COW fork we might have @@ -469,7 +443,6 @@ xfs_discard_folio( { struct xfs_inode *ip = XFS_I(folio->mapping->host); struct xfs_mount *mp = ip->i_mount; - int error; if (xfs_is_shutdown(mp)) return; @@ -483,11 +456,8 @@ xfs_discard_folio( * byte of the next folio. Hence the end offset is only dependent on the * folio itself and not the start offset that is passed in. */ - error = xfs_bmap_punch_delalloc_range(ip, pos, + xfs_bmap_punch_delalloc_range(ip, pos, folio_pos(folio) + folio_size(folio)); - - if (error && !xfs_is_shutdown(mp)) - xfs_alert(mp, "page discard unable to remove delalloc mapping."); } static const struct iomap_writeback_ops xfs_writeback_ops = { diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 9b4c61e1c22e..2b10ac4c5fce 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -27,6 +27,7 @@ #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_parent.h" struct kmem_cache *xfs_attri_cache; struct kmem_cache *xfs_attrd_cache; @@ -73,8 +74,12 @@ static inline struct xfs_attri_log_nameval * xfs_attri_log_nameval_alloc( const void *name, unsigned int name_len, + const void *new_name, + unsigned int new_name_len, const void *value, - unsigned int value_len) + unsigned int value_len, + const void *new_value, + unsigned int new_value_len) { struct xfs_attri_log_nameval *nv; @@ -83,15 +88,26 @@ xfs_attri_log_nameval_alloc( * this. But kvmalloc() utterly sucks, so we use our own version.
*/ nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) + - name_len + value_len); + name_len + new_name_len + value_len + + new_value_len); nv->name.i_addr = nv + 1; nv->name.i_len = name_len; nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME; memcpy(nv->name.i_addr, name, name_len); + if (new_name_len) { + nv->new_name.i_addr = nv->name.i_addr + name_len; + nv->new_name.i_len = new_name_len; + memcpy(nv->new_name.i_addr, new_name, new_name_len); + } else { + nv->new_name.i_addr = NULL; + nv->new_name.i_len = 0; + } + nv->new_name.i_type = XLOG_REG_TYPE_ATTR_NEWNAME; + if (value_len) { - nv->value.i_addr = nv->name.i_addr + name_len; + nv->value.i_addr = nv->name.i_addr + name_len + new_name_len; nv->value.i_len = value_len; memcpy(nv->value.i_addr, value, value_len); } else { @@ -100,6 +116,17 @@ xfs_attri_log_nameval_alloc( } nv->value.i_type = XLOG_REG_TYPE_ATTR_VALUE; + if (new_value_len) { + nv->new_value.i_addr = nv->name.i_addr + name_len + + new_name_len + value_len; + nv->new_value.i_len = new_value_len; + memcpy(nv->new_value.i_addr, new_value, new_value_len); + } else { + nv->new_value.i_addr = NULL; + nv->new_value.i_len = 0; + } + nv->new_value.i_type = XLOG_REG_TYPE_ATTR_NEWVALUE; + refcount_set(&nv->refcount, 1); return nv; } @@ -145,11 +172,20 @@ xfs_attri_item_size( *nbytes += sizeof(struct xfs_attri_log_format) + xlog_calc_iovec_len(nv->name.i_len); - if (!nv->value.i_len) - return; + if (nv->new_name.i_len) { + *nvecs += 1; + *nbytes += xlog_calc_iovec_len(nv->new_name.i_len); + } - *nvecs += 1; - *nbytes += xlog_calc_iovec_len(nv->value.i_len); + if (nv->value.i_len) { + *nvecs += 1; + *nbytes += xlog_calc_iovec_len(nv->value.i_len); + } + + if (nv->new_value.i_len) { + *nvecs += 1; + *nbytes += xlog_calc_iovec_len(nv->new_value.i_len); + } } /* @@ -179,15 +215,28 @@ xfs_attri_item_format( ASSERT(nv->name.i_len > 0); attrip->attri_format.alfi_size++; + if (nv->new_name.i_len > 0) + attrip->attri_format.alfi_size++; + if (nv->value.i_len > 0) attrip->attri_format.alfi_size++; + if (nv->new_value.i_len > 0) + attrip->attri_format.alfi_size++; + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT, &attrip->attri_format, sizeof(struct xfs_attri_log_format)); xlog_copy_from_iovec(lv, &vecp, &nv->name); + + if (nv->new_name.i_len > 0) + xlog_copy_from_iovec(lv, &vecp, &nv->new_name); + if (nv->value.i_len > 0) xlog_copy_from_iovec(lv, &vecp, &nv->value); + + if (nv->new_value.i_len > 0) + xlog_copy_from_iovec(lv, &vecp, &nv->new_value); } /* @@ -308,6 +357,12 @@ xfs_attrd_item_intent( return &ATTRD_ITEM(lip)->attrd_attrip->attri_item; } +static inline unsigned int +xfs_attr_log_item_op(const struct xfs_attri_log_format *attrp) +{ + return attrp->alfi_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK; +} + /* Log an attr to the intent item. 
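Editor's sketch of the single allocation that xfs_attri_log_nameval_alloc() builds above: all four optional buffers are packed behind the header, at these offsets (this is a layout summary, not a struct definition):

/*
 *   [xfs_attri_log_nameval][name][new_name][value][new_value]
 *
 *   name.i_addr      = nv + 1
 *   new_name.i_addr  = name.i_addr + name_len               (PPTR_REPLACE)
 *   value.i_addr     = name.i_addr + name_len + new_name_len
 *   new_value.i_addr = value.i_addr + value_len             (PPTR_REPLACE)
 */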
*/ STATIC void xfs_attr_log_item( @@ -316,6 +371,8 @@ xfs_attr_log_item( const struct xfs_attr_intent *attr) { struct xfs_attri_log_format *attrp; + struct xfs_attri_log_nameval *nv = attr->xattri_nameval; + struct xfs_da_args *args = attr->xattri_da_args; /* * At this point the xfs_attr_intent has been constructed, and we've @@ -323,13 +380,30 @@ xfs_attr_log_item( * structure with fields from this xfs_attr_intent */ attrp = &attrip->attri_format; - attrp->alfi_ino = attr->xattri_da_args->dp->i_ino; + attrp->alfi_ino = args->dp->i_ino; ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK)); attrp->alfi_op_flags = attr->xattri_op_flags; - attrp->alfi_value_len = attr->xattri_nameval->value.i_len; - attrp->alfi_name_len = attr->xattri_nameval->name.i_len; - ASSERT(!(attr->xattri_da_args->attr_filter & ~XFS_ATTRI_FILTER_MASK)); - attrp->alfi_attr_filter = attr->xattri_da_args->attr_filter; + attrp->alfi_value_len = nv->value.i_len; + + switch (xfs_attr_log_item_op(attrp)) { + case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: + ASSERT(nv->value.i_len == nv->new_value.i_len); + + attrp->alfi_igen = VFS_I(args->dp)->i_generation; + attrp->alfi_old_name_len = nv->name.i_len; + attrp->alfi_new_name_len = nv->new_name.i_len; + break; + case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: + case XFS_ATTRI_OP_FLAGS_PPTR_SET: + attrp->alfi_igen = VFS_I(args->dp)->i_generation; + fallthrough; + default: + attrp->alfi_name_len = nv->name.i_len; + break; + } + + ASSERT(!(args->attr_filter & ~XFS_ATTRI_FILTER_MASK)); + attrp->alfi_attr_filter = args->attr_filter; } /* Get an ATTRI. */ @@ -368,8 +442,11 @@ xfs_attr_create_intent( * Transfer our reference to the name/value buffer to the * deferred work state structure. */ - attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name, - args->namelen, args->value, args->valuelen); + attr->xattri_nameval = xfs_attri_log_nameval_alloc( + args->name, args->namelen, + args->new_name, args->new_namelen, + args->value, args->valuelen, + args->new_value, args->new_valuelen); } attrip = xfs_attri_init(mp, attr->xattri_nameval); @@ -460,17 +537,19 @@ xfs_attri_item_match( return ATTRI_ITEM(lip)->attri_format.alfi_id == intent_id; } +static inline bool +xfs_attri_validate_namelen(unsigned int namelen) +{ + return namelen > 0 && namelen <= XATTR_NAME_MAX; +} + /* Is this recovered ATTRI format ok? 
*/ static inline bool xfs_attri_validate( struct xfs_mount *mp, struct xfs_attri_log_format *attrp) { - unsigned int op = attrp->alfi_op_flags & - XFS_ATTRI_OP_FLAGS_TYPE_MASK; - - if (attrp->__pad != 0) - return false; + unsigned int op = xfs_attr_log_item_op(attrp); if (attrp->alfi_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK) return false; @@ -478,24 +557,75 @@ xfs_attri_validate( if (attrp->alfi_attr_filter & ~XFS_ATTRI_FILTER_MASK) return false; - /* alfi_op_flags should be either a set or remove */ + if (!xfs_attr_check_namespace(attrp->alfi_attr_filter & + XFS_ATTR_NSP_ONDISK_MASK)) + return false; + switch (op) { + case XFS_ATTRI_OP_FLAGS_PPTR_SET: + case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: + if (!xfs_has_parent(mp)) + return false; + if (attrp->alfi_value_len != sizeof(struct xfs_parent_rec)) + return false; + if (!xfs_attri_validate_namelen(attrp->alfi_name_len)) + return false; + if (!(attrp->alfi_attr_filter & XFS_ATTR_PARENT)) + return false; + break; case XFS_ATTRI_OP_FLAGS_SET: case XFS_ATTRI_OP_FLAGS_REPLACE: + if (!xfs_is_using_logged_xattrs(mp)) + return false; + if (attrp->alfi_value_len > XATTR_SIZE_MAX) + return false; + if (!xfs_attri_validate_namelen(attrp->alfi_name_len)) + return false; + break; case XFS_ATTRI_OP_FLAGS_REMOVE: + if (!xfs_is_using_logged_xattrs(mp)) + return false; + if (attrp->alfi_value_len != 0) + return false; + if (!xfs_attri_validate_namelen(attrp->alfi_name_len)) + return false; + break; + case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: + if (!xfs_has_parent(mp)) + return false; + if (!xfs_attri_validate_namelen(attrp->alfi_old_name_len)) + return false; + if (!xfs_attri_validate_namelen(attrp->alfi_new_name_len)) + return false; + if (attrp->alfi_value_len != sizeof(struct xfs_parent_rec)) + return false; + if (!(attrp->alfi_attr_filter & XFS_ATTR_PARENT)) + return false; break; default: return false; } - if (attrp->alfi_value_len > XATTR_SIZE_MAX) - return false; + return xfs_verify_ino(mp, attrp->alfi_ino); +} - if ((attrp->alfi_name_len > XATTR_NAME_MAX) || - (attrp->alfi_name_len == 0)) - return false; +static int +xfs_attri_iread_extents( + struct xfs_inode *ip) +{ + struct xfs_trans *tp; + int error; - return xfs_verify_ino(mp, attrp->alfi_ino); + error = xfs_trans_alloc_empty(ip->i_mount, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_iread_extents(tp, ip, XFS_ATTR_FORK); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_trans_cancel(tp); + + return error; } static inline struct xfs_attr_intent * @@ -508,20 +638,46 @@ xfs_attri_recover_work( { struct xfs_attr_intent *attr; struct xfs_da_args *args; + struct xfs_inode *ip; int local; int error; - error = xlog_recover_iget(mp, attrp->alfi_ino, ipp); - if (error) - return ERR_PTR(error); + /* + * Parent pointer attr items record the generation but regular logged + * xattrs do not; select the right iget function. 
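Editor's condensation of the per-opcode requirements that xfs_attri_validate() enforces above (all ops additionally require a clean op_flags mask, a valid namespace in alfi_attr_filter, and xfs_verify_ino()):

/*
 *   opcode          feature gate                  name len            value len
 *   PPTR_SET        xfs_has_parent()              1..XATTR_NAME_MAX   sizeof(struct xfs_parent_rec)
 *   PPTR_REMOVE     xfs_has_parent()              1..XATTR_NAME_MAX   sizeof(struct xfs_parent_rec)
 *   PPTR_REPLACE    xfs_has_parent()              old and new valid   sizeof(struct xfs_parent_rec)
 *   SET, REPLACE    xfs_is_using_logged_xattrs()  1..XATTR_NAME_MAX   0..XATTR_SIZE_MAX
 *   REMOVE          xfs_is_using_logged_xattrs()  1..XATTR_NAME_MAX   0
 *
 * The three PPTR ops must also carry XFS_ATTR_PARENT in alfi_attr_filter.
 */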
+ */ + switch (xfs_attr_log_item_op(attrp)) { + case XFS_ATTRI_OP_FLAGS_PPTR_SET: + case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: + case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: + error = xlog_recover_iget_handle(mp, attrp->alfi_ino, + attrp->alfi_igen, &ip); + break; + default: + error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); + break; + } + if (error) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attrp, + sizeof(*attrp)); + return ERR_PTR(-EFSCORRUPTED); + } + + if (xfs_inode_has_attr_fork(ip)) { + error = xfs_attri_iread_extents(ip); + if (error) { + xfs_irele(ip); + return ERR_PTR(error); + } + } attr = kzalloc(sizeof(struct xfs_attr_intent) + sizeof(struct xfs_da_args), GFP_KERNEL | __GFP_NOFAIL); args = (struct xfs_da_args *)(attr + 1); attr->xattri_da_args = args; - attr->xattri_op_flags = attrp->alfi_op_flags & - XFS_ATTRI_OP_FLAGS_TYPE_MASK; + attr->xattri_op_flags = xfs_attr_log_item_op(attrp); /* * We're reconstructing the deferred work state structure from the @@ -531,35 +687,42 @@ xfs_attri_recover_work( attr->xattri_nameval = xfs_attri_log_nameval_get(nv); ASSERT(attr->xattri_nameval); - args->dp = *ipp; + args->dp = ip; args->geo = mp->m_attr_geo; args->whichfork = XFS_ATTR_FORK; args->name = nv->name.i_addr; args->namelen = nv->name.i_len; - args->hashval = xfs_da_hashname(args->name, args->namelen); + args->new_name = nv->new_name.i_addr; + args->new_namelen = nv->new_name.i_len; + args->value = nv->value.i_addr; + args->valuelen = nv->value.i_len; + args->new_value = nv->new_value.i_addr; + args->new_valuelen = nv->new_value.i_len; args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK; args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT | XFS_DA_OP_LOGGED; + args->owner = args->dp->i_ino; + xfs_attr_sethash(args); - ASSERT(xfs_sb_version_haslogxattrs(&mp->m_sb)); - - switch (attr->xattri_op_flags) { + switch (xfs_attr_intent_op(attr)) { + case XFS_ATTRI_OP_FLAGS_PPTR_SET: + case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: case XFS_ATTRI_OP_FLAGS_SET: case XFS_ATTRI_OP_FLAGS_REPLACE: - args->value = nv->value.i_addr; - args->valuelen = nv->value.i_len; args->total = xfs_attr_calc_size(args, &local); if (xfs_inode_hasattr(args->dp)) attr->xattri_dela_state = xfs_attr_init_replace_state(args); else attr->xattri_dela_state = xfs_attr_init_add_state(args); break; + case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: case XFS_ATTRI_OP_FLAGS_REMOVE: attr->xattri_dela_state = xfs_attr_init_remove_state(args); break; } xfs_defer_add_item(dfp, &attr->xattri_list); + *ipp = ip; return attr; } @@ -591,7 +754,8 @@ xfs_attr_recover_work( */ attrp = &attrip->attri_format; if (!xfs_attri_validate(mp, attrp) || - !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len) + !xfs_attr_namecheck(attrp->alfi_attr_filter, nv->name.i_addr, + nv->name.i_len)) return -EFSCORRUPTED; attr = xfs_attri_recover_work(mp, dfp, attrp, &ip, nv); @@ -614,16 +778,17 @@ xfs_attr_recover_work( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &attrip->attri_format, sizeof(attrip->attri_format)); - if (error) { - xfs_trans_cancel(tp); - goto out_unlock; - } + if (error) + goto out_cancel; error = xfs_defer_ops_capture_and_commit(tp, capture_list); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_irele(ip); return error; +out_cancel: + xfs_trans_cancel(tp); + goto out_unlock; } /* Re-log an intent item to push the log tail forward.
*/ @@ -649,9 +814,20 @@ xfs_attr_relog_intent( new_attrp = &new_attrip->attri_format; new_attrp->alfi_ino = old_attrp->alfi_ino; + new_attrp->alfi_igen = old_attrp->alfi_igen; new_attrp->alfi_op_flags = old_attrp->alfi_op_flags; new_attrp->alfi_value_len = old_attrp->alfi_value_len; - new_attrp->alfi_name_len = old_attrp->alfi_name_len; + + switch (xfs_attr_log_item_op(old_attrp)) { + case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: + new_attrp->alfi_new_name_len = old_attrp->alfi_new_name_len; + new_attrp->alfi_old_name_len = old_attrp->alfi_old_name_len; + break; + default: + new_attrp->alfi_name_len = old_attrp->alfi_name_len; + break; + } + new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter; return &new_attrip->attri_item; @@ -679,6 +855,75 @@ xfs_attr_create_done( return &attrdp->attrd_item; } +void +xfs_attr_defer_add( + struct xfs_da_args *args, + enum xfs_attr_defer_op op) +{ + struct xfs_attr_intent *new; + unsigned int log_op = 0; + bool is_pptr = args->attr_filter & XFS_ATTR_PARENT; + + if (is_pptr) { + ASSERT(xfs_has_parent(args->dp->i_mount)); + ASSERT((args->attr_filter & ~XFS_ATTR_PARENT) == 0); + ASSERT(args->op_flags & XFS_DA_OP_LOGGED); + ASSERT(args->valuelen == sizeof(struct xfs_parent_rec)); + } + + new = kmem_cache_zalloc(xfs_attr_intent_cache, + GFP_NOFS | __GFP_NOFAIL); + new->xattri_da_args = args; + + /* Compute log operation from the higher level op and namespace. */ + switch (op) { + case XFS_ATTR_DEFER_SET: + if (is_pptr) + log_op = XFS_ATTRI_OP_FLAGS_PPTR_SET; + else + log_op = XFS_ATTRI_OP_FLAGS_SET; + break; + case XFS_ATTR_DEFER_REPLACE: + if (is_pptr) + log_op = XFS_ATTRI_OP_FLAGS_PPTR_REPLACE; + else + log_op = XFS_ATTRI_OP_FLAGS_REPLACE; + break; + case XFS_ATTR_DEFER_REMOVE: + if (is_pptr) + log_op = XFS_ATTRI_OP_FLAGS_PPTR_REMOVE; + else + log_op = XFS_ATTRI_OP_FLAGS_REMOVE; + break; + default: + ASSERT(0); + break; + } + new->xattri_op_flags = log_op; + + /* Set up initial attr operation state. 
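Editor's sketch of a hypothetical caller of the xfs_attr_defer_add() interface being defined here (not from the patch): a parent pointer update goes through the same entry point as a regular logged xattr, and only the attr_filter/op_flags settings select the PPTR log opcode.

static void example_defer_pptr_set(struct xfs_da_args *args)
{
	/* value must already be a struct xfs_parent_rec, per the asserts */
	args->attr_filter = XFS_ATTR_PARENT;
	args->op_flags |= XFS_DA_OP_LOGGED;

	/* logs an XFS_ATTRI_OP_FLAGS_PPTR_SET intent item */
	xfs_attr_defer_add(args, XFS_ATTR_DEFER_SET);
}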
*/ + switch (log_op) { + case XFS_ATTRI_OP_FLAGS_PPTR_SET: + case XFS_ATTRI_OP_FLAGS_SET: + new->xattri_dela_state = xfs_attr_init_add_state(args); + break; + case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: + ASSERT(args->new_valuelen == args->valuelen); + new->xattri_dela_state = xfs_attr_init_replace_state(args); + break; + case XFS_ATTRI_OP_FLAGS_REPLACE: + new->xattri_dela_state = xfs_attr_init_replace_state(args); + break; + case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: + case XFS_ATTRI_OP_FLAGS_REMOVE: + new->xattri_dela_state = xfs_attr_init_remove_state(args); + break; + } + + xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type); + trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); +} + const struct xfs_defer_op_type xfs_attr_defer_type = { .name = "attr", .max_items = 1, @@ -691,6 +936,56 @@ const struct xfs_defer_op_type xfs_attr_defer_type = { .relog_intent = xfs_attr_relog_intent, }; +static inline void * +xfs_attri_validate_name_iovec( + struct xfs_mount *mp, + struct xfs_attri_log_format *attri_formatp, + const struct xfs_log_iovec *iovec, + unsigned int name_len) +{ + if (iovec->i_len != xlog_calc_iovec_len(name_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, sizeof(*attri_formatp)); + return NULL; + } + + if (!xfs_attr_namecheck(attri_formatp->alfi_attr_filter, iovec->i_addr, + name_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, sizeof(*attri_formatp)); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + iovec->i_addr, iovec->i_len); + return NULL; + } + + return iovec->i_addr; +} + +static inline void * +xfs_attri_validate_value_iovec( + struct xfs_mount *mp, + struct xfs_attri_log_format *attri_formatp, + const struct xfs_log_iovec *iovec, + unsigned int value_len) +{ + if (iovec->i_len != xlog_calc_iovec_len(value_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, sizeof(*attri_formatp)); + return NULL; + } + + if ((attri_formatp->alfi_attr_filter & XFS_ATTR_PARENT) && + !xfs_parent_valuecheck(mp, iovec->i_addr, value_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, sizeof(*attri_formatp)); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + iovec->i_addr, iovec->i_len); + return NULL; + } + + return iovec->i_addr; +} + STATIC int xlog_recover_attri_commit_pass2( struct xlog *log, @@ -702,51 +997,177 @@ xlog_recover_attri_commit_pass2( struct xfs_attri_log_item *attrip; struct xfs_attri_log_format *attri_formatp; struct xfs_attri_log_nameval *nv; - const void *attr_value = NULL; const void *attr_name; + const void *attr_value = NULL; + const void *attr_new_name = NULL; + const void *attr_new_value = NULL; size_t len; - - attri_formatp = item->ri_buf[0].i_addr; - attr_name = item->ri_buf[1].i_addr; + unsigned int name_len = 0; + unsigned int value_len = 0; + unsigned int new_name_len = 0; + unsigned int new_value_len = 0; + unsigned int op, i = 0; /* Validate xfs_attri_log_format before the large memory allocation */ len = sizeof(struct xfs_attri_log_format); - if (item->ri_buf[0].i_len != len) { + if (item->ri_buf[i].i_len != len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } + attri_formatp = item->ri_buf[i].i_addr; if (!xfs_attri_validate(mp, attri_formatp)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + attri_formatp, len); return -EFSCORRUPTED; } - /* Validate the attr name */ - if 
(item->ri_buf[1].i_len != - xlog_calc_iovec_len(attri_formatp->alfi_name_len)) { + /* Check the number of log iovecs makes sense for the op code. */ + op = xfs_attr_log_item_op(attri_formatp); + switch (op) { + case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: + case XFS_ATTRI_OP_FLAGS_PPTR_SET: + /* Log item, attr name, attr value */ + if (item->ri_total != 3) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + name_len = attri_formatp->alfi_name_len; + value_len = attri_formatp->alfi_value_len; + break; + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: + /* Log item, attr name, attr value */ + if (item->ri_total != 3) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + name_len = attri_formatp->alfi_name_len; + value_len = attri_formatp->alfi_value_len; + break; + case XFS_ATTRI_OP_FLAGS_REMOVE: + /* Log item, attr name */ + if (item->ri_total != 2) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + name_len = attri_formatp->alfi_name_len; + break; + case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: + /* + * Log item, attr name, new attr name, attr value, new attr + * value + */ + if (item->ri_total != 5) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + name_len = attri_formatp->alfi_old_name_len; + new_name_len = attri_formatp->alfi_new_name_len; + new_value_len = value_len = attri_formatp->alfi_value_len; + break; + default: XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + attri_formatp, len); return -EFSCORRUPTED; } + i++; - if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[1].i_addr, item->ri_buf[1].i_len); + /* Validate the attr name */ + attr_name = xfs_attri_validate_name_iovec(mp, attri_formatp, + &item->ri_buf[i], name_len); + if (!attr_name) return -EFSCORRUPTED; + i++; + + /* Validate the new attr name */ + if (new_name_len > 0) { + attr_new_name = xfs_attri_validate_name_iovec(mp, + attri_formatp, &item->ri_buf[i], + new_name_len); + if (!attr_new_name) + return -EFSCORRUPTED; + i++; } /* Validate the attr value, if present */ - if (attri_formatp->alfi_value_len != 0) { - if (item->ri_buf[2].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) { + if (value_len != 0) { + attr_value = xfs_attri_validate_value_iovec(mp, attri_formatp, + &item->ri_buf[i], value_len); + if (!attr_value) + return -EFSCORRUPTED; + i++; + } + + /* Validate the new attr value, if present */ + if (new_value_len != 0) { + attr_new_value = xfs_attri_validate_value_iovec(mp, + attri_formatp, &item->ri_buf[i], + new_value_len); + if (!attr_new_value) + return -EFSCORRUPTED; + i++; + } + + /* + * Make sure we got the correct number of buffers for the operation + * that we just loaded. + */ + if (i != item->ri_total) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + + switch (op) { + case XFS_ATTRI_OP_FLAGS_REMOVE: + /* Regular remove operations operate only on names. 
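For reference, the iovec layouts that the recovery code above expects for each opcode (index 0 is always the log format struct; the count in parentheses is item->ri_total):

/*
 *   REMOVE:                  [format][name]                         (2)
 *   SET, REPLACE:            [format][name][value]                  (3)
 *   PPTR_SET, PPTR_REMOVE:   [format][name][parent_rec]             (3)
 *   PPTR_REPLACE:            [format][old_name][new_name]
 *                            [old_value][new_value]                 (5)
 */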
*/ + if (attr_value != NULL || value_len != 0) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, - item->ri_buf[0].i_len); + attri_formatp, len); return -EFSCORRUPTED; } - - attr_value = item->ri_buf[2].i_addr; + fallthrough; + case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: + case XFS_ATTRI_OP_FLAGS_PPTR_SET: + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: + /* + * Regular xattr set/remove/replace operations require a name + * and do not take a newname. Values are optional for set and + * replace. + * + * Name-value set/remove operations must have a name, do not + * take a newname, and can take a value. + */ + if (attr_name == NULL || name_len == 0) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + break; + case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: + /* + * Name-value replace operations require the caller to + * specify the old and new names and values explicitly. + * Values are optional. + */ + if (attr_name == NULL || name_len == 0) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + if (attr_new_name == NULL || new_name_len == 0) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + break; } /* @@ -754,9 +1175,10 @@ xlog_recover_attri_commit_pass2( * name/value buffer to the recovered incore log item and drop our * reference. */ - nv = xfs_attri_log_nameval_alloc(attr_name, - attri_formatp->alfi_name_len, attr_value, - attri_formatp->alfi_value_len); + nv = xfs_attri_log_nameval_alloc(attr_name, name_len, + attr_new_name, new_name_len, + attr_value, value_len, + attr_new_value, new_value_len); attrip = xfs_attri_init(mp, nv); memcpy(&attrip->attri_format, attri_formatp, len); diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h index 3280a7930287..e74128cbb722 100644 --- a/fs/xfs/xfs_attr_item.h +++ b/fs/xfs/xfs_attr_item.h @@ -13,7 +13,9 @@ struct kmem_zone; struct xfs_attri_log_nameval { struct xfs_log_iovec name; + struct xfs_log_iovec new_name; /* PPTR_REPLACE only */ struct xfs_log_iovec value; + struct xfs_log_iovec new_value; /* PPTR_REPLACE only */ refcount_t refcount; /* name and value follow the end of this struct */ @@ -51,4 +53,12 @@ struct xfs_attrd_log_item { extern struct kmem_cache *xfs_attri_cache; extern struct kmem_cache *xfs_attrd_cache; +enum xfs_attr_defer_op { + XFS_ATTR_DEFER_SET, + XFS_ATTR_DEFER_REMOVE, + XFS_ATTR_DEFER_REPLACE, +}; + +void xfs_attr_defer_add(struct xfs_da_args *args, enum xfs_attr_defer_op op); + #endif /* __XFS_ATTR_ITEM_H__ */ diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index a6819a642cc0..5c947e5ce8b8 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -82,7 +82,8 @@ xfs_attr_shortform_list( (dp->i_af.if_bytes + sf->count * 16) < context->bufsize)) { for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) { if (XFS_IS_CORRUPT(context->dp->i_mount, - !xfs_attr_namecheck(sfe->nameval, + !xfs_attr_namecheck(sfe->flags, + sfe->nameval, sfe->namelen))) { xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK); return -EFSCORRUPTED; @@ -91,6 +92,7 @@ xfs_attr_shortform_list( sfe->flags, sfe->nameval, (int)sfe->namelen, + &sfe->nameval[sfe->namelen], (int)sfe->valuelen); /* * Either search callback finished early or @@ -122,7 +124,8 @@ xfs_attr_shortform_list( for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) { if (unlikely( ((char *)sfe < (char *)sf) || - ((char *)sfe >= ((char 
*)sf + dp->i_af.if_bytes)))) { + ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)) || + !xfs_attr_check_namespace(sfe->flags))) { XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", XFS_ERRLEVEL_LOW, context->dp->i_mount, sfe, @@ -133,12 +136,16 @@ xfs_attr_shortform_list( } sbp->entno = i; - sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen); sbp->name = sfe->nameval; sbp->namelen = sfe->namelen; /* These are bytes, and both on-disk, don't endian-flip */ + sbp->value = &sfe->nameval[sfe->namelen], sbp->valuelen = sfe->valuelen; sbp->flags = sfe->flags; + sbp->hash = xfs_attr_hashval(dp->i_mount, sfe->flags, + sfe->nameval, sfe->namelen, + sfe->nameval + sfe->namelen, + sfe->valuelen); sfe = xfs_attr_sf_nextentry(sfe); sbp++; nsbuf++; @@ -177,7 +184,7 @@ xfs_attr_shortform_list( cursor->offset = 0; } if (XFS_IS_CORRUPT(context->dp->i_mount, - !xfs_attr_namecheck(sbp->name, + !xfs_attr_namecheck(sbp->flags, sbp->name, sbp->namelen))) { xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK); error = -EFSCORRUPTED; @@ -187,6 +194,7 @@ xfs_attr_shortform_list( sbp->flags, sbp->name, sbp->namelen, + sbp->value, sbp->valuelen); if (context->seen_enough) break; @@ -214,6 +222,7 @@ xfs_attr_node_list_lookup( struct xfs_mount *mp = dp->i_mount; struct xfs_trans *tp = context->tp; struct xfs_buf *bp; + xfs_failaddr_t fa; int i; int error = 0; unsigned int expected_level = 0; @@ -238,6 +247,10 @@ xfs_attr_node_list_lookup( goto out_corruptbuf; } + fa = xfs_da3_node_header_check(bp, dp->i_ino); + if (fa) + goto out_corruptbuf; + xfs_da3_node_hdr_from_disk(mp, &nodehdr, node); /* Tree taller than we can handle; bail out! */ @@ -273,6 +286,12 @@ xfs_attr_node_list_lookup( } } + fa = xfs_attr3_leaf_header_check(bp, dp->i_ino); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + goto out_releasebuf; + } + if (expected_level != 0) goto out_corruptbuf; @@ -281,6 +300,7 @@ xfs_attr_node_list_lookup( out_corruptbuf: xfs_buf_mark_corrupt(bp); +out_releasebuf: xfs_trans_brelse(tp, bp); xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); return -EFSCORRUPTED; @@ -297,6 +317,7 @@ xfs_attr_node_list( struct xfs_buf *bp; struct xfs_inode *dp = context->dp; struct xfs_mount *mp = dp->i_mount; + xfs_failaddr_t fa; int error = 0; trace_xfs_attr_node_list(context); @@ -310,46 +331,60 @@ xfs_attr_node_list( */ bp = NULL; if (cursor->blkno > 0) { + struct xfs_attr_leaf_entry *entries; + error = xfs_da3_node_read(context->tp, dp, cursor->blkno, &bp, XFS_ATTR_FORK); if (xfs_metadata_is_sick(error)) xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); - if ((error != 0) && (error != -EFSCORRUPTED)) + if (error != 0 && error != -EFSCORRUPTED) return error; - if (bp) { - struct xfs_attr_leaf_entry *entries; + if (!bp) + goto need_lookup; - node = bp->b_addr; - switch (be16_to_cpu(node->hdr.info.magic)) { - case XFS_DA_NODE_MAGIC: - case XFS_DA3_NODE_MAGIC: - trace_xfs_attr_list_wrong_blk(context); + node = bp->b_addr; + switch (be16_to_cpu(node->hdr.info.magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + trace_xfs_attr_list_wrong_blk(context); + fa = xfs_da3_node_header_check(bp, dp->i_ino); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); + } + xfs_trans_brelse(context->tp, bp); + bp = NULL; + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + leaf = bp->b_addr; + fa = xfs_attr3_leaf_header_check(bp, dp->i_ino); + if (fa) { + __xfs_buf_mark_corrupt(bp, fa); xfs_trans_brelse(context->tp, bp); + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); bp = NULL; break; - case XFS_ATTR_LEAF_MAGIC: - 
case XFS_ATTR3_LEAF_MAGIC: - leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, - &leafhdr, leaf); - entries = xfs_attr3_leaf_entryp(leaf); - if (cursor->hashval > be32_to_cpu( - entries[leafhdr.count - 1].hashval)) { - trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(context->tp, bp); - bp = NULL; - } else if (cursor->hashval <= be32_to_cpu( - entries[0].hashval)) { - trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(context->tp, bp); - bp = NULL; - } - break; - default: + } + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, + &leafhdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + if (cursor->hashval > be32_to_cpu( + entries[leafhdr.count - 1].hashval)) { + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(context->tp, bp); + bp = NULL; + } else if (cursor->hashval <= be32_to_cpu( + entries[0].hashval)) { trace_xfs_attr_list_wrong_blk(context); xfs_trans_brelse(context->tp, bp); bp = NULL; } + break; + default: + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(context->tp, bp); + bp = NULL; } } @@ -359,6 +394,7 @@ xfs_attr_node_list( * Note that start of node block is same as start of leaf block. */ if (bp == NULL) { +need_lookup: error = xfs_attr_node_list_lookup(context, cursor, &bp); if (error || !bp) return error; @@ -380,8 +416,8 @@ xfs_attr_node_list( break; cursor->blkno = leafhdr.forw; xfs_trans_brelse(context->tp, bp); - error = xfs_attr3_leaf_read(context->tp, dp, cursor->blkno, - &bp); + error = xfs_attr3_leaf_read(context->tp, dp, dp->i_ino, + cursor->blkno, &bp); if (error) return error; } @@ -446,6 +482,7 @@ xfs_attr3_leaf_list_int( */ for (; i < ichdr.count; entry++, i++) { char *name; + void *value; int namelen, valuelen; if (be32_to_cpu(entry->hashval) != cursor->hashval) { @@ -463,6 +500,7 @@ xfs_attr3_leaf_list_int( name_loc = xfs_attr3_leaf_name_local(leaf, i); name = name_loc->nameval; namelen = name_loc->namelen; + value = &name_loc->nameval[name_loc->namelen]; valuelen = be16_to_cpu(name_loc->valuelen); } else { xfs_attr_leaf_name_remote_t *name_rmt; @@ -470,16 +508,18 @@ xfs_attr3_leaf_list_int( name_rmt = xfs_attr3_leaf_name_remote(leaf, i); name = name_rmt->name; namelen = name_rmt->namelen; + value = NULL; valuelen = be32_to_cpu(name_rmt->valuelen); } if (XFS_IS_CORRUPT(context->dp->i_mount, - !xfs_attr_namecheck(name, namelen))) { + !xfs_attr_namecheck(entry->flags, name, + namelen))) { xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK); return -EFSCORRUPTED; } context->put_listent(context, entry->flags, - name, namelen, valuelen); + name, namelen, value, valuelen); if (context->seen_enough) break; cursor->offset++; @@ -501,7 +541,8 @@ xfs_attr_leaf_list( trace_xfs_attr_leaf_list(context); context->cursor.blkno = 0; - error = xfs_attr3_leaf_read(context->tp, context->dp, 0, &bp); + error = xfs_attr3_leaf_read(context->tp, context->dp, + context->dp->i_ino, 0, &bp); if (error) return error; @@ -515,6 +556,7 @@ xfs_attr_list_ilocked( struct xfs_attr_list_context *context) { struct xfs_inode *dp = context->dp; + int error; xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); @@ -525,6 +567,12 @@ xfs_attr_list_ilocked( return 0; if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_list(context); + + /* Prerequisite for xfs_attr_is_leaf */ + error = xfs_iread_extents(NULL, dp, XFS_ATTR_FORK); + if (error) + return error; + if (xfs_attr_is_leaf(dp)) return xfs_attr_leaf_list(context); return xfs_attr_node_list(context); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 
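Editor's note: the hunks that follow all make the same mechanical substitution, replacing an open-coded two-step extent-count check with one helper. The old pattern was easy to get subtly wrong; the xfs_swap_extent_rmap hunk below, for instance, removes code that checked tip for overflow but then upgraded ip. A sketch of the before/after shape (identifiers as in the call sites below):

	/* before: every caller repeated the -EFBIG upgrade dance */
	error = xfs_iext_count_may_overflow(ip, whichfork, nr);
	if (error == -EFBIG)
		error = xfs_iext_count_upgrade(tp, ip, nr);

	/* after: one helper checks and, where possible, upgrades to
	 * large extent counters internally */
	error = xfs_iext_count_extend(tp, ip, whichfork, nr);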
d27859a684aa..a19d62e78aa1 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -524,9 +524,7 @@ xfs_bmap_recover_work( else iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; - error = xfs_iext_count_may_overflow(ip, work->bi_whichfork, iext_delta); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, iext_delta); + error = xfs_iext_count_extend(tp, ip, work->bi_whichfork, iext_delta); if (error) goto err_cancel; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 19e11d1da660..ac2e77ebb54c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -440,7 +440,7 @@ out_unlock_iolock: * if the ranges only partially overlap them, so it is up to the caller to * ensure that partial blocks are not passed in. */ -int +void xfs_bmap_punch_delalloc_range( struct xfs_inode *ip, xfs_off_t start_byte, @@ -452,7 +452,6 @@ xfs_bmap_punch_delalloc_range( xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; - int error = 0; ASSERT(!xfs_need_iread_extents(ifp)); @@ -476,15 +475,13 @@ xfs_bmap_punch_delalloc_range( continue; } - error = xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur, - &got, &del); - if (error || !xfs_iext_get_extent(ifp, &icur, &got)) + xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur, &got, &del); + if (!xfs_iext_get_extent(ifp, &icur, &got)) break; } out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; } /* @@ -542,7 +539,7 @@ xfs_can_free_eofblocks( * forever. */ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); - if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) + if (xfs_inode_has_bigrtalloc(ip)) end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb); last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) @@ -713,41 +710,37 @@ xfs_alloc_file_space( if (error) break; - error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, - XFS_IEXT_ADD_NOSPLIT_CNT); - if (error) - goto error; - - error = xfs_bmapi_write(tp, ip, startoffset_fsb, - allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp, - &nimaps); if (error) goto error; - ip->i_diflags |= XFS_DIFLAG_PREALLOC; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - break; - /* * If the allocator cannot find a single free extent large * enough to cover the start block of the requested range, - * xfs_bmapi_write will return 0 but leave *nimaps set to 0. + * xfs_bmapi_write will return -ENOSR. * * In that case we simply need to keep looping with the same * startoffset_fsb so that one of the following allocations * will eventually reach the requested range. 
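Editor's note on the -ENOSR convention referenced in the comment above: it replaces the old "return 0 with *nimaps == 0" signal, so a short allocation can no longer be mistaken for success. Callers that can make partial progress, like this loop, translate it back into a retry. A fragment paraphrasing the hunk below:

	error = xfs_bmapi_write(tp, ip, startoffset_fsb, allocatesize_fsb,
			XFS_BMAPI_PREALLOC, 0, imapp, &nimaps);
	if (error == -ENOSR)
		error = 0;	/* nothing usable at this offset; loop again */
	else if (error)
		goto error;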
*/ - if (nimaps) { + error = xfs_bmapi_write(tp, ip, startoffset_fsb, + allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp, + &nimaps); + if (error) { + if (error != -ENOSR) + goto error; + error = 0; + } else { startoffset_fsb += imapp->br_blockcount; allocatesize_fsb -= imapp->br_blockcount; } + + ip->i_diflags |= XFS_DIFLAG_PREALLOC; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); } return error; @@ -775,10 +768,8 @@ xfs_unmap_extent( if (error) return error; - error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_PUNCH_HOLE_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT); if (error) goto out_trans_cancel; @@ -843,7 +834,7 @@ xfs_free_file_space( endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); /* We can only free complete realtime extents. */ - if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) { + if (xfs_inode_has_bigrtalloc(ip)) { startoffset_fsb = xfs_rtb_roundup_rtx(mp, startoffset_fsb); endoffset_fsb = xfs_rtb_rounddown_rtx(mp, endoffset_fsb); } @@ -1054,10 +1045,8 @@ xfs_insert_file_space( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_PUNCH_HOLE_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT); if (error) goto out_trans_cancel; @@ -1283,23 +1272,17 @@ xfs_swap_extent_rmap( trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec); if (xfs_bmap_is_real_extent(&uirec)) { - error = xfs_iext_count_may_overflow(ip, + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_SWAP_RMAP_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, - XFS_IEXT_SWAP_RMAP_CNT); if (error) goto out; } if (xfs_bmap_is_real_extent(&irec)) { - error = xfs_iext_count_may_overflow(tip, + error = xfs_iext_count_extend(tp, tip, XFS_DATA_FORK, XFS_IEXT_SWAP_RMAP_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, - XFS_IEXT_SWAP_RMAP_CNT); if (error) goto out; } diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 77ecbb753ef2..51f84d8ff372 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -30,7 +30,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap) } #endif /* CONFIG_XFS_RT */ -int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, +void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, xfs_off_t start_byte, xfs_off_t end_byte); struct kgetbmap { diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index f0fa02264eda..8a0151e23f3d 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -494,6 +494,9 @@ _xfs_buf_obj_cmp( * it stale has not yet committed. i.e. we are * reallocating a busy extent. Skip this buffer and * continue searching for an exact match. + * + * Note: If we're scanning for incore buffers to stale, don't + * complain if we find non-stale buffers. 
*/ if (!(map->bm_flags & XBM_LIVESCAN)) ASSERT(bp->b_flags & XBF_STALE); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index cf9296b7e06f..06ac5a7de60a 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -157,7 +157,7 @@ xfs_dir2_block_getdents( if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) return 0; - error = xfs_dir3_block_read(args->trans, dp, &bp); + error = xfs_dir3_block_read(args->trans, dp, args->owner, &bp); if (error) return error; @@ -282,7 +282,8 @@ xfs_dir2_leaf_readbuf( new_off = xfs_dir2_da_to_byte(geo, map.br_startoff); if (new_off > *cur_off) *cur_off = new_off; - error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, 0, &bp); + error = xfs_dir3_data_read(args->trans, dp, args->owner, + map.br_startoff, 0, &bp); if (error) goto out; @@ -515,7 +516,6 @@ xfs_readdir( { struct xfs_da_args args = { NULL }; unsigned int lock_mode; - bool isblock; int error; trace_xfs_readdir(dp); @@ -532,23 +532,24 @@ xfs_readdir( args.dp = dp; args.geo = dp->i_mount->m_dir_geo; args.trans = tp; + args.owner = dp->i_ino; if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) return xfs_dir2_sf_getdents(&args, ctx); lock_mode = xfs_ilock_data_map_shared(dp); - error = xfs_dir2_isblock(&args, &isblock); - if (error) - goto out_unlock; - - if (isblock) { + switch (xfs_dir2_format(&args, &error)) { + case XFS_DIR2_FMT_BLOCK: error = xfs_dir2_block_getdents(&args, ctx, &lock_mode); - goto out_unlock; + break; + case XFS_DIR2_FMT_LEAF: + case XFS_DIR2_FMT_NODE: + error = xfs_dir2_leaf_getdents(&args, ctx, bufsize, &lock_mode); + break; + default: + break; } - error = xfs_dir2_leaf_getdents(&args, ctx, bufsize, &lock_mode); - -out_unlock: if (lock_mode) xfs_iunlock(dp, lock_mode); return error; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 268bb734dc0a..25fe3b932b5a 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -145,14 +145,18 @@ xfs_discard_extents( return error; } +struct xfs_trim_cur { + xfs_agblock_t start; + xfs_extlen_t count; + xfs_agblock_t end; + xfs_extlen_t minlen; + bool by_bno; +}; static int xfs_trim_gather_extents( struct xfs_perag *pag, - xfs_daddr_t start, - xfs_daddr_t end, - xfs_daddr_t minlen, - struct xfs_alloc_rec_incore *tcur, + struct xfs_trim_cur *tcur, struct xfs_busy_extents *extents, uint64_t *blocks_trimmed) { @@ -179,21 +183,26 @@ xfs_trim_gather_extents( if (error) goto out_trans_cancel; - cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag); - - /* - * Look up the extent length requested in the AGF and start with it. 
- */ - if (tcur->ar_startblock == NULLAGBLOCK) - error = xfs_alloc_lookup_ge(cur, 0, tcur->ar_blockcount, &i); - else - error = xfs_alloc_lookup_le(cur, tcur->ar_startblock, - tcur->ar_blockcount, &i); + if (tcur->by_bno) { + /* sub-AG discard request always starts at tcur->start */ + cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag); + error = xfs_alloc_lookup_le(cur, tcur->start, 0, &i); + if (!error && !i) + error = xfs_alloc_lookup_ge(cur, tcur->start, 0, &i); + } else if (tcur->start == 0) { + /* first time through a by-len starts with max length */ + cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag); + error = xfs_alloc_lookup_ge(cur, 0, tcur->count, &i); + } else { + /* nth time through a by-len starts where we left off */ + cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag); + error = xfs_alloc_lookup_le(cur, tcur->start, tcur->count, &i); + } if (error) goto out_del_cursor; if (i == 0) { /* nothing of that length left in the AG, we are done */ - tcur->ar_blockcount = 0; + tcur->count = 0; goto out_del_cursor; } @@ -204,8 +213,6 @@ xfs_trim_gather_extents( while (i) { xfs_agblock_t fbno; xfs_extlen_t flen; - xfs_daddr_t dbno; - xfs_extlen_t dlen; error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); if (error) @@ -221,38 +228,46 @@ xfs_trim_gather_extents( * Update the cursor to point at this extent so we * restart the next batch from this extent. */ - tcur->ar_startblock = fbno; - tcur->ar_blockcount = flen; - break; - } - - /* - * use daddr format for all range/len calculations as that is - * the format the range/len variables are supplied in by - * userspace. - */ - dbno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, fbno); - dlen = XFS_FSB_TO_BB(mp, flen); - - /* - * Too small? Give up. - */ - if (dlen < minlen) { - trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen); - tcur->ar_blockcount = 0; + tcur->start = fbno; + tcur->count = flen; break; } /* * If the extent is entirely outside of the range we are - * supposed to discard skip it. Do not bother to trim - * down partially overlapping ranges for now. + * supposed to discard, skip it. Do not bother to trim down partially + * overlapping ranges for now. */ - if (dbno + dlen < start || dbno > end) { + if (fbno + flen < tcur->start) { + trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen); + goto next_extent; + } + if (fbno > tcur->end) { trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen); + if (tcur->by_bno) { + tcur->count = 0; + break; + } goto next_extent; } + /* Trim the extent returned to the range we want. */ + if (fbno < tcur->start) { + flen -= tcur->start - fbno; + fbno = tcur->start; + } + if (fbno + flen > tcur->end + 1) + flen = tcur->end - fbno + 1; + + /* Too small? Give up. */ + if (flen < tcur->minlen) { + trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen); + if (tcur->by_bno) + goto next_extent; + tcur->count = 0; + break; + } + /* * If any blocks in the range are still busy, skip the * discard and try again the next time. @@ -266,7 +281,10 @@ xfs_trim_gather_extents( &extents->extent_list); *blocks_trimmed += flen; next_extent: - error = xfs_btree_decrement(cur, 0, &i); + if (tcur->by_bno) + error = xfs_btree_increment(cur, 0, &i); + else + error = xfs_btree_decrement(cur, 0, &i); if (error) break; @@ -276,7 +294,7 @@ next_extent: * is no more extents to search. 
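The gathering loop above is resumable: struct xfs_trim_cur records where a batch stopped, so the next batch, taken after the AGF lock has been cycled and the discards issued, continues from that point. A minimal userspace sketch of the same batched, resumable scan, with an invented free-extent table standing in for the by-bno btree walk (the kernel's by-len mode is the same idea in descending length order):

	#include <inttypes.h>
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	struct ext { uint64_t bno, len; };

	/* Scan state that survives between batches, like struct xfs_trim_cur. */
	struct cur {
		uint64_t start;		/* first block to consider next time */
		bool done;
	};

	#define BATCH 2	/* pretend only two extents may be gathered at once */

	static void gather(const struct ext *tbl, int n, struct cur *c)
	{
		int emitted = 0;

		for (int i = 0; i < n; i++) {
			if (tbl[i].bno < c->start)
				continue;	/* handled by an earlier batch */
			if (emitted == BATCH) {
				c->start = tbl[i].bno;	/* resume point */
				return;
			}
			printf("discard bno=%" PRIu64 " len=%" PRIu64 "\n",
			       tbl[i].bno, tbl[i].len);
			emitted++;
		}
		c->done = true;
	}

	int main(void)
	{
		const struct ext tbl[] = {
			{ 10, 4 }, { 30, 8 }, { 50, 2 }, { 90, 16 }, { 120, 5 },
		};
		struct cur c = { .start = 0 };

		while (!c.done)
			gather(tbl, 5, &c);	/* locks would be cycled here */
		return 0;
	}

The kernel version additionally clips each extent to the requested range and skips busy extents, but the resume-from-cursor control flow is the same.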
*/ if (i == 0) - tcur->ar_blockcount = 0; + tcur->count = 0; } /* @@ -306,17 +324,22 @@ xfs_trim_should_stop(void) static int xfs_trim_extents( struct xfs_perag *pag, - xfs_daddr_t start, - xfs_daddr_t end, - xfs_daddr_t minlen, + xfs_agblock_t start, + xfs_agblock_t end, + xfs_extlen_t minlen, uint64_t *blocks_trimmed) { - struct xfs_alloc_rec_incore tcur = { - .ar_blockcount = pag->pagf_longest, - .ar_startblock = NULLAGBLOCK, + struct xfs_trim_cur tcur = { + .start = start, + .count = pag->pagf_longest, + .end = end, + .minlen = minlen, }; int error = 0; + if (start != 0 || end != pag->block_count) + tcur.by_bno = true; + do { struct xfs_busy_extents *extents; @@ -330,8 +353,8 @@ xfs_trim_extents( extents->owner = extents; INIT_LIST_HEAD(&extents->extent_list); - error = xfs_trim_gather_extents(pag, start, end, minlen, - &tcur, extents, blocks_trimmed); + error = xfs_trim_gather_extents(pag, &tcur, extents, + blocks_trimmed); if (error) { kfree(extents); break; @@ -354,7 +377,7 @@ xfs_trim_extents( if (xfs_trim_should_stop()) break; - } while (tcur.ar_blockcount != 0); + } while (tcur.count != 0); return error; @@ -378,8 +401,10 @@ xfs_ioc_trim( unsigned int granularity = bdev_discard_granularity(mp->m_ddev_targp->bt_bdev); struct fstrim_range range; - xfs_daddr_t start, end, minlen; - xfs_agnumber_t agno; + xfs_daddr_t start, end; + xfs_extlen_t minlen; + xfs_agnumber_t start_agno, end_agno; + xfs_agblock_t start_agbno, end_agbno; uint64_t blocks_trimmed = 0; int error, last_error = 0; @@ -399,7 +424,8 @@ xfs_ioc_trim( return -EFAULT; range.minlen = max_t(u64, granularity, range.minlen); - minlen = BTOBB(range.minlen); + minlen = XFS_B_TO_FSB(mp, range.minlen); + /* * Truncating down the len isn't actually quite correct, but using * BBTOB would mean we trivially get overflows for values @@ -413,15 +439,21 @@ xfs_ioc_trim( return -EINVAL; start = BTOBB(range.start); - end = start + BTOBBT(range.len) - 1; + end = min_t(xfs_daddr_t, start + BTOBBT(range.len), + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) - 1; + + start_agno = xfs_daddr_to_agno(mp, start); + start_agbno = xfs_daddr_to_agbno(mp, start); + end_agno = xfs_daddr_to_agno(mp, end); + end_agbno = xfs_daddr_to_agbno(mp, end); - if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1) - end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1; + for_each_perag_range(mp, start_agno, end_agno, pag) { + xfs_agblock_t agend = pag->block_count; - agno = xfs_daddr_to_agno(mp, start); - for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) { - error = xfs_trim_extents(pag, start, end, minlen, - &blocks_trimmed); + if (start_agno == end_agno) + agend = end_agbno; + error = xfs_trim_extents(pag, start_agbno, agend, minlen, + &blocks_trimmed); if (error) last_error = error; @@ -429,6 +461,7 @@ xfs_ioc_trim( xfs_perag_rele(pag); break; } + start_agbno = 0; } if (last_error) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index c98cb468c357..c1b211c260a9 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -341,11 +341,8 @@ xfs_dquot_disk_alloc( goto err_cancel; } - error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK, + error = xfs_iext_count_extend(tp, quotip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, quotip, - XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto err_cancel; @@ -357,7 +354,6 @@ xfs_dquot_disk_alloc( goto err_cancel; ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); - ASSERT(nmaps == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != 
HOLESTARTBLOCK)); @@ -1371,6 +1367,47 @@ xfs_dqlock2( } } +static int +xfs_dqtrx_cmp( + const void *a, + const void *b) +{ + const struct xfs_dqtrx *qa = a; + const struct xfs_dqtrx *qb = b; + + if (qa->qt_dquot->q_id > qb->qt_dquot->q_id) + return 1; + if (qa->qt_dquot->q_id < qb->qt_dquot->q_id) + return -1; + return 0; +} + +void +xfs_dqlockn( + struct xfs_dqtrx *q) +{ + unsigned int i; + + BUILD_BUG_ON(XFS_QM_TRANS_MAXDQS > MAX_LOCKDEP_SUBCLASSES); + + /* Sort in order of dquot id, do not allow duplicates */ + for (i = 0; i < XFS_QM_TRANS_MAXDQS && q[i].qt_dquot != NULL; i++) { + unsigned int j; + + for (j = 0; j < i; j++) + ASSERT(q[i].qt_dquot != q[j].qt_dquot); + } + if (i == 0) + return; + + sort(q, i, sizeof(struct xfs_dqtrx), xfs_dqtrx_cmp, NULL); + + mutex_lock(&q[0].qt_dquot->q_qlock); + for (i = 1; i < XFS_QM_TRANS_MAXDQS && q[i].qt_dquot != NULL; i++) + mutex_lock_nested(&q[i].qt_dquot->q_qlock, + XFS_QLOCK_NESTED + i - 1); +} + int __init xfs_qm_init(void) { diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 956272d9b302..677bb2dc9ac9 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -223,6 +223,7 @@ int xfs_qm_dqget_uncached(struct xfs_mount *mp, void xfs_qm_dqput(struct xfs_dquot *dqp); void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); +void xfs_dqlockn(struct xfs_dqtrx *q); void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 7ad0e92c6b5b..78cdc5064a8c 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -62,6 +62,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_ATTR_LEAF_TO_NODE, XFS_RANDOM_WB_DELAY_MS, XFS_RANDOM_WRITE_DELAY_MS, + XFS_RANDOM_EXCHMAPS_FINISH_ONE, }; struct xfs_errortag_attr { @@ -179,6 +180,7 @@ XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS); +XFS_ERRORTAG_ATTR_RW(exchmaps_finish_one, XFS_ERRTAG_EXCHMAPS_FINISH_ONE); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -224,6 +226,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), XFS_ERRORTAG_ATTR_LIST(write_delay_ms), + XFS_ERRORTAG_ATTR_LIST(exchmaps_finish_one), NULL, }; ATTRIBUTE_GROUPS(xfs_errortag); diff --git a/fs/xfs/xfs_exchmaps_item.c b/fs/xfs/xfs_exchmaps_item.c new file mode 100644 index 000000000000..264a121c5e16 --- /dev/null +++ b/fs/xfs/xfs_exchmaps_item.c @@ -0,0 +1,614 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_shared.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_exchmaps_item.h" +#include "xfs_exchmaps.h" +#include "xfs_log.h" +#include "xfs_bmap.h" +#include "xfs_icache.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" +#include "xfs_exchrange.h" +#include "xfs_trace.h" + +struct kmem_cache *xfs_xmi_cache; +struct kmem_cache *xfs_xmd_cache; + +static const struct xfs_item_ops xfs_xmi_item_ops; + +static inline struct xfs_xmi_log_item *XMI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_xmi_log_item, xmi_item); +} + +STATIC void +xfs_xmi_item_free( + struct xfs_xmi_log_item *xmi_lip) +{ + kvfree(xmi_lip->xmi_item.li_lv_shadow); + kmem_cache_free(xfs_xmi_cache, xmi_lip); +} + +/* + * Freeing the XMI requires that we remove it from the AIL if it has already + * been placed there. However, the XMI may not yet have been placed in the AIL + * when called by xfs_xmi_release() from XMD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the XMI. + */ +STATIC void +xfs_xmi_release( + struct xfs_xmi_log_item *xmi_lip) +{ + ASSERT(atomic_read(&xmi_lip->xmi_refcount) > 0); + if (atomic_dec_and_test(&xmi_lip->xmi_refcount)) { + xfs_trans_ail_delete(&xmi_lip->xmi_item, 0); + xfs_xmi_item_free(xmi_lip); + } +} + + +STATIC void +xfs_xmi_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_xmi_log_format); +} + +/* + * This is called to fill in the vector of log iovecs for the given xmi log + * item. We use only 1 iovec, and we point that at the xmi_log_format structure + * embedded in the xmi item. + */ +STATIC void +xfs_xmi_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_xmi_log_item *xmi_lip = XMI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + xmi_lip->xmi_format.xmi_type = XFS_LI_XMI; + xmi_lip->xmi_format.xmi_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_XMI_FORMAT, + &xmi_lip->xmi_format, + sizeof(struct xfs_xmi_log_format)); +} + +/* + * The unpin operation is the last place an XMI is manipulated in the log. It + * is either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the XMI transaction has been successfully committed to make it + * this far. Therefore, we expect whoever committed the XMI to either construct + * and commit the XMD or drop the XMD's reference in the event of error. Simply + * drop the log's XMI reference now that the log is done with it. + */ +STATIC void +xfs_xmi_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_xmi_log_item *xmi_lip = XMI_ITEM(lip); + + xfs_xmi_release(xmi_lip); +} + +/* + * The XMI has been either committed or aborted if the transaction has been + * cancelled. If the transaction was cancelled, an XMD isn't going to be + * constructed and thus we free the XMI here directly. + */ +STATIC void +xfs_xmi_item_release( + struct xfs_log_item *lip) +{ + xfs_xmi_release(XMI_ITEM(lip)); +} + +/* Allocate and initialize an xmi item. 
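Looking back at the xfs_dqlockn() helper added in the xfs_dquot.c hunk above: sorting the dquots by id and then locking them in that order is the classic defense against ABBA deadlocks when several locks must be taken at once. A small, self-contained userspace analogue of the same discipline (pthread mutexes and ids are stand-ins, nothing here is XFS API):

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct obj {
		unsigned int id;
		pthread_mutex_t lock;
	};

	/* Order locks by id, as xfs_dqtrx_cmp() orders dquots by q_id. */
	static int obj_cmp(const void *a, const void *b)
	{
		const struct obj *oa = *(const struct obj *const *)a;
		const struct obj *ob = *(const struct obj *const *)b;

		return (oa->id > ob->id) - (oa->id < ob->id);
	}

	/* Sort, then lock in ascending id order so concurrent callers agree. */
	static void lockn(struct obj **objs, int n)
	{
		qsort(objs, n, sizeof(objs[0]), obj_cmp);
		for (int i = 0; i < n; i++)
			pthread_mutex_lock(&objs[i]->lock);
	}

	int main(void)
	{
		struct obj a = { 7, PTHREAD_MUTEX_INITIALIZER };
		struct obj b = { 3, PTHREAD_MUTEX_INITIALIZER };
		struct obj *set[] = { &a, &b };

		lockn(set, 2);	/* always locks b (id 3) before a (id 7) */
		printf("locked %u then %u\n", set[0]->id, set[1]->id);
		return 0;
	}

The kernel version also passes lockdep subclasses (XFS_QLOCK_NESTED + i - 1) so that taking several locks of the same class in sorted order does not trip the lock validator.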
*/ +STATIC struct xfs_xmi_log_item * +xfs_xmi_init( + struct xfs_mount *mp) + +{ + struct xfs_xmi_log_item *xmi_lip; + + xmi_lip = kmem_cache_zalloc(xfs_xmi_cache, GFP_KERNEL | __GFP_NOFAIL); + + xfs_log_item_init(mp, &xmi_lip->xmi_item, XFS_LI_XMI, &xfs_xmi_item_ops); + xmi_lip->xmi_format.xmi_id = (uintptr_t)(void *)xmi_lip; + atomic_set(&xmi_lip->xmi_refcount, 2); + + return xmi_lip; +} + +static inline struct xfs_xmd_log_item *XMD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_xmd_log_item, xmd_item); +} + +STATIC void +xfs_xmd_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_xmd_log_format); +} + +/* + * This is called to fill in the vector of log iovecs for the given xmd log + * item. We use only 1 iovec, and we point that at the xmd_log_format structure + * embedded in the xmd item. + */ +STATIC void +xfs_xmd_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_xmd_log_item *xmd_lip = XMD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + xmd_lip->xmd_format.xmd_type = XFS_LI_XMD; + xmd_lip->xmd_format.xmd_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_XMD_FORMAT, &xmd_lip->xmd_format, + sizeof(struct xfs_xmd_log_format)); +} + +/* + * The XMD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the XMI and free the + * XMD. + */ +STATIC void +xfs_xmd_item_release( + struct xfs_log_item *lip) +{ + struct xfs_xmd_log_item *xmd_lip = XMD_ITEM(lip); + + xfs_xmi_release(xmd_lip->xmd_intent_log_item); + kvfree(xmd_lip->xmd_item.li_lv_shadow); + kmem_cache_free(xfs_xmd_cache, xmd_lip); +} + +static struct xfs_log_item * +xfs_xmd_item_intent( + struct xfs_log_item *lip) +{ + return &XMD_ITEM(lip)->xmd_intent_log_item->xmi_item; +} + +static const struct xfs_item_ops xfs_xmd_item_ops = { + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, + .iop_size = xfs_xmd_item_size, + .iop_format = xfs_xmd_item_format, + .iop_release = xfs_xmd_item_release, + .iop_intent = xfs_xmd_item_intent, +}; + +/* Log file mapping exchange information in the intent item. 
*/ +STATIC struct xfs_log_item * +xfs_exchmaps_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_xmi_log_item *xmi_lip; + struct xfs_exchmaps_intent *xmi; + struct xfs_xmi_log_format *xlf; + + ASSERT(count == 1); + + xmi = list_first_entry_or_null(items, struct xfs_exchmaps_intent, + xmi_list); + + xmi_lip = xfs_xmi_init(tp->t_mountp); + xlf = &xmi_lip->xmi_format; + + xlf->xmi_inode1 = xmi->xmi_ip1->i_ino; + xlf->xmi_igen1 = VFS_I(xmi->xmi_ip1)->i_generation; + xlf->xmi_inode2 = xmi->xmi_ip2->i_ino; + xlf->xmi_igen2 = VFS_I(xmi->xmi_ip2)->i_generation; + xlf->xmi_startoff1 = xmi->xmi_startoff1; + xlf->xmi_startoff2 = xmi->xmi_startoff2; + xlf->xmi_blockcount = xmi->xmi_blockcount; + xlf->xmi_isize1 = xmi->xmi_isize1; + xlf->xmi_isize2 = xmi->xmi_isize2; + xlf->xmi_flags = xmi->xmi_flags & XFS_EXCHMAPS_LOGGED_FLAGS; + + return &xmi_lip->xmi_item; +} + +STATIC struct xfs_log_item * +xfs_exchmaps_create_done( + struct xfs_trans *tp, + struct xfs_log_item *intent, + unsigned int count) +{ + struct xfs_xmi_log_item *xmi_lip = XMI_ITEM(intent); + struct xfs_xmd_log_item *xmd_lip; + + xmd_lip = kmem_cache_zalloc(xfs_xmd_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(tp->t_mountp, &xmd_lip->xmd_item, XFS_LI_XMD, + &xfs_xmd_item_ops); + xmd_lip->xmd_intent_log_item = xmi_lip; + xmd_lip->xmd_format.xmd_xmi_id = xmi_lip->xmi_format.xmi_id; + + return &xmd_lip->xmd_item; +} + +/* Add this deferred XMI to the transaction. */ +void +xfs_exchmaps_defer_add( + struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi) +{ + trace_xfs_exchmaps_defer(tp->t_mountp, xmi); + + xfs_defer_add(tp, &xmi->xmi_list, &xfs_exchmaps_defer_type); +} + +static inline struct xfs_exchmaps_intent *xmi_entry(const struct list_head *e) +{ + return list_entry(e, struct xfs_exchmaps_intent, xmi_list); +} + +/* Cancel a deferred file mapping exchange. */ +STATIC void +xfs_exchmaps_cancel_item( + struct list_head *item) +{ + struct xfs_exchmaps_intent *xmi = xmi_entry(item); + + kmem_cache_free(xfs_exchmaps_intent_cache, xmi); +} + +/* Process a deferred file mapping exchange. */ +STATIC int +xfs_exchmaps_finish_item( + struct xfs_trans *tp, + struct xfs_log_item *done, + struct list_head *item, + struct xfs_btree_cur **state) +{ + struct xfs_exchmaps_intent *xmi = xmi_entry(item); + int error; + + /* + * Exchange one more mapping between two files. If there's still more + * work to do, we want to requeue ourselves after all other pending + * deferred operations have finished. This includes all of the dfops + * that we queued directly as well as any new ones created in the + * process of finishing the others. Doing so prevents us from queuing + * a large number of XMI log items in kernel memory, which in turn + * prevents us from pinning the tail of the log (while logging those + * new XMI items) until the first XMI items can be processed. + */ + error = xfs_exchmaps_finish_one(tp, xmi); + if (error != -EAGAIN) + xfs_exchmaps_cancel_item(item); + return error; +} + +/* Abort all pending XMIs. */ +STATIC void +xfs_exchmaps_abort_intent( + struct xfs_log_item *intent) +{ + xfs_xmi_release(XMI_ITEM(intent)); +} + +/* Is this recovered XMI ok? 
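The -EAGAIN convention in xfs_exchmaps_finish_item() above deserves a note: returning -EAGAIN from a deferred-op ->finish_item callback tells the dfops machinery to keep the item queued and retry it after the other pending work items (and a transaction roll), rather than spinning on it in place. A toy model of that control flow, with an invented work item:

	#include <errno.h>
	#include <stdio.h>

	/* One unit of deferred work; "remaining" models unexchanged mappings. */
	struct step { int remaining; };

	static int finish_one(struct step *s)
	{
		printf("exchanged one mapping, %d left\n", --s->remaining);
		return s->remaining ? -EAGAIN : 0;	/* more to do? requeue */
	}

	int main(void)
	{
		struct step s = { .remaining = 3 };
		int error;

		do {
			/* ...other pending deferred items would run here,
			 * and the transaction would roll between passes... */
			error = finish_one(&s);
		} while (error == -EAGAIN);
		return 0;
	}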
*/ +static inline bool +xfs_xmi_validate( + struct xfs_mount *mp, + struct xfs_xmi_log_item *xmi_lip) +{ + struct xfs_xmi_log_format *xlf = &xmi_lip->xmi_format; + + if (!xfs_has_exchange_range(mp)) + return false; + + if (xmi_lip->xmi_format.__pad != 0) + return false; + + if (xlf->xmi_flags & ~XFS_EXCHMAPS_LOGGED_FLAGS) + return false; + + if (!xfs_verify_ino(mp, xlf->xmi_inode1) || + !xfs_verify_ino(mp, xlf->xmi_inode2)) + return false; + + if (!xfs_verify_fileext(mp, xlf->xmi_startoff1, xlf->xmi_blockcount)) + return false; + + return xfs_verify_fileext(mp, xlf->xmi_startoff2, xlf->xmi_blockcount); +} + +/* + * Use the recovered log state to create a new request, estimate resource + * requirements, and create a new incore intent state. + */ +STATIC struct xfs_exchmaps_intent * +xfs_xmi_item_recover_intent( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + const struct xfs_xmi_log_format *xlf, + struct xfs_exchmaps_req *req, + struct xfs_inode **ipp1, + struct xfs_inode **ipp2) +{ + struct xfs_inode *ip1, *ip2; + struct xfs_exchmaps_intent *xmi; + int error; + + /* + * Grab both inodes and set IRECOVERY to prevent trimming of post-eof + * mappings and freeing of unlinked inodes until we're totally done + * processing files. The ondisk format of this new log item contains + * file handle information, which is why recovery for other items does + * not check the inode generation number. + */ + error = xlog_recover_iget_handle(mp, xlf->xmi_inode1, xlf->xmi_igen1, + &ip1); + if (error) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, xlf, + sizeof(*xlf)); + return ERR_PTR(error); + } + + error = xlog_recover_iget_handle(mp, xlf->xmi_inode2, xlf->xmi_igen2, + &ip2); + if (error) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, xlf, + sizeof(*xlf)); + goto err_rele1; + } + + req->ip1 = ip1; + req->ip2 = ip2; + req->startoff1 = xlf->xmi_startoff1; + req->startoff2 = xlf->xmi_startoff2; + req->blockcount = xlf->xmi_blockcount; + req->flags = xlf->xmi_flags & XFS_EXCHMAPS_PARAMS; + + xfs_exchrange_ilock(NULL, ip1, ip2); + error = xfs_exchmaps_estimate(req); + xfs_exchrange_iunlock(ip1, ip2); + if (error) + goto err_rele2; + + *ipp1 = ip1; + *ipp2 = ip2; + xmi = xfs_exchmaps_init_intent(req); + xfs_defer_add_item(dfp, &xmi->xmi_list); + return xmi; + +err_rele2: + xfs_irele(ip2); +err_rele1: + xfs_irele(ip1); + req->ip2 = req->ip1 = NULL; + return ERR_PTR(error); +} + +/* Process a file mapping exchange item that was recovered from the log. 
*/ +STATIC int +xfs_exchmaps_recover_work( + struct xfs_defer_pending *dfp, + struct list_head *capture_list) +{ + struct xfs_exchmaps_req req = { .flags = 0 }; + struct xfs_trans_res resv; + struct xfs_exchmaps_intent *xmi; + struct xfs_log_item *lip = dfp->dfp_intent; + struct xfs_xmi_log_item *xmi_lip = XMI_ITEM(lip); + struct xfs_mount *mp = lip->li_log->l_mp; + struct xfs_trans *tp; + struct xfs_inode *ip1, *ip2; + int error = 0; + + if (!xfs_xmi_validate(mp, xmi_lip)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &xmi_lip->xmi_format, + sizeof(xmi_lip->xmi_format)); + return -EFSCORRUPTED; + } + + xmi = xfs_xmi_item_recover_intent(mp, dfp, &xmi_lip->xmi_format, &req, + &ip1, &ip2); + if (IS_ERR(xmi)) + return PTR_ERR(xmi); + + trace_xfs_exchmaps_recover(mp, xmi); + + resv = xlog_recover_resv(&M_RES(mp)->tr_write); + error = xfs_trans_alloc(mp, &resv, req.resblks, 0, 0, &tp); + if (error) + goto err_rele; + + xfs_exchrange_ilock(tp, ip1, ip2); + + xfs_exchmaps_ensure_reflink(tp, xmi); + xfs_exchmaps_upgrade_extent_counts(tp, xmi); + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &xmi_lip->xmi_format, + sizeof(xmi_lip->xmi_format)); + if (error) + goto err_cancel; + + /* + * Commit transaction, which frees the transaction and saves the inodes + * for later replay activities. + */ + error = xfs_defer_ops_capture_and_commit(tp, capture_list); + goto err_unlock; + +err_cancel: + xfs_trans_cancel(tp); +err_unlock: + xfs_exchrange_iunlock(ip1, ip2); +err_rele: + xfs_irele(ip2); + xfs_irele(ip1); + return error; +} + +/* Relog an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_exchmaps_relog_intent( + struct xfs_trans *tp, + struct xfs_log_item *intent, + struct xfs_log_item *done_item) +{ + struct xfs_xmi_log_item *xmi_lip; + struct xfs_xmi_log_format *old_xlf, *new_xlf; + + old_xlf = &XMI_ITEM(intent)->xmi_format; + + xmi_lip = xfs_xmi_init(tp->t_mountp); + new_xlf = &xmi_lip->xmi_format; + + new_xlf->xmi_inode1 = old_xlf->xmi_inode1; + new_xlf->xmi_inode2 = old_xlf->xmi_inode2; + new_xlf->xmi_igen1 = old_xlf->xmi_igen1; + new_xlf->xmi_igen2 = old_xlf->xmi_igen2; + new_xlf->xmi_startoff1 = old_xlf->xmi_startoff1; + new_xlf->xmi_startoff2 = old_xlf->xmi_startoff2; + new_xlf->xmi_blockcount = old_xlf->xmi_blockcount; + new_xlf->xmi_flags = old_xlf->xmi_flags; + new_xlf->xmi_isize1 = old_xlf->xmi_isize1; + new_xlf->xmi_isize2 = old_xlf->xmi_isize2; + + return &xmi_lip->xmi_item; +} + +const struct xfs_defer_op_type xfs_exchmaps_defer_type = { + .name = "exchmaps", + .max_items = 1, + .create_intent = xfs_exchmaps_create_intent, + .abort_intent = xfs_exchmaps_abort_intent, + .create_done = xfs_exchmaps_create_done, + .finish_item = xfs_exchmaps_finish_item, + .cancel_item = xfs_exchmaps_cancel_item, + .recover_work = xfs_exchmaps_recover_work, + .relog_intent = xfs_exchmaps_relog_intent, +}; + +STATIC bool +xfs_xmi_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return XMI_ITEM(lip)->xmi_format.xmi_id == intent_id; +} + +static const struct xfs_item_ops xfs_xmi_item_ops = { + .flags = XFS_ITEM_INTENT, + .iop_size = xfs_xmi_item_size, + .iop_format = xfs_xmi_item_format, + .iop_unpin = xfs_xmi_item_unpin, + .iop_release = xfs_xmi_item_release, + .iop_match = xfs_xmi_item_match, +}; + +/* + * This routine is called to create an in-core file mapping exchange item from + * the xmi format structure which was logged on disk. 
It allocates an in-core + * xmi, copies the exchange information from the format structure into it, and + * adds the xmi to the AIL with the given LSN. + */ +STATIC int +xlog_recover_xmi_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_xmi_log_item *xmi_lip; + struct xfs_xmi_log_format *xmi_formatp; + size_t len; + + len = sizeof(struct xfs_xmi_log_format); + if (item->ri_buf[0].i_len != len) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + + xmi_formatp = item->ri_buf[0].i_addr; + if (xmi_formatp->__pad != 0) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + + xmi_lip = xfs_xmi_init(mp); + memcpy(&xmi_lip->xmi_format, xmi_formatp, len); + + xlog_recover_intent_item(log, &xmi_lip->xmi_item, lsn, + &xfs_exchmaps_defer_type); + return 0; +} + +const struct xlog_recover_item_ops xlog_xmi_item_ops = { + .item_type = XFS_LI_XMI, + .commit_pass2 = xlog_recover_xmi_commit_pass2, +}; + +/* + * This routine is called when an XMD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding XMI if it + * was still in the log. To do this it searches the AIL for the XMI with an id + * equal to that in the XMD format structure. If we find it we drop the XMD + * reference, which removes the XMI from the AIL and frees it. + */ +STATIC int +xlog_recover_xmd_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_xmd_log_format *xmd_formatp; + + xmd_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_xmd_log_format)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + + xlog_recover_release_intent(log, XFS_LI_XMI, xmd_formatp->xmd_xmi_id); + return 0; +} + +const struct xlog_recover_item_ops xlog_xmd_item_ops = { + .item_type = XFS_LI_XMD, + .commit_pass2 = xlog_recover_xmd_commit_pass2, +}; diff --git a/fs/xfs/xfs_exchmaps_item.h b/fs/xfs/xfs_exchmaps_item.h new file mode 100644 index 000000000000..efa368d25d09 --- /dev/null +++ b/fs/xfs/xfs_exchmaps_item.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_EXCHMAPS_ITEM_H__ +#define __XFS_EXCHMAPS_ITEM_H__ + +/* + * The file mapping exchange intent item helps us exchange multiple file + * mappings between two inode forks. It does this by tracking the range of + * file block offsets that still need to be exchanged, and relogs as progress + * happens. + * + * *I items should be recorded in the *first* of a series of rolled + * transactions, and the *D items should be recorded in the same transaction + * that records the associated bmbt updates. + * + * Should the system crash after the commit of the first transaction but + * before the commit of the final transaction in a series, log recovery will + * use the redo information recorded by the intent items to replay the + * rest of the mapping exchanges. + */ + +/* kernel only XMI/XMD definitions */ + +struct xfs_mount; +struct kmem_cache; + +/* + * This is the incore file mapping exchange intent log item. It is used to log + * the fact that we are exchanging mappings between two files. 
It is used in + * conjunction with the incore file mapping exchange done log item described + * below. + * + * These log items follow the same rules as struct xfs_efi_log_item; see the + * comments about that structure (in xfs_extfree_item.h) for more details. + */ +struct xfs_xmi_log_item { + struct xfs_log_item xmi_item; + atomic_t xmi_refcount; + struct xfs_xmi_log_format xmi_format; +}; + +/* + * This is the incore file mapping exchange done log item. It is used to log + * the fact that an exchange mentioned in an earlier xmi item has been + * performed. + */ +struct xfs_xmd_log_item { + struct xfs_log_item xmd_item; + struct xfs_xmi_log_item *xmd_intent_log_item; + struct xfs_xmd_log_format xmd_format; +}; + +extern struct kmem_cache *xfs_xmi_cache; +extern struct kmem_cache *xfs_xmd_cache; + +struct xfs_exchmaps_intent; + +void xfs_exchmaps_defer_add(struct xfs_trans *tp, + struct xfs_exchmaps_intent *xmi); + +#endif /* __XFS_EXCHMAPS_ITEM_H__ */ diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c new file mode 100644 index 000000000000..c8a655c92c92 --- /dev/null +++ b/fs/xfs/xfs_exchrange.c @@ -0,0 +1,804 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_quota.h" +#include "xfs_bmap_util.h" +#include "xfs_reflink.h" +#include "xfs_trace.h" +#include "xfs_exchrange.h" +#include "xfs_exchmaps.h" +#include "xfs_sb.h" +#include "xfs_icache.h" +#include "xfs_log.h" +#include "xfs_rtbitmap.h" +#include <linux/fsnotify.h> + +/* Lock (and optionally join) two inodes for a file range exchange. */ +void +xfs_exchrange_ilock( + struct xfs_trans *tp, + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + if (ip1 != ip2) + xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL, + ip2, XFS_ILOCK_EXCL); + else + xfs_ilock(ip1, XFS_ILOCK_EXCL); + if (tp) { + xfs_trans_ijoin(tp, ip1, 0); + if (ip2 != ip1) + xfs_trans_ijoin(tp, ip2, 0); + } + +} + +/* Unlock two inodes after a file range exchange operation. */ +void +xfs_exchrange_iunlock( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + if (ip2 != ip1) + xfs_iunlock(ip2, XFS_ILOCK_EXCL); + xfs_iunlock(ip1, XFS_ILOCK_EXCL); +} + +/* + * Estimate the resource requirements to exchange file contents between the two + * files. The caller is required to hold the IOLOCK and the MMAPLOCK and to + * have flushed both inodes' pagecache and active direct-ios. + */ +int +xfs_exchrange_estimate( + struct xfs_exchmaps_req *req) +{ + int error; + + xfs_exchrange_ilock(NULL, req->ip1, req->ip2); + error = xfs_exchmaps_estimate(req); + xfs_exchrange_iunlock(req->ip1, req->ip2); + return error; +} + +#define QRETRY_IP1 (0x1) +#define QRETRY_IP2 (0x2) + +/* + * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip + * this if quota enforcement is disabled or if both inodes' dquots are the + * same. The qretry structure must be initialized to zeroes before the first + * call to this function. + */ +STATIC int +xfs_exchrange_reserve_quota( + struct xfs_trans *tp, + const struct xfs_exchmaps_req *req, + unsigned int *qretry) +{ + int64_t ddelta, rdelta; + int ip1_error = 0; + int error; + + /* + * Don't bother with a quota reservation if we're not enforcing them + * or the two inodes have the same dquots. 
+ */ + if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 || + (req->ip1->i_udquot == req->ip2->i_udquot && + req->ip1->i_gdquot == req->ip2->i_gdquot && + req->ip1->i_pdquot == req->ip2->i_pdquot)) + return 0; + + *qretry = 0; + + /* + * For each file, compute the net gain in the number of regular blocks + * that will be mapped into that file and reserve that much quota. The + * quota counts must be able to absorb at least that much space. + */ + ddelta = req->ip2_bcount - req->ip1_bcount; + rdelta = req->ip2_rtbcount - req->ip1_rtbcount; + if (ddelta > 0 || rdelta > 0) { + error = xfs_trans_reserve_quota_nblks(tp, req->ip1, + ddelta > 0 ? ddelta : 0, + rdelta > 0 ? rdelta : 0, + false); + if (error == -EDQUOT || error == -ENOSPC) { + /* + * Save this error and see what happens if we try to + * reserve quota for ip2. Then report both. + */ + *qretry |= QRETRY_IP1; + ip1_error = error; + error = 0; + } + if (error) + return error; + } + if (ddelta < 0 || rdelta < 0) { + error = xfs_trans_reserve_quota_nblks(tp, req->ip2, + ddelta < 0 ? -ddelta : 0, + rdelta < 0 ? -rdelta : 0, + false); + if (error == -EDQUOT || error == -ENOSPC) + *qretry |= QRETRY_IP2; + if (error) + return error; + } + if (ip1_error) + return ip1_error; + + /* + * For each file, forcibly reserve the gross gain in mapped blocks so + * that we don't trip over any quota block reservation assertions. + * We must reserve the gross gain because the quota code subtracts from + * bcount the number of blocks that we unmap; it does not add that + * quantity back to the quota block reservation. + */ + error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount, + req->ip1_rtbcount, true); + if (error) + return error; + + return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount, + req->ip2_rtbcount, true); +} + +/* Exchange the mappings (and hence the contents) of two files' forks. */ +STATIC int +xfs_exchrange_mappings( + const struct xfs_exchrange *fxr, + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + struct xfs_mount *mp = ip1->i_mount; + struct xfs_exchmaps_req req = { + .ip1 = ip1, + .ip2 = ip2, + .startoff1 = XFS_B_TO_FSBT(mp, fxr->file1_offset), + .startoff2 = XFS_B_TO_FSBT(mp, fxr->file2_offset), + .blockcount = XFS_B_TO_FSB(mp, fxr->length), + }; + struct xfs_trans *tp; + unsigned int qretry; + bool retried = false; + int error; + + trace_xfs_exchrange_mappings(fxr, ip1, ip2); + + if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) + req.flags |= XFS_EXCHMAPS_SET_SIZES; + if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN) + req.flags |= XFS_EXCHMAPS_INO1_WRITTEN; + + /* + * Round the request length up to the nearest file allocation unit. + * The prep function already checked that the request offsets and + * length in @fxr are safe to round up. + */ + if (xfs_inode_has_bigrtalloc(ip2)) + req.blockcount = xfs_rtb_roundup_rtx(mp, req.blockcount); + + error = xfs_exchrange_estimate(&req); + if (error) + return error; + +retry: + /* Allocate the transaction, lock the inodes, and join them. */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0, + XFS_TRANS_RES_FDBLKS, &tp); + if (error) + return error; + + xfs_exchrange_ilock(tp, ip1, ip2); + + trace_xfs_exchrange_before(ip2, 2); + trace_xfs_exchrange_before(ip1, 1); + + error = xfs_exchmaps_check_forks(mp, &req); + if (error) + goto out_trans_cancel; + + /* + * Reserve ourselves some quota if any of them are in enforcing mode. 
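A quick worked example of the delta arithmetic in xfs_exchrange_reserve_quota() above: only the net growth of each file needs quota headroom, because the exchange simultaneously removes the old blocks from the same accounting. Block counts here are invented:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* blocks mapped into the exchanged ranges (invented) */
		int64_t ip1_bcount = 100, ip2_bcount = 140;

		/* file1 receives ip2's blocks and gives up its own */
		int64_t ddelta = ip2_bcount - ip1_bcount;

		if (ddelta > 0)
			printf("reserve %lld blocks against file1's quota\n",
			       (long long)ddelta);
		else if (ddelta < 0)
			printf("reserve %lld blocks against file2's quota\n",
			       (long long)-ddelta);
		return 0;
	}

The realtime delta (rdelta) is handled the same way, and the function then force-reserves the gross gain on both sides so that the unmap half of the exchange cannot underflow the block reservation.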
+ * In theory we only need enough to satisfy the change in the number + * of blocks between the two ranges being remapped. + */ + error = xfs_exchrange_reserve_quota(tp, &req, &qretry); + if ((error == -EDQUOT || error == -ENOSPC) && !retried) { + xfs_trans_cancel(tp); + xfs_exchrange_iunlock(ip1, ip2); + if (qretry & QRETRY_IP1) + xfs_blockgc_free_quota(ip1, 0); + if (qretry & QRETRY_IP2) + xfs_blockgc_free_quota(ip2, 0); + retried = true; + goto retry; + } + if (error) + goto out_trans_cancel; + + /* If we got this far on a dry run, all parameters are ok. */ + if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN) + goto out_trans_cancel; + + /* Update the mtime and ctime of both files. */ + if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1) + xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2) + xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + xfs_exchange_mappings(tp, &req); + + /* + * Force the log to persist metadata updates if the caller or the + * administrator requires this. The generic prep function already + * flushed the relevant parts of the page cache. + */ + if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC)) + xfs_trans_set_sync(tp); + + error = xfs_trans_commit(tp); + + trace_xfs_exchrange_after(ip2, 2); + trace_xfs_exchrange_after(ip1, 1); + + if (error) + goto out_unlock; + + /* + * If the caller wanted us to exchange the contents of two complete + * files of unequal length, exchange the incore sizes now. This should + * be safe because we flushed both files' page caches, exchanged all + * the mappings, and updated the ondisk sizes. + */ + if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) { + loff_t temp; + + temp = i_size_read(VFS_I(ip2)); + i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1))); + i_size_write(VFS_I(ip1), temp); + } + +out_unlock: + xfs_exchrange_iunlock(ip1, ip2); + return error; + +out_trans_cancel: + xfs_trans_cancel(tp); + goto out_unlock; +} + +/* + * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE. + * This part deals with struct file objects and byte ranges and does not deal + * with XFS-specific data structures such as xfs_inodes and block ranges. This + * separation may some day facilitate porting to another filesystem. + * + * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in + * file1 with the same number of bytes starting at fxr.file2_offset in file2. + * Implementations must call xfs_exchange_range_prep to prepare the two + * files prior to taking locks; and they must update the inode change and mod + * times of both files as part of the metadata update. The timestamp update + * and freshness checks must be done atomically as part of the data exchange + * operation to ensure correctness of the freshness check. + * xfs_exchange_range_finish must be called after the operation completes + * successfully but before locks are dropped. + */ + +/* Verify that we have security clearance to perform this operation. */ +static int +xfs_exchange_range_verify_area( + struct xfs_exchrange *fxr) +{ + int ret; + + ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length, + true); + if (ret) + return ret; + + return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length, + true); +} + +/* + * Performs necessary checks before doing a range exchange, having stabilized + * mutable inode attributes via i_rwsem. 
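From userspace, the generic flow described above is reached through the new XFS_IOC_EXCHANGE_RANGE ioctl, whose argument collection appears later in this file. A hypothetical caller, assuming headers that carry this series' UAPI definitions (struct xfs_exchange_range and the ioctl number come from xfs_fs.h; paths and sizes are invented):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <xfs/xfs_fs.h>	/* assumed to provide the new UAPI */

	int main(void)
	{
		struct xfs_exchange_range args;
		int fd1 = open("/mnt/a", O_RDWR);	/* invented paths */
		int fd2 = open("/mnt/b", O_RDWR);

		if (fd1 < 0 || fd2 < 0)
			return 1;

		memset(&args, 0, sizeof(args));	/* pad must be zero */
		args.file1_fd = fd1;
		args.file1_offset = 0;
		args.file2_offset = 0;
		args.length = 1024 * 1024;	/* one allocation-aligned MiB */
		args.flags = 0;

		/* file2 is the descriptor the ioctl is issued against */
		if (ioctl(fd2, XFS_IOC_EXCHANGE_RANGE, &args))
			perror("XFS_IOC_EXCHANGE_RANGE");
		return 0;
	}

Both descriptors must be opened read-write on the same XFS mount, per the checks in xfs_exchange_range() below.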
+ */ +static inline int +xfs_exchange_range_checks( + struct xfs_exchrange *fxr, + unsigned int alloc_unit) +{ + struct inode *inode1 = file_inode(fxr->file1); + struct inode *inode2 = file_inode(fxr->file2); + uint64_t allocmask = alloc_unit - 1; + int64_t test_len; + uint64_t blen; + loff_t size1, size2, tmp; + int error; + + /* Don't touch certain kinds of inodes */ + if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2)) + return -EPERM; + if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2)) + return -ETXTBSY; + + size1 = i_size_read(inode1); + size2 = i_size_read(inode2); + + /* Ranges cannot start after EOF. */ + if (fxr->file1_offset > size1 || fxr->file2_offset > size2) + return -EINVAL; + + /* + * If the caller said to exchange to EOF, we set the length of the + * request large enough to cover everything to the end of both files. + */ + if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) { + fxr->length = max_t(int64_t, size1 - fxr->file1_offset, + size2 - fxr->file2_offset); + + error = xfs_exchange_range_verify_area(fxr); + if (error) + return error; + } + + /* + * The start of both ranges must be aligned to the file allocation + * unit. + */ + if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) || + !IS_ALIGNED(fxr->file2_offset, alloc_unit)) + return -EINVAL; + + /* Ensure offsets don't wrap. */ + if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) || + check_add_overflow(fxr->file2_offset, fxr->length, &tmp)) + return -EINVAL; + + /* + * We require both ranges to end within EOF, unless we're exchanging + * to EOF. + */ + if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) && + (fxr->file1_offset + fxr->length > size1 || + fxr->file2_offset + fxr->length > size2)) + return -EINVAL; + + /* + * Make sure we don't hit any file size limits. If we hit any size + * limits such that test_length was adjusted, we abort the whole + * operation. + */ + test_len = fxr->length; + error = generic_write_check_limits(fxr->file2, fxr->file2_offset, + &test_len); + if (error) + return error; + error = generic_write_check_limits(fxr->file1, fxr->file1_offset, + &test_len); + if (error) + return error; + if (test_len != fxr->length) + return -EINVAL; + + /* + * If the user wanted us to exchange up to the infile's EOF, round up + * to the next allocation unit boundary for this check. Do the same + * for the outfile. + * + * Otherwise, reject the range length if it's not aligned to an + * allocation unit. + */ + if (fxr->file1_offset + fxr->length == size1) + blen = ALIGN(size1, alloc_unit) - fxr->file1_offset; + else if (fxr->file2_offset + fxr->length == size2) + blen = ALIGN(size2, alloc_unit) - fxr->file2_offset; + else if (!IS_ALIGNED(fxr->length, alloc_unit)) + return -EINVAL; + else + blen = fxr->length; + + /* Don't allow overlapped exchanges within the same file. */ + if (inode1 == inode2 && + fxr->file2_offset + blen > fxr->file1_offset && + fxr->file1_offset + blen > fxr->file2_offset) + return -EINVAL; + + /* + * Ensure that we don't exchange a partial EOF block into the middle of + * another file. + */ + if ((fxr->length & allocmask) == 0) + return 0; + + blen = fxr->length; + if (fxr->file2_offset + blen < size2) + blen &= ~allocmask; + + if (fxr->file1_offset + blen < size1) + blen &= ~allocmask; + + return blen == fxr->length ? 0 : -EINVAL; +} + +/* + * Check that the two inodes are eligible for range exchanges, the ranges make + * sense, and then flush all dirty data. Caller must ensure that the inodes + * have been locked against any other modifications. 
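The trailing-block rule at the end of xfs_exchange_range_checks() above is subtle enough to merit a worked model: an unaligned tail is tolerated only when, for each file, the tail does not land in that file's interior. A compilable restatement of that check (power-of-two units only, as in the fast path; sizes invented):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	static bool tail_ok(uint64_t off1, uint64_t off2, uint64_t len,
			    uint64_t size1, uint64_t size2, uint64_t alloc_unit)
	{
		uint64_t mask = alloc_unit - 1;
		uint64_t blen = len;

		if ((len & mask) == 0)
			return true;	/* fully aligned, nothing to check */

		if (off2 + blen < size2)	/* tail interior to file2 */
			blen &= ~mask;
		if (off1 + blen < size1)	/* tail interior to file1 */
			blen &= ~mask;

		return blen == len;
	}

	int main(void)
	{
		/* 4k units; a 6k exchange ending at both EOFs is allowed... */
		printf("%d\n", tail_ok(0, 0, 6144, 6144, 6144, 4096));	/* 1 */
		/* ...but not when the 2k tail lands inside a larger file2. */
		printf("%d\n", tail_ok(0, 0, 6144, 6144, 20480, 4096));	/* 0 */
		return 0;
	}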
+ */ +static inline int +xfs_exchange_range_prep( + struct xfs_exchrange *fxr, + unsigned int alloc_unit) +{ + struct inode *inode1 = file_inode(fxr->file1); + struct inode *inode2 = file_inode(fxr->file2); + bool same_inode = (inode1 == inode2); + int error; + + /* Check that we don't violate system file offset limits. */ + error = xfs_exchange_range_checks(fxr, alloc_unit); + if (error || fxr->length == 0) + return error; + + /* Wait for the completion of any pending IOs on both files */ + inode_dio_wait(inode1); + if (!same_inode) + inode_dio_wait(inode2); + + error = filemap_write_and_wait_range(inode1->i_mapping, + fxr->file1_offset, + fxr->file1_offset + fxr->length - 1); + if (error) + return error; + + error = filemap_write_and_wait_range(inode2->i_mapping, + fxr->file2_offset, + fxr->file2_offset + fxr->length - 1); + if (error) + return error; + + /* + * If the files or inodes involved require synchronous writes, amend + * the request to force the filesystem to flush all data and metadata + * to disk after the operation completes. + */ + if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) || + IS_SYNC(inode1) || IS_SYNC(inode2)) + fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC; + + return 0; +} + +/* + * Finish a range exchange operation, if it was successful. Caller must ensure + * that the inodes are still locked against any other modifications. + */ +static inline int +xfs_exchange_range_finish( + struct xfs_exchrange *fxr) +{ + int error; + + error = file_remove_privs(fxr->file1); + if (error) + return error; + if (file_inode(fxr->file1) == file_inode(fxr->file2)) + return 0; + + return file_remove_privs(fxr->file2); +} + +/* + * Check the alignment of an exchange request when the allocation unit size + * isn't a power of two. The generic file-level helpers use (fast) + * bitmask-based alignment checks, but here we have to use slow long division. + */ +static int +xfs_exchrange_check_rtalign( + const struct xfs_exchrange *fxr, + struct xfs_inode *ip1, + struct xfs_inode *ip2, + unsigned int alloc_unit) +{ + uint64_t length = fxr->length; + uint64_t blen; + loff_t size1, size2; + + size1 = i_size_read(VFS_I(ip1)); + size2 = i_size_read(VFS_I(ip2)); + + /* The start of both ranges must be aligned to a rt extent. */ + if (!isaligned_64(fxr->file1_offset, alloc_unit) || + !isaligned_64(fxr->file2_offset, alloc_unit)) + return -EINVAL; + + if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) + length = max_t(int64_t, size1 - fxr->file1_offset, + size2 - fxr->file2_offset); + + /* + * If the user wanted us to exchange up to the infile's EOF, round up + * to the next rt extent boundary for this check. Do the same for the + * outfile. + * + * Otherwise, reject the range length if it's not rt extent aligned. + * We already confirmed the starting offsets' rt extent block + * alignment. + */ + if (fxr->file1_offset + length == size1) + blen = roundup_64(size1, alloc_unit) - fxr->file1_offset; + else if (fxr->file2_offset + length == size2) + blen = roundup_64(size2, alloc_unit) - fxr->file2_offset; + else if (!isaligned_64(length, alloc_unit)) + return -EINVAL; + else + blen = length; + + /* Don't allow overlapped exchanges within the same file. */ + if (ip1 == ip2 && + fxr->file2_offset + blen > fxr->file1_offset && + fxr->file1_offset + blen > fxr->file2_offset) + return -EINVAL; + + /* + * Ensure that we don't exchange a partial EOF rt extent into the + * middle of another file. 
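The reason xfs_exchrange_check_rtalign(), introduced above, exists at all: mask-based alignment tests are only valid for power-of-two units, and a realtime extent size need not be one. A short demonstration of how the mask test goes wrong for a three-block realtime extent (sizes invented; the kernel's isaligned_64() is the division-based form):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Division-based alignment check, valid for any unit size. */
	static bool aligned64(uint64_t x, uint32_t unit)
	{
		return x % unit == 0;
	}

	int main(void)
	{
		uint32_t rtext_bytes = 3 * 4096;	/* invented 3-block rt extent */

		/* 24576 bytes is exactly two rt extents, so it is aligned. */
		printf("%d\n", aligned64(24576, rtext_bytes));		/* 1 */
		/* The mask test wrongly reports the same value as unaligned. */
		printf("%d\n", (24576 & (rtext_bytes - 1)) == 0);	/* 0 */
		return 0;
	}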
+ */ + if (isaligned_64(length, alloc_unit)) + return 0; + + blen = length; + if (fxr->file2_offset + length < size2) + blen = rounddown_64(blen, alloc_unit); + + if (fxr->file1_offset + blen < size1) + blen = rounddown_64(blen, alloc_unit); + + return blen == length ? 0 : -EINVAL; +} + +/* Prepare two files to have their data exchanged. */ +STATIC int +xfs_exchrange_prep( + struct xfs_exchrange *fxr, + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + struct xfs_mount *mp = ip2->i_mount; + unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip2); + int error; + + trace_xfs_exchrange_prep(fxr, ip1, ip2); + + /* Verify both files are either real-time or non-realtime */ + if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2)) + return -EINVAL; + + /* Check non-power of two alignment issues, if necessary. */ + if (!is_power_of_2(alloc_unit)) { + error = xfs_exchrange_check_rtalign(fxr, ip1, ip2, alloc_unit); + if (error) + return error; + + /* + * Do the generic file-level checks with the regular block + * alignment. + */ + alloc_unit = mp->m_sb.sb_blocksize; + } + + error = xfs_exchange_range_prep(fxr, alloc_unit); + if (error || fxr->length == 0) + return error; + + /* Attach dquots to both inodes before changing block maps. */ + error = xfs_qm_dqattach(ip2); + if (error) + return error; + error = xfs_qm_dqattach(ip1); + if (error) + return error; + + trace_xfs_exchrange_flush(fxr, ip1, ip2); + + /* Flush the relevant ranges of both files. */ + error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length); + if (error) + return error; + error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length); + if (error) + return error; + + /* + * Cancel CoW fork preallocations for the ranges of both files. The + * prep function should have flushed all the dirty data, so the only + * CoW mappings remaining should be speculative. + */ + if (xfs_inode_has_cow_data(ip1)) { + error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset, + fxr->length, true); + if (error) + return error; + } + + if (xfs_inode_has_cow_data(ip2)) { + error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset, + fxr->length, true); + if (error) + return error; + } + + return 0; +} + +/* + * Exchange contents of files. This is the binding between the generic + * file-level concepts and the XFS inode-specific implementation. + */ +STATIC int +xfs_exchrange_contents( + struct xfs_exchrange *fxr) +{ + struct inode *inode1 = file_inode(fxr->file1); + struct inode *inode2 = file_inode(fxr->file2); + struct xfs_inode *ip1 = XFS_I(inode1); + struct xfs_inode *ip2 = XFS_I(inode2); + struct xfs_mount *mp = ip1->i_mount; + int error; + + if (!xfs_has_exchange_range(mp)) + return -EOPNOTSUPP; + + if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS | + XFS_EXCHANGE_RANGE_PRIV_FLAGS)) + return -EINVAL; + + if (xfs_is_shutdown(mp)) + return -EIO; + + /* Lock both files against IO */ + error = xfs_ilock2_io_mmap(ip1, ip2); + if (error) + goto out_err; + + /* Prepare and then exchange file contents. */ + error = xfs_exchrange_prep(fxr, ip1, ip2); + if (error) + goto out_unlock; + + error = xfs_exchrange_mappings(fxr, ip1, ip2); + if (error) + goto out_unlock; + + /* + * Finish the exchange by removing special file privileges like any + * other file write would do. This may involve turning on support for + * logged xattrs if either file has security capabilities. 
+ */ + error = xfs_exchange_range_finish(fxr); + if (error) + goto out_unlock; + +out_unlock: + xfs_iunlock2_io_mmap(ip1, ip2); +out_err: + if (error) + trace_xfs_exchrange_error(ip2, error, _RET_IP_); + return error; +} + +/* Exchange parts of two files. */ +static int +xfs_exchange_range( + struct xfs_exchrange *fxr) +{ + struct inode *inode1 = file_inode(fxr->file1); + struct inode *inode2 = file_inode(fxr->file2); + int ret; + + BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS & + XFS_EXCHANGE_RANGE_PRIV_FLAGS); + + /* Both files must be on the same mount/filesystem. */ + if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt) + return -EXDEV; + + if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS) + return -EINVAL; + + /* Userspace requests only honored for regular files. */ + if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode)) + return -EISDIR; + if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode)) + return -EINVAL; + + /* Both files must be opened for read and write. */ + if (!(fxr->file1->f_mode & FMODE_READ) || + !(fxr->file1->f_mode & FMODE_WRITE) || + !(fxr->file2->f_mode & FMODE_READ) || + !(fxr->file2->f_mode & FMODE_WRITE)) + return -EBADF; + + /* Neither file can be opened append-only. */ + if ((fxr->file1->f_flags & O_APPEND) || + (fxr->file2->f_flags & O_APPEND)) + return -EBADF; + + /* + * If we're not exchanging to EOF, we can check the areas before + * stabilizing both files' i_size. + */ + if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) { + ret = xfs_exchange_range_verify_area(fxr); + if (ret) + return ret; + } + + /* Update cmtime if the fd/inode don't forbid it. */ + if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1)) + fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1; + if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2)) + fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2; + + file_start_write(fxr->file2); + ret = xfs_exchrange_contents(fxr); + file_end_write(fxr->file2); + if (ret) + return ret; + + fsnotify_modify(fxr->file1); + if (fxr->file2 != fxr->file1) + fsnotify_modify(fxr->file2); + return 0; +} + +/* Collect exchange-range arguments from userspace. */ +long +xfs_ioc_exchange_range( + struct file *file, + struct xfs_exchange_range __user *argp) +{ + struct xfs_exchrange fxr = { + .file2 = file, + }; + struct xfs_exchange_range args; + struct fd file1; + int error; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + if (memchr_inv(&args.pad, 0, sizeof(args.pad))) + return -EINVAL; + if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS) + return -EINVAL; + + fxr.file1_offset = args.file1_offset; + fxr.file2_offset = args.file2_offset; + fxr.length = args.length; + fxr.flags = args.flags; + + file1 = fdget(args.file1_fd); + if (!file1.file) + return -EBADF; + fxr.file1 = file1.file; + + error = xfs_exchange_range(&fxr); + fdput(file1); + return error; +} diff --git a/fs/xfs/xfs_exchrange.h b/fs/xfs/xfs_exchrange.h new file mode 100644 index 000000000000..039abcca546e --- /dev/null +++ b/fs/xfs/xfs_exchrange.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_EXCHRANGE_H__ +#define __XFS_EXCHRANGE_H__ + +/* Update the mtime/cmtime of file1 and file2 */ +#define __XFS_EXCHANGE_RANGE_UPD_CMTIME1 (1ULL << 63) +#define __XFS_EXCHANGE_RANGE_UPD_CMTIME2 (1ULL << 62) + +#define XFS_EXCHANGE_RANGE_PRIV_FLAGS (__XFS_EXCHANGE_RANGE_UPD_CMTIME1 | \ + __XFS_EXCHANGE_RANGE_UPD_CMTIME2) + +struct xfs_exchrange { + struct file *file1; + struct file *file2; + + loff_t file1_offset; + loff_t file2_offset; + u64 length; + + u64 flags; /* XFS_EXCHANGE_RANGE flags */ +}; + +long xfs_ioc_exchange_range(struct file *file, + struct xfs_exchange_range __user *argp); + +struct xfs_exchmaps_req; + +void xfs_exchrange_ilock(struct xfs_trans *tp, struct xfs_inode *ip1, + struct xfs_inode *ip2); +void xfs_exchrange_iunlock(struct xfs_inode *ip1, struct xfs_inode *ip2); + +int xfs_exchrange_estimate(struct xfs_exchmaps_req *req); + +#endif /* __XFS_EXCHRANGE_H__ */ diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 7cd09c3a82cb..201489d3de08 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -102,7 +102,7 @@ xfs_fs_encode_fh( return fileid_type; } -STATIC struct inode * +struct inode * xfs_nfs_get_inode( struct super_block *sb, u64 ino, @@ -160,7 +160,7 @@ xfs_nfs_get_inode( } } - if (VFS_I(ip)->i_generation != generation) { + if (VFS_I(ip)->i_generation != generation || IS_PRIVATE(VFS_I(ip))) { xfs_irele(ip); return ERR_PTR(-ESTALE); } diff --git a/fs/xfs/xfs_export.h b/fs/xfs/xfs_export.h index 64471a3ddb04..3cd85e8901a5 100644 --- a/fs/xfs/xfs_export.h +++ b/fs/xfs/xfs_export.h @@ -57,4 +57,6 @@ struct xfs_fid64 { /* This flag goes on the wire. Don't play with it. */ #define XFS_FILEID_TYPE_64FLAG 0x80 /* NFS fileid has 64bit inodes */ +struct inode *xfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 gen); + #endif /* __XFS_EXPORT_H__ */ diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 56cfa1498571..a73e7c73b664 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -518,35 +518,26 @@ fail: goto out; } -STATIC void +static bool xfs_extent_busy_clear_one( - struct xfs_mount *mp, struct xfs_perag *pag, - struct xfs_extent_busy *busyp) + struct xfs_extent_busy *busyp, + bool do_discard) { if (busyp->length) { - trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno, - busyp->length); + if (do_discard && + !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) { + busyp->flags = XFS_EXTENT_BUSY_DISCARDED; + return false; + } + trace_xfs_extent_busy_clear(pag->pag_mount, busyp->agno, + busyp->bno, busyp->length); rb_erase(&busyp->rb_node, &pag->pagb_tree); } list_del_init(&busyp->list); kfree(busyp); -} - -static void -xfs_extent_busy_put_pag( - struct xfs_perag *pag, - bool wakeup) - __releases(pag->pagb_lock) -{ - if (wakeup) { - pag->pagb_gen++; - wake_up_all(&pag->pagb_wait); - } - - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); + return true; } /* @@ -560,32 +551,33 @@ xfs_extent_busy_clear( struct list_head *list, bool do_discard) { - struct xfs_extent_busy *busyp, *n; - struct xfs_perag *pag = NULL; - xfs_agnumber_t agno = NULLAGNUMBER; - bool wakeup = false; - - list_for_each_entry_safe(busyp, n, list, list) { - if (busyp->agno != agno) { - if (pag) - xfs_extent_busy_put_pag(pag, wakeup); - agno = busyp->agno; - pag = xfs_perag_get(mp, agno); - spin_lock(&pag->pagb_lock); - wakeup = false; - } + struct xfs_extent_busy *busyp, *next; - if (do_discard && busyp->length && - !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) { - busyp->flags = 
XFS_EXTENT_BUSY_DISCARDED; - } else { - xfs_extent_busy_clear_one(mp, pag, busyp); - wakeup = true; - } - } + busyp = list_first_entry_or_null(list, typeof(*busyp), list); + if (!busyp) + return; - if (pag) - xfs_extent_busy_put_pag(pag, wakeup); + do { + bool wakeup = false; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + do { + next = list_next_entry(busyp, list); + if (xfs_extent_busy_clear_one(pag, busyp, do_discard)) + wakeup = true; + busyp = next; + } while (!list_entry_is_head(busyp, list, list) && + busyp->agno == pag->pag_agno); + + if (wakeup) { + pag->pagb_gen++; + wake_up_all(&pag->pagb_wait); + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } while (!list_entry_is_head(busyp, list, list)); } /* diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 2ce302b4885f..b240ea5241dc 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -24,6 +24,7 @@ #include "xfs_pnfs.h" #include "xfs_iomap.h" #include "xfs_reflink.h" +#include "xfs_file.h" #include <linux/dax.h> #include <linux/falloc.h> @@ -38,33 +39,19 @@ static const struct vm_operations_struct xfs_file_vm_ops; * Decide if the given file range is aligned to the size of the fundamental * allocation unit for the file. */ -static bool +bool xfs_is_falloc_aligned( struct xfs_inode *ip, loff_t pos, long long int len) { - struct xfs_mount *mp = ip->i_mount; - uint64_t mask; - - if (XFS_IS_REALTIME_INODE(ip)) { - if (!is_power_of_2(mp->m_sb.sb_rextsize)) { - u64 rextbytes; - u32 mod; - - rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); - div_u64_rem(pos, rextbytes, &mod); - if (mod) - return false; - div_u64_rem(len, rextbytes, &mod); - return mod == 0; - } - mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1; - } else { - mask = mp->m_sb.sb_blocksize - 1; - } + unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip); + + if (!is_power_of_2(alloc_unit)) + return isaligned_64(pos, alloc_unit) && + isaligned_64(len, alloc_unit); - return !((pos | len) & mask); + return !((pos | len) & (alloc_unit - 1)); } /* @@ -861,67 +848,6 @@ xfs_file_write_iter( return xfs_file_buffered_write(iocb, from); } -static void -xfs_wait_dax_page( - struct inode *inode) -{ - struct xfs_inode *ip = XFS_I(inode); - - xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); - schedule(); - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); -} - -int -xfs_break_dax_layouts( - struct inode *inode, - bool *retry) -{ - struct page *page; - - xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL); - - page = dax_layout_busy_page(inode->i_mapping); - if (!page) - return 0; - - *retry = true; - return ___wait_var_event(&page->_refcount, - atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, - 0, 0, xfs_wait_dax_page(inode)); -} - -int -xfs_break_layouts( - struct inode *inode, - uint *iolock, - enum layout_break_reason reason) -{ - bool retry; - int error; - - xfs_assert_ilocked(XFS_I(inode), XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL); - - do { - retry = false; - switch (reason) { - case BREAK_UNMAP: - error = xfs_break_dax_layouts(inode, &retry); - if (error || retry) - break; - fallthrough; - case BREAK_WRITE: - error = xfs_break_leased_layouts(inode, iolock, &retry); - break; - default: - WARN_ON_ONCE(1); - error = -EINVAL; - } - } while (error == 0 && retry); - - return error; -} - /* Does this file, inode, or mount want synchronous writes? 
*/ static inline bool xfs_file_sync_writes(struct file *filp) { diff --git a/fs/xfs/xfs_file.h b/fs/xfs/xfs_file.h new file mode 100644 index 000000000000..2ad91f755caf --- /dev/null +++ b/fs/xfs/xfs_file.h @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#ifndef __XFS_FILE_H__ +#define __XFS_FILE_H__ + +extern const struct file_operations xfs_file_operations; +extern const struct file_operations xfs_dir_file_operations; + +bool xfs_is_falloc_aligned(struct xfs_inode *ip, loff_t pos, + long long int len); + +#endif /* __XFS_FILE_H__ */ diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index de59eec74765..85dbb46452ca 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -533,7 +533,7 @@ xfs_getfsmap_rtdev_rtbitmap( trace_xfs_fsmap_low_key_linear(mp, info->dev, start_rtb); trace_xfs_fsmap_high_key_linear(mp, info->dev, end_rtb); - xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP); /* * Set up query parameters to return free rtextents covering the range @@ -557,7 +557,7 @@ xfs_getfsmap_rtdev_rtbitmap( if (error) goto err; err: - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); return error; } #endif /* CONFIG_XFS_RT */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 83f708f62ed9..c211ea2b63c4 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -213,10 +213,8 @@ xfs_growfs_data_private( struct xfs_perag *pag; pag = xfs_perag_get(mp, id.agno); - error = xfs_ag_resv_free(pag); + xfs_ag_resv_free(pag); xfs_perag_put(pag); - if (error) - return error; } /* * Reserve AG metadata blocks. ENOSPC here does not mean there @@ -385,14 +383,14 @@ xfs_reserve_blocks( */ if (mp->m_resblks > request) { lcounter = mp->m_resblks_avail - request; - if (lcounter > 0) { /* release unused blocks */ + if (lcounter > 0) { /* release unused blocks */ fdblks_delta = lcounter; mp->m_resblks_avail -= lcounter; } mp->m_resblks = request; if (fdblks_delta) { spin_unlock(&mp->m_sb_lock); - error = xfs_mod_fdblocks(mp, fdblks_delta, 0); + xfs_add_fdblocks(mp, fdblks_delta); spin_lock(&mp->m_sb_lock); } @@ -428,9 +426,9 @@ xfs_reserve_blocks( */ fdblks_delta = min(free, delta); spin_unlock(&mp->m_sb_lock); - error = xfs_mod_fdblocks(mp, -fdblks_delta, 0); + error = xfs_dec_fdblocks(mp, fdblks_delta, 0); if (!error) - xfs_mod_fdblocks(mp, fdblks_delta, 0); + xfs_add_fdblocks(mp, fdblks_delta); spin_lock(&mp->m_sb_lock); } out: @@ -556,24 +554,13 @@ xfs_fs_reserve_ag_blocks( /* * Free space reserved for per-AG metadata. 
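[Aside: the xfs_reserve_blocks() hunk above switches from the signed xfs_mod_fdblocks() to the split xfs_add_fdblocks()/xfs_dec_fdblocks() pair: the add side is called without error checking, while the decrement side can still fail with ENOSPC. A toy counter sketch of that contract; the names below are illustrative, not the kernel's percpu counter implementation:]

#include <errno.h>
#include <stdint.h>

struct counter { uint64_t free; };

static void counter_add(struct counter *c, uint64_t delta)
{
	c->free += delta;	/* returning blocks always succeeds */
}

static int counter_dec(struct counter *c, uint64_t delta)
{
	if (c->free < delta)
		return -ENOSPC;	/* caller decides whether to retry */
	c->free -= delta;
	return 0;
}

[Call sites then only need error handling on the decrement path, which is exactly how the retry logic in xfs_reserve_blocks() reads after this change.]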
*/ -int +void xfs_fs_unreserve_ag_blocks( struct xfs_mount *mp) { xfs_agnumber_t agno; struct xfs_perag *pag; - int error = 0; - int err2; - for_each_perag(mp, agno, pag) { - err2 = xfs_ag_resv_free(pag); - if (err2 && !error) - error = err2; - } - - if (error) - xfs_warn(mp, - "Error %d freeing per-AG metadata reserve pool.", error); - - return error; + for_each_perag(mp, agno, pag) + xfs_ag_resv_free(pag); } diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 44457b0a0593..3e2f73bcf831 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -12,6 +12,6 @@ int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request); int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags); int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); -int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); +void xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c new file mode 100644 index 000000000000..c8785ed59543 --- /dev/null +++ b/fs/xfs/xfs_handle.c @@ -0,0 +1,952 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2022-2024 Oracle. + * All rights reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_ioctl.h" +#include "xfs_parent.h" +#include "xfs_da_btree.h" +#include "xfs_handle.h" +#include "xfs_health.h" +#include "xfs_icache.h" +#include "xfs_export.h" +#include "xfs_xattr.h" +#include "xfs_acl.h" + +#include <linux/namei.h> + +static inline size_t +xfs_filehandle_fid_len(void) +{ + struct xfs_handle *handle = NULL; + + return sizeof(struct xfs_fid) - sizeof(handle->ha_fid.fid_len); +} + +static inline size_t +xfs_filehandle_init( + struct xfs_mount *mp, + xfs_ino_t ino, + uint32_t gen, + struct xfs_handle *handle) +{ + memcpy(&handle->ha_fsid, mp->m_fixedfsid, sizeof(struct xfs_fsid)); + + handle->ha_fid.fid_len = xfs_filehandle_fid_len(); + handle->ha_fid.fid_pad = 0; + handle->ha_fid.fid_gen = gen; + handle->ha_fid.fid_ino = ino; + + return sizeof(struct xfs_handle); +} + +static inline size_t +xfs_fshandle_init( + struct xfs_mount *mp, + struct xfs_handle *handle) +{ + memcpy(&handle->ha_fsid, mp->m_fixedfsid, sizeof(struct xfs_fsid)); + memset(&handle->ha_fid, 0, sizeof(handle->ha_fid)); + + return sizeof(struct xfs_fsid); +} + +/* + * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to + * a file or fs handle. 
+ * + * XFS_IOC_PATH_TO_FSHANDLE + * returns fs handle for a mount point or path within that mount point + * XFS_IOC_FD_TO_HANDLE + * returns full handle for a FD opened in user space + * XFS_IOC_PATH_TO_HANDLE + * returns full handle for a path + */ +int +xfs_find_handle( + unsigned int cmd, + xfs_fsop_handlereq_t *hreq) +{ + int hsize; + xfs_handle_t handle; + struct inode *inode; + struct fd f = {NULL}; + struct path path; + int error; + struct xfs_inode *ip; + + if (cmd == XFS_IOC_FD_TO_HANDLE) { + f = fdget(hreq->fd); + if (!f.file) + return -EBADF; + inode = file_inode(f.file); + } else { + error = user_path_at(AT_FDCWD, hreq->path, 0, &path); + if (error) + return error; + inode = d_inode(path.dentry); + } + ip = XFS_I(inode); + + /* + * We can only generate handles for inodes residing on a XFS filesystem, + * and only for regular files, directories or symbolic links. + */ + error = -EINVAL; + if (inode->i_sb->s_magic != XFS_SB_MAGIC) + goto out_put; + + error = -EBADF; + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + goto out_put; + + + memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); + + if (cmd == XFS_IOC_PATH_TO_FSHANDLE) + hsize = xfs_fshandle_init(ip->i_mount, &handle); + else + hsize = xfs_filehandle_init(ip->i_mount, ip->i_ino, + inode->i_generation, &handle); + + error = -EFAULT; + if (copy_to_user(hreq->ohandle, &handle, hsize) || + copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) + goto out_put; + + error = 0; + + out_put: + if (cmd == XFS_IOC_FD_TO_HANDLE) + fdput(f); + else + path_put(&path); + return error; +} + +/* + * No need to do permission checks on the various pathname components + * as the handle operations are privileged. + */ +STATIC int +xfs_handle_acceptable( + void *context, + struct dentry *dentry) +{ + return 1; +} + +/* Convert handle already copied to kernel space into a dentry. */ +static struct dentry * +xfs_khandle_to_dentry( + struct file *file, + struct xfs_handle *handle) +{ + struct xfs_fid64 fid = { + .ino = handle->ha_fid.fid_ino, + .gen = handle->ha_fid.fid_gen, + }; + + /* + * Only allow handle opens under a directory. + */ + if (!S_ISDIR(file_inode(file)->i_mode)) + return ERR_PTR(-ENOTDIR); + + if (handle->ha_fid.fid_len != xfs_filehandle_fid_len()) + return ERR_PTR(-EINVAL); + + return exportfs_decode_fh(file->f_path.mnt, (struct fid *)&fid, 3, + FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG, + xfs_handle_acceptable, NULL); +} + +/* Convert handle already copied to kernel space into an xfs_inode. */ +static struct xfs_inode * +xfs_khandle_to_inode( + struct file *file, + struct xfs_handle *handle) +{ + struct xfs_inode *ip = XFS_I(file_inode(file)); + struct xfs_mount *mp = ip->i_mount; + struct inode *inode; + + if (!S_ISDIR(VFS_I(ip)->i_mode)) + return ERR_PTR(-ENOTDIR); + + if (handle->ha_fid.fid_len != xfs_filehandle_fid_len()) + return ERR_PTR(-EINVAL); + + inode = xfs_nfs_get_inode(mp->m_super, handle->ha_fid.fid_ino, + handle->ha_fid.fid_gen); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return XFS_I(inode); +} + +/* + * Convert userspace handle data into a dentry. 
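[Aside: for context on how these handle ioctls fit together from userspace, here is a hedged sketch of the path -> handle -> fd round trip. It assumes the uapi definitions from xfs_fs.h, trims error handling, and notes that XFS_IOC_OPEN_BY_HANDLE is root-only (CAP_SYS_ADMIN) and must be issued on a directory, as the code in this file enforces:]

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>		/* xfs_fsop_handlereq_t, XFS_IOC_* */

/* mnt_fd: an open directory on the XFS mount. */
int open_via_handle(int mnt_fd, const char *path)
{
	char hbuf[64];
	__u32 hlen = 0;
	xfs_fsop_handlereq_t hreq;

	memset(&hreq, 0, sizeof(hreq));
	hreq.path = (void *)path;
	hreq.ohandle = hbuf;
	hreq.ohandlen = &hlen;
	if (ioctl(mnt_fd, XFS_IOC_PATH_TO_HANDLE, &hreq) < 0)
		return -1;

	memset(&hreq, 0, sizeof(hreq));
	hreq.ihandle = hbuf;
	hreq.ihandlen = hlen;
	hreq.oflags = O_RDONLY;
	return ioctl(mnt_fd, XFS_IOC_OPEN_BY_HANDLE, &hreq); /* new fd or -1 */
}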
+ */ +struct dentry * +xfs_handle_to_dentry( + struct file *parfilp, + void __user *uhandle, + u32 hlen) +{ + xfs_handle_t handle; + + if (hlen != sizeof(xfs_handle_t)) + return ERR_PTR(-EINVAL); + if (copy_from_user(&handle, uhandle, hlen)) + return ERR_PTR(-EFAULT); + + return xfs_khandle_to_dentry(parfilp, &handle); +} + +STATIC struct dentry * +xfs_handlereq_to_dentry( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen); +} + +int +xfs_open_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + const struct cred *cred = current_cred(); + int error; + int fd; + int permflag; + struct file *filp; + struct inode *inode; + struct dentry *dentry; + fmode_t fmode; + struct path path; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + dentry = xfs_handlereq_to_dentry(parfilp, hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + inode = d_inode(dentry); + + /* Restrict xfs_open_by_handle to directories & regular files. */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { + error = -EPERM; + goto out_dput; + } + +#if BITS_PER_LONG != 32 + hreq->oflags |= O_LARGEFILE; +#endif + + permflag = hreq->oflags; + fmode = OPEN_FMODE(permflag); + if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && + (fmode & FMODE_WRITE) && IS_APPEND(inode)) { + error = -EPERM; + goto out_dput; + } + + if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) { + error = -EPERM; + goto out_dput; + } + + /* Can't write directories. */ + if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) { + error = -EISDIR; + goto out_dput; + } + + fd = get_unused_fd_flags(0); + if (fd < 0) { + error = fd; + goto out_dput; + } + + path.mnt = parfilp->f_path.mnt; + path.dentry = dentry; + filp = dentry_open(&path, hreq->oflags, cred); + dput(dentry); + if (IS_ERR(filp)) { + put_unused_fd(fd); + return PTR_ERR(filp); + } + + if (S_ISREG(inode->i_mode)) { + filp->f_flags |= O_NOATIME; + filp->f_mode |= FMODE_NOCMTIME; + } + + fd_install(fd, filp); + return fd; + + out_dput: + dput(dentry); + return error; +} + +int +xfs_readlink_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + struct dentry *dentry; + __u32 olen; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + dentry = xfs_handlereq_to_dentry(parfilp, hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + /* Restrict this handle operation to symlinks only. */ + if (!d_is_symlink(dentry)) { + error = -EINVAL; + goto out_dput; + } + + if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) { + error = -EFAULT; + goto out_dput; + } + + error = vfs_readlink(dentry, hreq->ohandle, olen); + + out_dput: + dput(dentry); + return error; +} + +/* + * Format an attribute and copy it out to the user's buffer. + * Take care to check values and protect against them changing later, + * we may be reading them directly out of a user buffer. + */ +static void +xfs_ioc_attr_put_listent( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + void *value, + int valuelen) +{ + struct xfs_attrlist *alist = context->buffer; + struct xfs_attrlist_ent *aep; + int arraytop; + + ASSERT(!context->seen_enough); + ASSERT(context->count >= 0); + ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); + ASSERT(context->firstu >= sizeof(*alist)); + ASSERT(context->firstu <= context->bufsize); + + /* + * Only list entries in the right namespace. 
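[Aside: the listent callback that continues below packs the caller's buffer from both ends: the xfs_attrlist header and al_offset[] array grow upward from offset zero while the name entries are placed downward from the end, with context->firstu tracking the low watermark. A small sketch of that invariant; the field names are stand-ins for the kernel's bookkeeping:]

#include <stdbool.h>
#include <stddef.h>

struct listbuf {
	size_t count;	/* entries placed so far, like alist->al_count */
	size_t firstu;	/* low edge of the entry area, like context->firstu */
	size_t hdrsize;	/* sizeof(struct xfs_attrlist) stand-in */
};

/* Fail (al_more = 1 in the real code) once the downward-growing
 * entries would collide with the upward-growing offset array. */
static bool listbuf_reserve(struct listbuf *lb, size_t entry_size)
{
	size_t arraytop = lb->hdrsize +
			(lb->count + 1) * sizeof(unsigned int);

	if (entry_size > lb->firstu || lb->firstu - entry_size < arraytop)
		return false;
	lb->firstu -= entry_size;
	lb->count++;
	return true;
}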
+ */ + if (context->attr_filter != (flags & XFS_ATTR_NSP_ONDISK_MASK)) + return; + + arraytop = sizeof(*alist) + + context->count * sizeof(alist->al_offset[0]); + + /* decrement by the actual bytes used by the attr */ + context->firstu -= round_up(offsetof(struct xfs_attrlist_ent, a_name) + + namelen + 1, sizeof(uint32_t)); + if (context->firstu < arraytop) { + trace_xfs_attr_list_full(context); + alist->al_more = 1; + context->seen_enough = 1; + return; + } + + aep = context->buffer + context->firstu; + aep->a_valuelen = valuelen; + memcpy(aep->a_name, name, namelen); + aep->a_name[namelen] = 0; + alist->al_offset[context->count++] = context->firstu; + alist->al_count = context->count; + trace_xfs_attr_list_add(context); +} + +static unsigned int +xfs_attr_filter( + u32 ioc_flags) +{ + if (ioc_flags & XFS_IOC_ATTR_ROOT) + return XFS_ATTR_ROOT; + if (ioc_flags & XFS_IOC_ATTR_SECURE) + return XFS_ATTR_SECURE; + return 0; +} + +static inline enum xfs_attr_update +xfs_xattr_flags( + u32 ioc_flags, + void *value) +{ + if (!value) + return XFS_ATTRUPDATE_REMOVE; + if (ioc_flags & XFS_IOC_ATTR_CREATE) + return XFS_ATTRUPDATE_CREATE; + if (ioc_flags & XFS_IOC_ATTR_REPLACE) + return XFS_ATTRUPDATE_REPLACE; + return XFS_ATTRUPDATE_UPSERT; +} + +int +xfs_ioc_attr_list( + struct xfs_inode *dp, + void __user *ubuf, + size_t bufsize, + int flags, + struct xfs_attrlist_cursor __user *ucursor) +{ + struct xfs_attr_list_context context = { }; + struct xfs_attrlist *alist; + void *buffer; + int error; + + if (bufsize < sizeof(struct xfs_attrlist) || + bufsize > XFS_XATTR_LIST_MAX) + return -EINVAL; + + /* + * Reject flags, only allow namespaces. + */ + if (flags & ~(XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE)) + return -EINVAL; + if (flags == (XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE)) + return -EINVAL; + + /* + * Validate the cursor. + */ + if (copy_from_user(&context.cursor, ucursor, sizeof(context.cursor))) + return -EFAULT; + if (context.cursor.pad1 || context.cursor.pad2) + return -EINVAL; + if (!context.cursor.initted && + (context.cursor.hashval || context.cursor.blkno || + context.cursor.offset)) + return -EINVAL; + + buffer = kvzalloc(bufsize, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + /* + * Initialize the output buffer. 
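[Aside: restating the xfs_xattr_flags() mapping above as a standalone truth table may help: the value pointer and the two ioctl flags collapse into one of four update modes. The enum and flag names below are stand-ins for the kernel's:]

enum update { UPD_REMOVE, UPD_CREATE, UPD_REPLACE, UPD_UPSERT };

#define F_CREATE	(1u << 0)	/* stands in for XFS_IOC_ATTR_CREATE */
#define F_REPLACE	(1u << 1)	/* stands in for XFS_IOC_ATTR_REPLACE */

static enum update xattr_update_mode(unsigned int flags, const void *value)
{
	if (!value)
		return UPD_REMOVE;	/* no value buffer means removal */
	if (flags & F_CREATE)
		return UPD_CREATE;	/* fail if the attr already exists */
	if (flags & F_REPLACE)
		return UPD_REPLACE;	/* fail if the attr does not exist */
	return UPD_UPSERT;		/* create or overwrite */
}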
+ */ + context.dp = dp; + context.resynch = 1; + context.attr_filter = xfs_attr_filter(flags); + context.buffer = buffer; + context.bufsize = round_down(bufsize, sizeof(uint32_t)); + context.firstu = context.bufsize; + context.put_listent = xfs_ioc_attr_put_listent; + + alist = context.buffer; + alist->al_count = 0; + alist->al_more = 0; + alist->al_offset[0] = context.bufsize; + + error = xfs_attr_list(&context); + if (error) + goto out_free; + + if (copy_to_user(ubuf, buffer, bufsize) || + copy_to_user(ucursor, &context.cursor, sizeof(context.cursor))) + error = -EFAULT; +out_free: + kvfree(buffer); + return error; +} + +int +xfs_attrlist_by_handle( + struct file *parfilp, + struct xfs_fsop_attrlist_handlereq __user *p) +{ + struct xfs_fsop_attrlist_handlereq al_hreq; + struct dentry *dentry; + int error = -ENOMEM; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&al_hreq, p, sizeof(al_hreq))) + return -EFAULT; + + dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), al_hreq.buffer, + al_hreq.buflen, al_hreq.flags, &p->pos); + dput(dentry); + return error; +} + +static int +xfs_attrmulti_attr_get( + struct inode *inode, + unsigned char *name, + unsigned char __user *ubuf, + uint32_t *len, + uint32_t flags) +{ + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = xfs_attr_filter(flags), + .name = name, + .namelen = strlen(name), + .valuelen = *len, + }; + int error; + + if (*len > XFS_XATTR_SIZE_MAX) + return -EINVAL; + + error = xfs_attr_get(&args); + if (error) + goto out_kfree; + + *len = args.valuelen; + if (copy_to_user(ubuf, args.value, args.valuelen)) + error = -EFAULT; + +out_kfree: + kvfree(args.value); + return error; +} + +static int +xfs_attrmulti_attr_set( + struct inode *inode, + unsigned char *name, + const unsigned char __user *ubuf, + uint32_t len, + uint32_t flags) +{ + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = xfs_attr_filter(flags), + .name = name, + .namelen = strlen(name), + }; + int error; + + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + + if (ubuf) { + if (len > XFS_XATTR_SIZE_MAX) + return -EINVAL; + args.value = memdup_user(ubuf, len); + if (IS_ERR(args.value)) + return PTR_ERR(args.value); + args.valuelen = len; + } + + error = xfs_attr_change(&args, xfs_xattr_flags(flags, args.value)); + if (!error && (flags & XFS_IOC_ATTR_ROOT)) + xfs_forget_acl(inode, name); + kfree(args.value); + return error; +} + +int +xfs_ioc_attrmulti_one( + struct file *parfilp, + struct inode *inode, + uint32_t opcode, + void __user *uname, + void __user *value, + uint32_t *len, + uint32_t flags) +{ + unsigned char *name; + int error; + + if ((flags & XFS_IOC_ATTR_ROOT) && (flags & XFS_IOC_ATTR_SECURE)) + return -EINVAL; + + name = strndup_user(uname, MAXNAMELEN); + if (IS_ERR(name)) + return PTR_ERR(name); + + switch (opcode) { + case ATTR_OP_GET: + error = xfs_attrmulti_attr_get(inode, name, value, len, flags); + break; + case ATTR_OP_REMOVE: + value = NULL; + *len = 0; + fallthrough; + case ATTR_OP_SET: + error = mnt_want_write_file(parfilp); + if (error) + break; + error = xfs_attrmulti_attr_set(inode, name, value, *len, flags); + mnt_drop_write_file(parfilp); + break; + default: + error = -EINVAL; + break; + } + + kfree(name); + return error; +} + +int +xfs_attrmulti_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + xfs_attr_multiop_t *ops; + xfs_fsop_attrmulti_handlereq_t 
am_hreq; + struct dentry *dentry; + unsigned int i, size; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) + return -EFAULT; + + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t)) + return -E2BIG; + + dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = -E2BIG; + size = am_hreq.opcount * sizeof(xfs_attr_multiop_t); + if (!size || size > 16 * PAGE_SIZE) + goto out_dput; + + ops = memdup_user(am_hreq.ops, size); + if (IS_ERR(ops)) { + error = PTR_ERR(ops); + goto out_dput; + } + + error = 0; + for (i = 0; i < am_hreq.opcount; i++) { + ops[i].am_error = xfs_ioc_attrmulti_one(parfilp, + d_inode(dentry), ops[i].am_opcode, + ops[i].am_attrname, ops[i].am_attrvalue, + &ops[i].am_length, ops[i].am_flags); + } + + if (copy_to_user(am_hreq.ops, ops, size)) + error = -EFAULT; + + kfree(ops); + out_dput: + dput(dentry); + return error; +} + +struct xfs_getparents_ctx { + struct xfs_attr_list_context context; + struct xfs_getparents_by_handle gph; + + /* File to target */ + struct xfs_inode *ip; + + /* Internal buffer where we format records */ + void *krecords; + + /* Last record filled out */ + struct xfs_getparents_rec *lastrec; + + unsigned int count; +}; + +static inline unsigned int +xfs_getparents_rec_sizeof( + unsigned int namelen) +{ + return round_up(sizeof(struct xfs_getparents_rec) + namelen + 1, + sizeof(uint64_t)); +} + +static void +xfs_getparents_put_listent( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + void *value, + int valuelen) +{ + struct xfs_getparents_ctx *gpx = + container_of(context, struct xfs_getparents_ctx, context); + struct xfs_inode *ip = context->dp; + struct xfs_mount *mp = ip->i_mount; + struct xfs_getparents *gp = &gpx->gph.gph_request; + struct xfs_getparents_rec *gpr = gpx->krecords + context->firstu; + unsigned short reclen = + xfs_getparents_rec_sizeof(namelen); + xfs_ino_t ino; + uint32_t gen; + int error; + + if (!(flags & XFS_ATTR_PARENT)) + return; + + error = xfs_parent_from_attr(mp, flags, name, namelen, value, valuelen, + &ino, &gen); + if (error) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_PARENT); + context->seen_enough = -EFSCORRUPTED; + return; + } + + /* + * We found a parent pointer, but we've filled up the buffer. Signal + * to the caller that we did /not/ reach the end of the parent pointer + * recordset. + */ + if (context->firstu > context->bufsize - reclen) { + context->seen_enough = 1; + return; + } + + /* Format the parent pointer directly into the caller buffer. */ + gpr->gpr_reclen = reclen; + xfs_filehandle_init(mp, ino, gen, &gpr->gpr_parent); + memcpy(gpr->gpr_name, name, namelen); + gpr->gpr_name[namelen] = 0; + + trace_xfs_getparents_put_listent(ip, gp, context, gpr); + + context->firstu += reclen; + gpx->count++; + gpx->lastrec = gpr; +} + +/* Expand the last record to fill the rest of the caller's buffer. 
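[Aside: the record sizing above keeps every parent pointer record 8-byte aligned so the next record's fixed fields stay naturally aligned in the output buffer. A sketch of the arithmetic, with REC_HDR as a hypothetical stand-in for sizeof(struct xfs_getparents_rec):]

#include <stdint.h>
#include <stdio.h>

#define ROUND_UP(x, a)	(((x) + (a) - 1) & ~((uint64_t)(a) - 1))
#define REC_HDR		40u	/* illustrative header size only */

/* Record size = fixed header + name + NUL, rounded to 8 bytes. */
static uint64_t rec_sizeof(unsigned int namelen)
{
	return ROUND_UP(REC_HDR + namelen + 1, sizeof(uint64_t));
}

int main(void)
{
	/* e.g. a 3-byte name: 40 + 3 + 1 = 44, rounds up to 48 */
	printf("%llu\n", (unsigned long long)rec_sizeof(3));
	return 0;
}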
*/ +static inline void +xfs_getparents_expand_lastrec( + struct xfs_getparents_ctx *gpx) +{ + struct xfs_getparents *gp = &gpx->gph.gph_request; + struct xfs_getparents_rec *gpr = gpx->lastrec; + + if (!gpx->lastrec) + gpr = gpx->krecords; + + gpr->gpr_reclen = gp->gp_bufsize - ((void *)gpr - gpx->krecords); + + trace_xfs_getparents_expand_lastrec(gpx->ip, gp, &gpx->context, gpr); +} + +static inline void __user *u64_to_uptr(u64 val) +{ + return (void __user *)(uintptr_t)val; +} + +/* Retrieve the parent pointers for a given inode. */ +STATIC int +xfs_getparents( + struct xfs_getparents_ctx *gpx) +{ + struct xfs_getparents *gp = &gpx->gph.gph_request; + struct xfs_inode *ip = gpx->ip; + struct xfs_mount *mp = ip->i_mount; + size_t bufsize; + int error; + + /* Check size of buffer requested by user */ + if (gp->gp_bufsize > XFS_XATTR_LIST_MAX) + return -ENOMEM; + if (gp->gp_bufsize < xfs_getparents_rec_sizeof(1)) + return -EINVAL; + + if (gp->gp_iflags & ~XFS_GETPARENTS_IFLAGS_ALL) + return -EINVAL; + if (gp->gp_reserved) + return -EINVAL; + + bufsize = round_down(gp->gp_bufsize, sizeof(uint64_t)); + gpx->krecords = kvzalloc(bufsize, GFP_KERNEL); + if (!gpx->krecords) { + bufsize = min(bufsize, PAGE_SIZE); + gpx->krecords = kvzalloc(bufsize, GFP_KERNEL); + if (!gpx->krecords) + return -ENOMEM; + } + + gpx->context.dp = ip; + gpx->context.resynch = 1; + gpx->context.put_listent = xfs_getparents_put_listent; + gpx->context.bufsize = bufsize; + /* firstu is used to track the bytes filled in the buffer */ + gpx->context.firstu = 0; + + /* Copy the cursor provided by caller */ + memcpy(&gpx->context.cursor, &gp->gp_cursor, + sizeof(struct xfs_attrlist_cursor)); + gpx->count = 0; + gp->gp_oflags = 0; + + trace_xfs_getparents_begin(ip, gp, &gpx->context.cursor); + + error = xfs_attr_list(&gpx->context); + if (error) + goto out_free_buf; + if (gpx->context.seen_enough < 0) { + error = gpx->context.seen_enough; + goto out_free_buf; + } + xfs_getparents_expand_lastrec(gpx); + + /* Update the caller with the current cursor position */ + memcpy(&gp->gp_cursor, &gpx->context.cursor, + sizeof(struct xfs_attrlist_cursor)); + + /* Is this the root directory? */ + if (ip->i_ino == mp->m_sb.sb_rootino) + gp->gp_oflags |= XFS_GETPARENTS_OFLAG_ROOT; + + if (gpx->context.seen_enough == 0) { + /* + * If we did not run out of buffer space, then we reached the + * end of the pptr recordset, so set the DONE flag. + */ + gp->gp_oflags |= XFS_GETPARENTS_OFLAG_DONE; + } else if (gpx->count == 0) { + /* + * If we ran out of buffer space before copying any parent + * pointers at all, the caller's buffer was too short. Tell + * userspace that, erm, the message is too long. + */ + error = -EMSGSIZE; + goto out_free_buf; + } + + trace_xfs_getparents_end(ip, gp, &gpx->context.cursor); + + ASSERT(gpx->context.firstu <= gpx->gph.gph_request.gp_bufsize); + + /* Copy the records to userspace. */ + if (copy_to_user(u64_to_uptr(gpx->gph.gph_request.gp_buffer), + gpx->krecords, gpx->context.firstu)) + error = -EFAULT; + +out_free_buf: + kvfree(gpx->krecords); + gpx->krecords = NULL; + return error; +} + +/* Retrieve the parents of this file and pass them back to userspace. 
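[Aside: from userspace, the cursor in gp_cursor makes this a resumable iteration: call the ioctl repeatedly, reusing the request structure, until gp_oflags reports DONE. A hedged sketch assuming the new uapi structures and the XFS_IOC_GETPARENTS ioctl; error handling is trimmed, and the call is root-only as xfs_ioc_getparents() enforces:]

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>	/* struct xfs_getparents, XFS_IOC_GETPARENTS */

static void list_parents(int fd)
{
	struct xfs_getparents gp;
	char *buf = calloc(1, 65536);

	memset(&gp, 0, sizeof(gp));
	gp.gp_buffer = (uintptr_t)buf;
	gp.gp_bufsize = 65536;

	do {
		unsigned int off = 0;

		if (ioctl(fd, XFS_IOC_GETPARENTS, &gp) < 0)
			break;
		/* walk the variable-length records the kernel formatted */
		while (off < gp.gp_bufsize) {
			struct xfs_getparents_rec *gpr =
				(struct xfs_getparents_rec *)(buf + off);

			if (!gpr->gpr_reclen)
				break;
			printf("parent ino %llu name '%s'\n",
			       (unsigned long long)gpr->gpr_parent.ha_fid.fid_ino,
			       gpr->gpr_name);
			off += gpr->gpr_reclen;
		}
		/* the kernel writes the updated cursor back into gp */
	} while (!(gp.gp_oflags & XFS_GETPARENTS_OFLAG_DONE));

	free(buf);
}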
*/ +int +xfs_ioc_getparents( + struct file *file, + struct xfs_getparents __user *ureq) +{ + struct xfs_getparents_ctx gpx = { + .ip = XFS_I(file_inode(file)), + }; + struct xfs_getparents *kreq = &gpx.gph.gph_request; + struct xfs_mount *mp = gpx.ip->i_mount; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!xfs_has_parent(mp)) + return -EOPNOTSUPP; + if (copy_from_user(kreq, ureq, sizeof(*kreq))) + return -EFAULT; + + error = xfs_getparents(&gpx); + if (error) + return error; + + if (copy_to_user(ureq, kreq, sizeof(*kreq))) + return -EFAULT; + + return 0; +} + +/* Retrieve the parents of this file handle and pass them back to userspace. */ +int +xfs_ioc_getparents_by_handle( + struct file *file, + struct xfs_getparents_by_handle __user *ureq) +{ + struct xfs_getparents_ctx gpx = { }; + struct xfs_inode *ip = XFS_I(file_inode(file)); + struct xfs_mount *mp = ip->i_mount; + struct xfs_getparents_by_handle *kreq = &gpx.gph; + struct xfs_handle *handle = &kreq->gph_handle; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!xfs_has_parent(mp)) + return -EOPNOTSUPP; + if (copy_from_user(kreq, ureq, sizeof(*kreq))) + return -EFAULT; + + /* + * We don't use exportfs_decode_fh because it does too much work here. + * If the handle refers to a directory, the exportfs code will walk + * upwards through the directory tree to connect the dentries to the + * root directory dentry. For GETPARENTS we don't care about that + * because we're not actually going to open a file descriptor; we only + * want to open an inode and read its parent pointers. + * + * Note that xfs_scrub uses GETPARENTS to log that it will try to fix a + * corrupted file's metadata. For this usecase we would really rather + * userspace single-step the path reconstruction to avoid loops or + * other strange things if the directory tree is corrupt. + */ + gpx.ip = xfs_khandle_to_inode(file, handle); + if (IS_ERR(gpx.ip)) + return PTR_ERR(gpx.ip); + + error = xfs_getparents(&gpx); + if (error) + goto out_rele; + + if (copy_to_user(ureq, kreq, sizeof(*kreq))) + error = -EFAULT; + +out_rele: + xfs_irele(gpx.ip); + return error; +} diff --git a/fs/xfs/xfs_handle.h b/fs/xfs/xfs_handle.h new file mode 100644 index 000000000000..6799a86d8565 --- /dev/null +++ b/fs/xfs/xfs_handle.h @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2022-2024 Oracle. + * All rights reserved. 
+ */ +#ifndef __XFS_HANDLE_H__ +#define __XFS_HANDLE_H__ + +int xfs_attrlist_by_handle(struct file *parfilp, + struct xfs_fsop_attrlist_handlereq __user *p); +int xfs_attrmulti_by_handle(struct file *parfilp, void __user *arg); + +int xfs_find_handle(unsigned int cmd, struct xfs_fsop_handlereq *hreq); +int xfs_open_by_handle(struct file *parfilp, struct xfs_fsop_handlereq *hreq); +int xfs_readlink_by_handle(struct file *parfilp, + struct xfs_fsop_handlereq *hreq); + +int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode, + uint32_t opcode, void __user *uname, void __user *value, + uint32_t *len, uint32_t flags); +int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, + size_t bufsize, int flags, + struct xfs_attrlist_cursor __user *ucursor); + +struct dentry *xfs_handle_to_dentry(struct file *parfilp, void __user *uhandle, + u32 hlen); + +int xfs_ioc_getparents(struct file *file, struct xfs_getparents __user *arg); +int xfs_ioc_getparents_by_handle(struct file *file, + struct xfs_getparents_by_handle __user *arg); + +#endif /* __XFS_HANDLE_H__ */ diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index b39f959146bc..10f116d093a2 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -470,6 +470,7 @@ static const struct ioctl_sick_map ino_map[] = { { XFS_SICK_INO_BMBTA_ZAPPED, XFS_BS_SICK_BMBTA }, { XFS_SICK_INO_DIR_ZAPPED, XFS_BS_SICK_DIR }, { XFS_SICK_INO_SYMLINK_ZAPPED, XFS_BS_SICK_SYMLINK }, + { XFS_SICK_INO_DIRTREE, XFS_BS_SICK_DIRTREE }, { 0, 0 }, }; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 74f1812b03cb..0953163a2d84 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -613,7 +613,6 @@ xfs_iget_cache_miss( struct xfs_inode *ip; int error; xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); - int iflags; ip = xfs_inode_alloc(mp, ino); if (!ip) @@ -693,13 +692,12 @@ xfs_iget_cache_miss( * memory barrier that ensures this detection works correctly at lookup * time. */ - iflags = XFS_INEW; if (flags & XFS_IGET_DONTCACHE) d_mark_dontcache(VFS_I(ip)); ip->i_udquot = NULL; ip->i_gdquot = NULL; ip->i_pdquot = NULL; - xfs_iflags_set(ip, iflags); + xfs_iflags_set(ip, XFS_INEW); /* insert the new inode */ spin_lock(&pag->pag_ici_lock); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d55b42b2480d..58fb7a5062e1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -16,6 +16,7 @@ #include "xfs_inode.h" #include "xfs_dir2.h" #include "xfs_attr.h" +#include "xfs_bit.h" #include "xfs_trans_space.h" #include "xfs_trans.h" #include "xfs_buf_item.h" @@ -38,13 +39,12 @@ #include "xfs_ag.h" #include "xfs_log_priv.h" #include "xfs_health.h" +#include "xfs_pnfs.h" +#include "xfs_parent.h" +#include "xfs_xattr.h" struct kmem_cache *xfs_inode_cache; -STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); -STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, - struct xfs_inode *); - /* * helper function to extract extent size hint from inode */ @@ -60,7 +60,8 @@ xfs_get_extsz_hint( return 0; if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize) return ip->i_extsize; - if (XFS_IS_REALTIME_INODE(ip)) + if (XFS_IS_REALTIME_INODE(ip) && + ip->i_mount->m_sb.sb_rextsize > 1) return ip->i_mount->m_sb.sb_rextsize; return 0; } @@ -420,7 +421,7 @@ xfs_lock_inumorder( * lock more than one at a time, lockdep will report false positives saying we * have violated locking orders. 
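[Aside: the rule that xfs_lock_inodes(), made non-static by the hunk above, relies on is the classic total-order trick: every caller locks inodes in ascending i_ino order, so no two tasks can ever hold pieces of each other's lock set. A minimal pthread illustration of the invariant, separate from the kernel's lockdep-subclass machinery:]

#include <pthread.h>
#include <stdint.h>

struct node {
	uint64_t ino;		/* unique ordering key, like i_ino */
	pthread_mutex_t lock;
};

/* Always lock the lower-numbered inode first: any two threads that
 * need the same pair then agree on the order, so no ABBA deadlock. */
static void lock_pair(struct node *a, struct node *b)
{
	if (a->ino > b->ino) {
		struct node *t = a;
		a = b;
		b = t;
	}
	pthread_mutex_lock(&a->lock);
	if (a != b)
		pthread_mutex_lock(&b->lock);
}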
*/ -static void +void xfs_lock_inodes( struct xfs_inode **ips, int inodes, @@ -749,6 +750,8 @@ xfs_inode_inherit_flags2( /* * Initialise a newly allocated inode and return the in-core inode to the * caller locked exclusively. + * + * Caller is responsible for unlocking the inode manually upon return */ int xfs_init_new_inode( @@ -875,7 +878,7 @@ xfs_init_new_inode( /* * Log the new values stuffed into the inode. */ - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, flags); /* now that we have an i_mode we can setup the inode structure */ @@ -890,24 +893,27 @@ xfs_init_new_inode( * link count to go to zero, move the inode to AGI unlinked list so that it can * be freed when the last active reference goes away via xfs_inactive(). */ -static int /* error */ +int xfs_droplink( - xfs_trans_t *tp, - xfs_inode_t *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - if (VFS_I(ip)->i_nlink == 0) { - xfs_alert(ip->i_mount, - "%s: Attempt to drop inode (%llu) with nlink zero.", - __func__, ip->i_ino); - return -EFSCORRUPTED; - } + struct inode *inode = VFS_I(ip); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - drop_nlink(VFS_I(ip)); + if (inode->i_nlink == 0) { + xfs_info_ratelimited(tp->t_mountp, + "Inode 0x%llx link count dropped below zero. Pinning link count.", + ip->i_ino); + set_nlink(inode, XFS_NLINK_PINNED); + } + if (inode->i_nlink != XFS_NLINK_PINNED) + drop_nlink(inode); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (VFS_I(ip)->i_nlink) + if (inode->i_nlink) return 0; return xfs_iunlink(tp, ip); @@ -916,14 +922,22 @@ xfs_droplink( /* * Increment the link count on an inode & log the change. */ -static void +void xfs_bumplink( - xfs_trans_t *tp, - xfs_inode_t *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { + struct inode *inode = VFS_I(ip); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - inc_nlink(VFS_I(ip)); + if (inode->i_nlink == XFS_NLINK_PINNED - 1) + xfs_info_ratelimited(tp->t_mountp, + "Inode 0x%llx link count exceeded maximum. Pinning link count.", + ip->i_ino); + if (inode->i_nlink != XFS_NLINK_PINNED) + inc_nlink(inode); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -1005,7 +1019,7 @@ xfs_dir_hook_setup( int xfs_create( struct mnt_idmap *idmap, - xfs_inode_t *dp, + struct xfs_inode *dp, struct xfs_name *name, umode_t mode, dev_t rdev, @@ -1017,7 +1031,7 @@ xfs_create( struct xfs_inode *ip = NULL; struct xfs_trans *tp = NULL; int error; - bool unlock_dp_on_error = false; + bool unlock_dp_on_error = false; prid_t prid; struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; @@ -1025,6 +1039,7 @@ xfs_create( struct xfs_trans_res *tres; uint resblks; xfs_ino_t ino; + struct xfs_parent_args *ppargs; trace_xfs_create(dp, name); @@ -1046,13 +1061,17 @@ xfs_create( return error; if (is_dir) { - resblks = XFS_MKDIR_SPACE_RES(mp, name->len); + resblks = xfs_mkdir_space_res(mp, name->len); tres = &M_RES(mp)->tr_mkdir; } else { - resblks = XFS_CREATE_SPACE_RES(mp, name->len); + resblks = xfs_create_space_res(mp, name->len); tres = &M_RES(mp)->tr_create; } + error = xfs_parent_start(mp, &ppargs); + if (error) + goto out_release_dquots; + /* * Initially assume that the file does not exist and * reserve the resources for that case. 
If that is not @@ -1068,7 +1087,7 @@ xfs_create( resblks, &tp); } if (error) - goto out_release_dquots; + goto out_parent; xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; @@ -1092,8 +1111,7 @@ xfs_create( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - unlock_dp_on_error = false; + xfs_trans_ijoin(tp, dp, 0); error = xfs_dir_createname(tp, dp, name, ip->i_ino, resblks - XFS_IALLOC_SPACE_RES(mp)); @@ -1113,6 +1131,16 @@ xfs_create( } /* + * If we have parent pointers, we need to add the attribute containing + * the parent information now. + */ + if (ppargs) { + error = xfs_parent_addname(tp, ppargs, dp, name, ip); + if (error) + goto out_trans_cancel; + } + + /* * Create ip with a reference from dp, and add '.' and '..' references * if it's a directory. */ @@ -1142,6 +1170,9 @@ xfs_create( xfs_qm_dqrele(pdqp); *ipp = ip; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + xfs_parent_finish(mp, ppargs); return 0; out_trans_cancel: @@ -1153,9 +1184,12 @@ xfs_create( * transactions and deadlocks from xfs_inactive. */ if (ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_finish_inode_setup(ip); xfs_irele(ip); } + out_parent: + xfs_parent_finish(mp, ppargs); out_release_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -1171,6 +1205,7 @@ xfs_create_tmpfile( struct mnt_idmap *idmap, struct xfs_inode *dp, umode_t mode, + bool init_xattrs, struct xfs_inode **ipp) { struct xfs_mount *mp = dp->i_mount; @@ -1211,7 +1246,7 @@ xfs_create_tmpfile( error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); if (!error) error = xfs_init_new_inode(idmap, tp, dp, ino, mode, - 0, 0, prid, false, &ip); + 0, 0, prid, init_xattrs, &ip); if (error) goto out_trans_cancel; @@ -1238,6 +1273,7 @@ xfs_create_tmpfile( xfs_qm_dqrele(pdqp); *ipp = ip; + xfs_iunlock(ip, XFS_ILOCK_EXCL); return 0; out_trans_cancel: @@ -1249,6 +1285,7 @@ xfs_create_tmpfile( * transactions and deadlocks from xfs_inactive. */ if (ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_finish_inode_setup(ip); xfs_irele(ip); } @@ -1262,14 +1299,15 @@ xfs_create_tmpfile( int xfs_link( - xfs_inode_t *tdp, - xfs_inode_t *sip, + struct xfs_inode *tdp, + struct xfs_inode *sip, struct xfs_name *target_name) { - xfs_mount_t *mp = tdp->i_mount; - xfs_trans_t *tp; + struct xfs_mount *mp = tdp->i_mount; + struct xfs_trans *tp; int error, nospace_error = 0; int resblks; + struct xfs_parent_args *ppargs; trace_xfs_link(tdp, target_name); @@ -1288,11 +1326,25 @@ xfs_link( if (error) goto std_return; - resblks = XFS_LINK_SPACE_RES(mp, target_name->len); + error = xfs_parent_start(mp, &ppargs); + if (error) + goto std_return; + + resblks = xfs_link_space_res(mp, target_name->len); error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks, &tp, &nospace_error); if (error) - goto std_return; + goto out_parent; + + /* + * We don't allow reservationless or quotaless hardlinking when parent + * pointers are enabled because we can't back out if the xattrs must + * grow. + */ + if (ppargs && nospace_error) { + error = nospace_error; + goto error_return; + } /* * If we are using project inheritance, we only allow hard link @@ -1343,6 +1395,19 @@ xfs_link( xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); xfs_bumplink(tp, sip); + + /* + * If we have parent pointers, we now need to add the parent record to + * the attribute fork of the inode. 
If this is the initial parent + * attribute, we need to create it correctly, otherwise we can just add + * the parent to the inode. + */ + if (ppargs) { + error = xfs_parent_addname(tp, ppargs, tdp, target_name, sip); + if (error) + goto error_return; + } + xfs_dir_update_hook(tdp, sip, 1, target_name); /* @@ -1353,10 +1418,18 @@ xfs_link( if (xfs_has_wsync(mp) || xfs_has_dirsync(mp)) xfs_trans_set_sync(tp); - return xfs_trans_commit(tp); + error = xfs_trans_commit(tp); + xfs_iunlock(tdp, XFS_ILOCK_EXCL); + xfs_iunlock(sip, XFS_ILOCK_EXCL); + xfs_parent_finish(mp, ppargs); + return error; error_return: xfs_trans_cancel(tp); + xfs_iunlock(tdp, XFS_ILOCK_EXCL); + xfs_iunlock(sip, XFS_ILOCK_EXCL); + out_parent: + xfs_parent_finish(mp, ppargs); std_return: if (error == -ENOSPC && nospace_error) error = nospace_error; @@ -1555,6 +1628,51 @@ out_unlock: } /* + * Mark all the buffers attached to this directory stale. In theory we should + * never be freeing a directory with any blocks at all, but this covers the + * case where we've recovered a directory swap with a "temporary" directory + * created by online repair and now need to dump it. + */ +STATIC void +xfs_inactive_dir( + struct xfs_inode *dp) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = mp->m_dir_geo; + struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK); + xfs_fileoff_t off; + + /* + * Invalidate each directory block. All directory blocks are of + * fsbcount length and alignment, so we only need to walk those same + * offsets. We hold the only reference to this inode, so we must wait + * for the buffer locks. + */ + for_each_xfs_iext(ifp, &icur, &got) { + for (off = round_up(got.br_startoff, geo->fsbcount); + off < got.br_startoff + got.br_blockcount; + off += geo->fsbcount) { + struct xfs_buf *bp = NULL; + xfs_fsblock_t fsbno; + int error; + + fsbno = (off - got.br_startoff) + got.br_startblock; + error = xfs_buf_incore(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, fsbno), + XFS_FSB_TO_BB(mp, geo->fsbcount), + XBF_LIVESCAN, &bp); + if (error) + continue; + + xfs_buf_stale(bp); + xfs_buf_relse(bp); + } + } +} + +/* * xfs_inactive_truncate * * Called to perform a truncate when an inode becomes unlinked. @@ -1864,6 +1982,11 @@ xfs_inactive( goto out; } + if (S_ISDIR(VFS_I(ip)->i_mode) && ip->i_df.if_nextents > 0) { + xfs_inactive_dir(ip); + truncate = 1; + } + if (S_ISLNK(VFS_I(ip)->i_mode)) error = xfs_inactive_symlink(ip); else if (truncate) @@ -1937,7 +2060,7 @@ out: * only unlinked, referenced inodes can be on the unlinked inode list. If we * don't find the inode in cache, then let the caller handle the situation. */ -static struct xfs_inode * +struct xfs_inode * xfs_iunlink_lookup( struct xfs_perag *pag, xfs_agino_t agino) @@ -2150,7 +2273,7 @@ xfs_iunlink_insert_inode( * We place the on-disk inode on a list in the AGI. It will be pulled from this * list when the inode is freed. */ -STATIC int +int xfs_iunlink( struct xfs_trans *tp, struct xfs_inode *ip) @@ -2167,7 +2290,7 @@ xfs_iunlink( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); /* Get the agi buffer first. It ensures lock ordering on the list. */ - error = xfs_read_agi(pag, tp, &agibp); + error = xfs_read_agi(pag, tp, 0, &agibp); if (error) goto out; @@ -2252,7 +2375,7 @@ xfs_iunlink_remove_inode( /* * Pull the on-disk inode from the AGI unlinked list. 
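[Aside: the xfs_droplink()/xfs_bumplink() hunks above stop treating a bad link count as a hard error and instead saturate it: once i_nlink reaches XFS_NLINK_PINNED it never moves again, and an underflow pins it rather than returning EFSCORRUPTED. A toy model of that saturation; the pinned value below is a stand-in:]

#define NLINK_PINNED	0xffffffffU	/* stand-in for XFS_NLINK_PINNED */

/* Once pinned, the count never moves in either direction; the inode
 * simply stays around until repair can fix the count. */
static unsigned int bump(unsigned int nlink)
{
	return nlink == NLINK_PINNED ? nlink : nlink + 1;
}

static unsigned int drop(unsigned int nlink)
{
	if (nlink == 0)
		return NLINK_PINNED;	/* underflow: pin instead of failing */
	return nlink == NLINK_PINNED ? nlink : nlink - 1;
}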
*/ -STATIC int +int xfs_iunlink_remove( struct xfs_trans *tp, struct xfs_perag *pag, @@ -2264,7 +2387,7 @@ xfs_iunlink_remove( trace_xfs_iunlink_remove(ip); /* Get the agi buffer first. It ensures lock ordering on the list. */ - error = xfs_read_agi(pag, tp, &agibp); + error = xfs_read_agi(pag, tp, 0, &agibp); if (error) return error; @@ -2598,16 +2721,17 @@ xfs_iunpin_wait( */ int xfs_remove( - xfs_inode_t *dp, + struct xfs_inode *dp, struct xfs_name *name, - xfs_inode_t *ip) + struct xfs_inode *ip) { - xfs_mount_t *mp = dp->i_mount; - xfs_trans_t *tp = NULL; + struct xfs_mount *mp = dp->i_mount; + struct xfs_trans *tp = NULL; int is_dir = S_ISDIR(VFS_I(ip)->i_mode); int dontcare; int error = 0; uint resblks; + struct xfs_parent_args *ppargs; trace_xfs_remove(dp, name); @@ -2624,6 +2748,10 @@ xfs_remove( if (error) goto std_return; + error = xfs_parent_start(mp, &ppargs); + if (error) + goto std_return; + /* * We try to get the real space reservation first, allowing for * directory btree deletion(s) implying possible bmap insert(s). If we @@ -2635,12 +2763,12 @@ xfs_remove( * the directory code can handle a reservationless update and we don't * want to prevent a user from trying to free space by deleting things. */ - resblks = XFS_REMOVE_SPACE_RES(mp); + resblks = xfs_remove_space_res(mp, name->len); error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks, &tp, &dontcare); if (error) { ASSERT(error != -ENOSPC); - goto std_return; + goto out_parent; } /* @@ -2700,6 +2828,13 @@ xfs_remove( goto out_trans_cancel; } + /* Remove parent pointer. */ + if (ppargs) { + error = xfs_parent_removename(tp, ppargs, dp, name, ip); + if (error) + goto out_trans_cancel; + } + /* * Drop the link from dp to ip, and if ip was a directory, remove the * '.' and '..' references since we freed the directory. @@ -2716,19 +2851,42 @@ xfs_remove( error = xfs_trans_commit(tp); if (error) - goto std_return; + goto out_unlock; if (is_dir && xfs_inode_is_filestream(ip)) xfs_filestream_deassociate(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + xfs_parent_finish(mp, ppargs); return 0; out_trans_cancel: xfs_trans_cancel(tp); + out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + out_parent: + xfs_parent_finish(mp, ppargs); std_return: return error; } +static inline void +xfs_iunlock_rename( + struct xfs_inode **i_tab, + int num_inodes) +{ + int i; + + for (i = num_inodes - 1; i >= 0; i--) { + /* Skip duplicate inodes if src and target dps are the same */ + if (!i_tab[i] || (i > 0 && i_tab[i] == i_tab[i - 1])) + continue; + xfs_iunlock(i_tab[i], XFS_ILOCK_EXCL); + } +} + /* * Enter all inodes for a rename transaction into a sorted array. */ @@ -2743,7 +2901,7 @@ xfs_sort_for_rename( struct xfs_inode **i_tab,/* out: sorted array of inodes */ int *num_inodes) /* in/out: inodes in array */ { - int i, j; + int i; ASSERT(*num_inodes == __XFS_SORT_INODES); memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *)); @@ -2765,17 +2923,26 @@ xfs_sort_for_rename( i_tab[i++] = wip; *num_inodes = i; + xfs_sort_inodes(i_tab, *num_inodes); +} + +void +xfs_sort_inodes( + struct xfs_inode **i_tab, + unsigned int num_inodes) +{ + int i, j; + + ASSERT(num_inodes <= __XFS_SORT_INODES); + /* * Sort the elements via bubble sort. (Remember, there are at * most 5 elements to sort, so this is adequate.) 
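[Aside: xfs_iunlock_rename() above has to tolerate the same inode appearing twice in the sorted table, since a rename within a single directory passes that directory as both source and target parent. A sketch of the reverse walk with duplicate skipping, using bare inode numbers in place of struct xfs_inode:]

#include <stdint.h>
#include <stdio.h>

static void unlock_all(const uint64_t *tab, int n)
{
	for (int i = n - 1; i >= 0; i--) {
		/* duplicate slot: unlock only at its first occurrence */
		if (i > 0 && tab[i] == tab[i - 1])
			continue;
		printf("unlock inode %llu\n", (unsigned long long)tab[i]);
	}
}

int main(void)
{
	uint64_t tab[] = { 128, 128, 261, 517 };  /* sorted, one duplicate */

	unlock_all(tab, 4);
	return 0;
}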
*/ - for (i = 0; i < *num_inodes; i++) { - for (j = 1; j < *num_inodes; j++) { - if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { - struct xfs_inode *temp = i_tab[j]; - i_tab[j] = i_tab[j-1]; - i_tab[j-1] = temp; - } + for (i = 0; i < num_inodes; i++) { + for (j = 1; j < num_inodes; j++) { + if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) + swap(i_tab[j], i_tab[j - 1]); } } } @@ -2805,15 +2972,17 @@ xfs_cross_rename( struct xfs_inode *dp1, struct xfs_name *name1, struct xfs_inode *ip1, + struct xfs_parent_args *ip1_ppargs, struct xfs_inode *dp2, struct xfs_name *name2, struct xfs_inode *ip2, + struct xfs_parent_args *ip2_ppargs, int spaceres) { - int error = 0; - int ip1_flags = 0; - int ip2_flags = 0; - int dp2_flags = 0; + int error = 0; + int ip1_flags = 0; + int ip2_flags = 0; + int dp2_flags = 0; /* Swap inode number for dirent in first parent */ error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres); @@ -2882,6 +3051,21 @@ xfs_cross_rename( } } + /* Schedule parent pointer replacements */ + if (ip1_ppargs) { + error = xfs_parent_replacename(tp, ip1_ppargs, dp1, name1, dp2, + name2, ip1); + if (error) + goto out_trans_abort; + } + + if (ip2_ppargs) { + error = xfs_parent_replacename(tp, ip2_ppargs, dp2, name2, dp1, + name1, ip2); + if (error) + goto out_trans_abort; + } + if (ip1_flags) { xfs_trans_ichgtime(tp, ip1, ip1_flags); xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); @@ -2937,7 +3121,7 @@ xfs_rename_alloc_whiteout( int error; error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE, - &tmpfile); + xfs_has_parent(dp->i_mount), &tmpfile); if (error) return error; @@ -2981,6 +3165,9 @@ xfs_rename( struct xfs_trans *tp; struct xfs_inode *wip = NULL; /* whiteout inode */ struct xfs_inode *inodes[__XFS_SORT_INODES]; + struct xfs_parent_args *src_ppargs = NULL; + struct xfs_parent_args *tgt_ppargs = NULL; + struct xfs_parent_args *wip_ppargs = NULL; int i; int num_inodes = __XFS_SORT_INODES; bool new_parent = (src_dp != target_dp); @@ -3012,9 +3199,26 @@ xfs_rename( xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, inodes, &num_inodes); + error = xfs_parent_start(mp, &src_ppargs); + if (error) + goto out_release_wip; + + if (wip) { + error = xfs_parent_start(mp, &wip_ppargs); + if (error) + goto out_src_ppargs; + } + + if (target_ip) { + error = xfs_parent_start(mp, &tgt_ppargs); + if (error) + goto out_wip_ppargs; + } + retry: nospace_error = 0; - spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); + spaceres = xfs_rename_space_res(mp, src_name->len, target_ip != NULL, + target_name->len, wip != NULL); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp); if (error == -ENOSPC) { nospace_error = error; @@ -3023,14 +3227,26 @@ retry: &tp); } if (error) - goto out_release_wip; + goto out_tgt_ppargs; + + /* + * We don't allow reservationless renaming when parent pointers are + * enabled because we can't back out if the xattrs must grow. + */ + if (src_ppargs && nospace_error) { + error = nospace_error; + xfs_trans_cancel(tp); + goto out_tgt_ppargs; + } /* * Attach the dquots to the inodes */ error = xfs_qm_vop_rename_dqattach(inodes); - if (error) - goto out_trans_cancel; + if (error) { + xfs_trans_cancel(tp); + goto out_tgt_ppargs; + } /* * Lock all the participating inodes. Depending upon whether @@ -3041,18 +3257,16 @@ retry: xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); /* - * Join all the inodes to the transaction. From this point on, - * we can rely on either trans_commit or trans_cancel to unlock - * them. 
+ * Join all the inodes to the transaction. */ - xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_dp, 0); if (new_parent) - xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_dp, 0); + xfs_trans_ijoin(tp, src_ip, 0); if (target_ip) - xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_ip, 0); if (wip) - xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, wip, 0); /* * If we are using project inheritance, we only allow renames @@ -3066,10 +3280,13 @@ retry: } /* RENAME_EXCHANGE is unique from here on. */ - if (flags & RENAME_EXCHANGE) - return xfs_cross_rename(tp, src_dp, src_name, src_ip, - target_dp, target_name, target_ip, - spaceres); + if (flags & RENAME_EXCHANGE) { + error = xfs_cross_rename(tp, src_dp, src_name, src_ip, + src_ppargs, target_dp, target_name, target_ip, + tgt_ppargs, spaceres); + nospace_error = 0; + goto out_unlock; + } /* * Try to reserve quota to handle an expansion of the target directory. @@ -3083,6 +3300,7 @@ retry: if (error == -EDQUOT || error == -ENOSPC) { if (!retried) { xfs_trans_cancel(tp); + xfs_iunlock_rename(inodes, num_inodes); xfs_blockgc_free_quota(target_dp, 0); retried = true; goto retry; @@ -3097,6 +3315,15 @@ retry: } /* + * We don't allow quotaless renaming when parent pointers are enabled + * because we can't back out if the xattrs must grow. + */ + if (src_ppargs && nospace_error) { + error = nospace_error; + goto out_trans_cancel; + } + + /* * Check for expected errors before we dirty the transaction * so we can return an error without a transaction abort. */ @@ -3142,7 +3369,7 @@ retry: pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inodes[i]->i_ino)); - error = xfs_read_agi(pag, tp, &bp); + error = xfs_read_agi(pag, tp, 0, &bp); xfs_perag_put(pag); if (error) goto out_trans_cancel; @@ -3288,6 +3515,28 @@ retry: if (error) goto out_trans_cancel; + /* Schedule parent pointer updates. 
*/ + if (wip_ppargs) { + error = xfs_parent_addname(tp, wip_ppargs, src_dp, src_name, + wip); + if (error) + goto out_trans_cancel; + } + + if (src_ppargs) { + error = xfs_parent_replacename(tp, src_ppargs, src_dp, + src_name, target_dp, target_name, src_ip); + if (error) + goto out_trans_cancel; + } + + if (tgt_ppargs) { + error = xfs_parent_removename(tp, tgt_ppargs, target_dp, + target_name, target_ip); + if (error) + goto out_trans_cancel; + } + xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); if (new_parent) @@ -3309,12 +3558,19 @@ retry: xfs_dir_update_hook(src_dp, wip, 1, src_name); error = xfs_finish_rename(tp); - if (wip) - xfs_irele(wip); - return error; + nospace_error = 0; + goto out_unlock; out_trans_cancel: xfs_trans_cancel(tp); +out_unlock: + xfs_iunlock_rename(inodes, num_inodes); +out_tgt_ppargs: + xfs_parent_finish(mp, tgt_ppargs); +out_wip_ppargs: + xfs_parent_finish(mp, wip_ppargs); +out_src_ppargs: + xfs_parent_finish(mp, src_ppargs); out_release_wip: if (wip) xfs_irele(wip); @@ -3814,7 +4070,7 @@ xfs_inode_reload_unlinked_bucket( /* Grab the first inode in the list */ pag = xfs_perag_get(mp, agno); - error = xfs_ialloc_read_agi(pag, tp, &agibp); + error = xfs_ialloc_read_agi(pag, tp, 0, &agibp); xfs_perag_put(pag); if (error) return error; @@ -3946,3 +4202,77 @@ xfs_inode_count_blocks( xfs_bmap_count_leaves(ifp, rblocks); *dblocks = ip->i_nblocks - *rblocks; } + +static void +xfs_wait_dax_page( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + schedule(); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); +} + +int +xfs_break_dax_layouts( + struct inode *inode, + bool *retry) +{ + struct page *page; + + xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL); + + page = dax_layout_busy_page(inode->i_mapping); + if (!page) + return 0; + + *retry = true; + return ___wait_var_event(&page->_refcount, + atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, + 0, 0, xfs_wait_dax_page(inode)); +} + +int +xfs_break_layouts( + struct inode *inode, + uint *iolock, + enum layout_break_reason reason) +{ + bool retry; + int error; + + xfs_assert_ilocked(XFS_I(inode), XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL); + + do { + retry = false; + switch (reason) { + case BREAK_UNMAP: + error = xfs_break_dax_layouts(inode, &retry); + if (error || retry) + break; + fallthrough; + case BREAK_WRITE: + error = xfs_break_leased_layouts(inode, iolock, &retry); + break; + default: + WARN_ON_ONCE(1); + error = -EINVAL; + } + } while (error == 0 && retry); + + return error; +} + +/* Returns the size of fundamental allocation unit for a file, in bytes. 
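[Aside: xfs_inode_alloc_unitsize() below feeds the simplified xfs_is_falloc_aligned() earlier in this diff: with the allocation unit expressed in bytes, the alignment test only needs a mask fast path for power-of-two units and a division fallback for odd realtime extent sizes. A standalone sketch of that check:]

#include <stdbool.h>
#include <stdint.h>

/* Works for any unit size: realtime extent sizes need not be powers
 * of two, so fall back to division when the mask trick is invalid. */
static bool aligned_64(uint64_t x, uint64_t unit)
{
	return x % unit == 0;
}

static bool range_aligned(uint64_t pos, uint64_t len, uint64_t unit)
{
	if ((unit & (unit - 1)) == 0)		/* power of two: mask test */
		return ((pos | len) & (unit - 1)) == 0;
	return aligned_64(pos, unit) && aligned_64(len, unit);
}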
*/ +unsigned int +xfs_inode_alloc_unitsize( + struct xfs_inode *ip) +{ + unsigned int blocks = 1; + + if (XFS_IS_REALTIME_INODE(ip)) + blocks = ip->i_mount->m_sb.sb_rextsize; + + return XFS_FSB_TO_B(ip->i_mount, blocks); +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index ab46ffb3ac19..292b90b5f2ac 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -207,13 +207,13 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size) * i_flags helper functions */ static inline void -__xfs_iflags_set(xfs_inode_t *ip, unsigned short flags) +__xfs_iflags_set(xfs_inode_t *ip, unsigned long flags) { ip->i_flags |= flags; } static inline void -xfs_iflags_set(xfs_inode_t *ip, unsigned short flags) +xfs_iflags_set(xfs_inode_t *ip, unsigned long flags) { spin_lock(&ip->i_flags_lock); __xfs_iflags_set(ip, flags); @@ -221,7 +221,7 @@ xfs_iflags_set(xfs_inode_t *ip, unsigned short flags) } static inline void -xfs_iflags_clear(xfs_inode_t *ip, unsigned short flags) +xfs_iflags_clear(xfs_inode_t *ip, unsigned long flags) { spin_lock(&ip->i_flags_lock); ip->i_flags &= ~flags; @@ -229,13 +229,13 @@ xfs_iflags_clear(xfs_inode_t *ip, unsigned short flags) } static inline int -__xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) +__xfs_iflags_test(xfs_inode_t *ip, unsigned long flags) { return (ip->i_flags & flags); } static inline int -xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) +xfs_iflags_test(xfs_inode_t *ip, unsigned long flags) { int ret; spin_lock(&ip->i_flags_lock); @@ -245,7 +245,7 @@ xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) } static inline int -xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) +xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned long flags) { int ret; @@ -258,7 +258,7 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) } static inline int -xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags) +xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned long flags) { int ret; @@ -312,6 +312,15 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) } /* + * Decide if this file is a realtime file whose data allocation unit is larger + * than a single filesystem block. + */ +static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) +{ + return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1; +} + +/* * Return the buftarg used for data allocations on a given inode. 
*/ #define xfs_inode_buftarg(ip) \ @@ -513,7 +522,7 @@ int xfs_create(struct mnt_idmap *idmap, umode_t mode, dev_t rdev, bool need_xattr, struct xfs_inode **ipp); int xfs_create_tmpfile(struct mnt_idmap *idmap, - struct xfs_inode *dp, umode_t mode, + struct xfs_inode *dp, umode_t mode, bool init_xattrs, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); @@ -565,16 +574,10 @@ xfs_itruncate_extents( return xfs_itruncate_extents_flags(tpp, ip, whichfork, new_size, 0); } -/* from xfs_file.c */ int xfs_break_dax_layouts(struct inode *inode, bool *retry); int xfs_break_layouts(struct inode *inode, uint *iolock, enum layout_break_reason reason); -/* from xfs_iops.c */ -extern void xfs_setup_inode(struct xfs_inode *ip); -extern void xfs_setup_iops(struct xfs_inode *ip); -extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); - static inline void xfs_update_stable_writes(struct xfs_inode *ip) { if (bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev)) @@ -613,11 +616,20 @@ extern struct kmem_cache *xfs_inode_cache; bool xfs_inode_needs_inactive(struct xfs_inode *ip); +int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip); +int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_inode *ip); +struct xfs_inode *xfs_iunlink_lookup(struct xfs_perag *pag, xfs_agino_t agino); + void xfs_end_io(struct work_struct *work); int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2); +int xfs_droplink(struct xfs_trans *tp, struct xfs_inode *ip); +void xfs_bumplink(struct xfs_trans *tp, struct xfs_inode *ip); +void xfs_lock_inodes(struct xfs_inode **ips, int inodes, uint lock_mode); +void xfs_sort_inodes(struct xfs_inode **i_tab, unsigned int num_inodes); static inline bool xfs_inode_unlinked_incomplete( @@ -631,6 +643,7 @@ int xfs_inode_reload_unlinked(struct xfs_inode *ip); bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork); void xfs_inode_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, xfs_filblks_t *dblocks, xfs_filblks_t *rblocks); +unsigned int xfs_inode_alloc_unitsize(struct xfs_inode *ip); struct xfs_dir_update_params { const struct xfs_inode *dp; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d0e2cec6210d..f0117188f302 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -23,11 +23,9 @@ #include "xfs_fsops.h" #include "xfs_discard.h" #include "xfs_quota.h" -#include "xfs_export.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_trans.h" -#include "xfs_acl.h" #include "xfs_btree.h" #include <linux/fsmap.h> #include "xfs_fsmap.h" @@ -39,596 +37,13 @@ #include "xfs_ioctl.h" #include "xfs_xattr.h" #include "xfs_rtbitmap.h" +#include "xfs_file.h" +#include "xfs_exchrange.h" +#include "xfs_handle.h" #include <linux/mount.h> -#include <linux/namei.h> #include <linux/fileattr.h> -/* - * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to - * a file or fs handle. 
- * - * XFS_IOC_PATH_TO_FSHANDLE - * returns fs handle for a mount point or path within that mount point - * XFS_IOC_FD_TO_HANDLE - * returns full handle for a FD opened in user space - * XFS_IOC_PATH_TO_HANDLE - * returns full handle for a path - */ -int -xfs_find_handle( - unsigned int cmd, - xfs_fsop_handlereq_t *hreq) -{ - int hsize; - xfs_handle_t handle; - struct inode *inode; - struct fd f = {NULL}; - struct path path; - int error; - struct xfs_inode *ip; - - if (cmd == XFS_IOC_FD_TO_HANDLE) { - f = fdget(hreq->fd); - if (!f.file) - return -EBADF; - inode = file_inode(f.file); - } else { - error = user_path_at(AT_FDCWD, hreq->path, 0, &path); - if (error) - return error; - inode = d_inode(path.dentry); - } - ip = XFS_I(inode); - - /* - * We can only generate handles for inodes residing on a XFS filesystem, - * and only for regular files, directories or symbolic links. - */ - error = -EINVAL; - if (inode->i_sb->s_magic != XFS_SB_MAGIC) - goto out_put; - - error = -EBADF; - if (!S_ISREG(inode->i_mode) && - !S_ISDIR(inode->i_mode) && - !S_ISLNK(inode->i_mode)) - goto out_put; - - - memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); - - if (cmd == XFS_IOC_PATH_TO_FSHANDLE) { - /* - * This handle only contains an fsid, zero the rest. - */ - memset(&handle.ha_fid, 0, sizeof(handle.ha_fid)); - hsize = sizeof(xfs_fsid_t); - } else { - handle.ha_fid.fid_len = sizeof(xfs_fid_t) - - sizeof(handle.ha_fid.fid_len); - handle.ha_fid.fid_pad = 0; - handle.ha_fid.fid_gen = inode->i_generation; - handle.ha_fid.fid_ino = ip->i_ino; - hsize = sizeof(xfs_handle_t); - } - - error = -EFAULT; - if (copy_to_user(hreq->ohandle, &handle, hsize) || - copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) - goto out_put; - - error = 0; - - out_put: - if (cmd == XFS_IOC_FD_TO_HANDLE) - fdput(f); - else - path_put(&path); - return error; -} - -/* - * No need to do permission checks on the various pathname components - * as the handle operations are privileged. - */ -STATIC int -xfs_handle_acceptable( - void *context, - struct dentry *dentry) -{ - return 1; -} - -/* - * Convert userspace handle data into a dentry. - */ -struct dentry * -xfs_handle_to_dentry( - struct file *parfilp, - void __user *uhandle, - u32 hlen) -{ - xfs_handle_t handle; - struct xfs_fid64 fid; - - /* - * Only allow handle opens under a directory. 
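The handle produced above is an opaque { ha_fsid, ha_fid } pair. A hedged userspace sketch of driving XFS_IOC_PATH_TO_FSHANDLE; the structure field and ioctl names come from the ABI quoted in this hunk, while the header path and helper name are assumptions:

#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>	/* assumed xfsprogs header carrying the ioctl ABI */

/*
 * Fetch the filesystem half of a handle; fd is any open file on the
 * target XFS filesystem.  Sketch only.
 */
static int
example_get_fshandle(int fd, const char *path, void *hbuf, __u32 *hlen)
{
	xfs_fsop_handlereq_t	hreq;

	memset(&hreq, 0, sizeof(hreq));
	hreq.path = (void *)path;	/* resolved relative to AT_FDCWD */
	hreq.ohandle = hbuf;		/* receives the handle bytes */
	hreq.ohandlen = hlen;		/* receives the handle size */
	return ioctl(fd, XFS_IOC_PATH_TO_FSHANDLE, &hreq);
}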
- */ - if (!S_ISDIR(file_inode(parfilp)->i_mode)) - return ERR_PTR(-ENOTDIR); - - if (hlen != sizeof(xfs_handle_t)) - return ERR_PTR(-EINVAL); - if (copy_from_user(&handle, uhandle, hlen)) - return ERR_PTR(-EFAULT); - if (handle.ha_fid.fid_len != - sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len)) - return ERR_PTR(-EINVAL); - - memset(&fid, 0, sizeof(struct fid)); - fid.ino = handle.ha_fid.fid_ino; - fid.gen = handle.ha_fid.fid_gen; - - return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3, - FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG, - xfs_handle_acceptable, NULL); -} - -STATIC struct dentry * -xfs_handlereq_to_dentry( - struct file *parfilp, - xfs_fsop_handlereq_t *hreq) -{ - return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen); -} - -int -xfs_open_by_handle( - struct file *parfilp, - xfs_fsop_handlereq_t *hreq) -{ - const struct cred *cred = current_cred(); - int error; - int fd; - int permflag; - struct file *filp; - struct inode *inode; - struct dentry *dentry; - fmode_t fmode; - struct path path; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - dentry = xfs_handlereq_to_dentry(parfilp, hreq); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - inode = d_inode(dentry); - - /* Restrict xfs_open_by_handle to directories & regular files. */ - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { - error = -EPERM; - goto out_dput; - } - -#if BITS_PER_LONG != 32 - hreq->oflags |= O_LARGEFILE; -#endif - - permflag = hreq->oflags; - fmode = OPEN_FMODE(permflag); - if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && - (fmode & FMODE_WRITE) && IS_APPEND(inode)) { - error = -EPERM; - goto out_dput; - } - - if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) { - error = -EPERM; - goto out_dput; - } - - /* Can't write directories. */ - if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) { - error = -EISDIR; - goto out_dput; - } - - fd = get_unused_fd_flags(0); - if (fd < 0) { - error = fd; - goto out_dput; - } - - path.mnt = parfilp->f_path.mnt; - path.dentry = dentry; - filp = dentry_open(&path, hreq->oflags, cred); - dput(dentry); - if (IS_ERR(filp)) { - put_unused_fd(fd); - return PTR_ERR(filp); - } - - if (S_ISREG(inode->i_mode)) { - filp->f_flags |= O_NOATIME; - filp->f_mode |= FMODE_NOCMTIME; - } - - fd_install(fd, filp); - return fd; - - out_dput: - dput(dentry); - return error; -} - -int -xfs_readlink_by_handle( - struct file *parfilp, - xfs_fsop_handlereq_t *hreq) -{ - struct dentry *dentry; - __u32 olen; - int error; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - dentry = xfs_handlereq_to_dentry(parfilp, hreq); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - - /* Restrict this handle operation to symlinks only. */ - if (!d_is_symlink(dentry)) { - error = -EINVAL; - goto out_dput; - } - - if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) { - error = -EFAULT; - goto out_dput; - } - - error = vfs_readlink(dentry, hreq->ohandle, olen); - - out_dput: - dput(dentry); - return error; -} - -/* - * Format an attribute and copy it out to the user's buffer. - * Take care to check values and protect against them changing later, - * we may be reading them directly out of a user buffer. 
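A matching userspace sketch for the open side; per the checks above, the ioctl must be issued on an open directory of the same filesystem and the caller needs CAP_SYS_ADMIN. Helper name and header path are assumptions:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>	/* assumed xfsprogs header carrying the ioctl ABI */

/* Reopen a file from a full handle; returns a new fd on success. */
static int
example_open_by_handle(int dir_fd, void *handle, __u32 hlen)
{
	xfs_fsop_handlereq_t	hreq;

	memset(&hreq, 0, sizeof(hreq));
	hreq.ihandle = handle;	/* e.g. from XFS_IOC_FD_TO_HANDLE */
	hreq.ihandlen = hlen;
	hreq.oflags = O_RDONLY;
	return ioctl(dir_fd, XFS_IOC_OPEN_BY_HANDLE, &hreq);
}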
- */ -static void -xfs_ioc_attr_put_listent( - struct xfs_attr_list_context *context, - int flags, - unsigned char *name, - int namelen, - int valuelen) -{ - struct xfs_attrlist *alist = context->buffer; - struct xfs_attrlist_ent *aep; - int arraytop; - - ASSERT(!context->seen_enough); - ASSERT(context->count >= 0); - ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); - ASSERT(context->firstu >= sizeof(*alist)); - ASSERT(context->firstu <= context->bufsize); - - /* - * Only list entries in the right namespace. - */ - if (context->attr_filter != (flags & XFS_ATTR_NSP_ONDISK_MASK)) - return; - - arraytop = sizeof(*alist) + - context->count * sizeof(alist->al_offset[0]); - - /* decrement by the actual bytes used by the attr */ - context->firstu -= round_up(offsetof(struct xfs_attrlist_ent, a_name) + - namelen + 1, sizeof(uint32_t)); - if (context->firstu < arraytop) { - trace_xfs_attr_list_full(context); - alist->al_more = 1; - context->seen_enough = 1; - return; - } - - aep = context->buffer + context->firstu; - aep->a_valuelen = valuelen; - memcpy(aep->a_name, name, namelen); - aep->a_name[namelen] = 0; - alist->al_offset[context->count++] = context->firstu; - alist->al_count = context->count; - trace_xfs_attr_list_add(context); -} - -static unsigned int -xfs_attr_filter( - u32 ioc_flags) -{ - if (ioc_flags & XFS_IOC_ATTR_ROOT) - return XFS_ATTR_ROOT; - if (ioc_flags & XFS_IOC_ATTR_SECURE) - return XFS_ATTR_SECURE; - return 0; -} - -static unsigned int -xfs_attr_flags( - u32 ioc_flags) -{ - if (ioc_flags & XFS_IOC_ATTR_CREATE) - return XATTR_CREATE; - if (ioc_flags & XFS_IOC_ATTR_REPLACE) - return XATTR_REPLACE; - return 0; -} - -int -xfs_ioc_attr_list( - struct xfs_inode *dp, - void __user *ubuf, - size_t bufsize, - int flags, - struct xfs_attrlist_cursor __user *ucursor) -{ - struct xfs_attr_list_context context = { }; - struct xfs_attrlist *alist; - void *buffer; - int error; - - if (bufsize < sizeof(struct xfs_attrlist) || - bufsize > XFS_XATTR_LIST_MAX) - return -EINVAL; - - /* - * Reject flags, only allow namespaces. - */ - if (flags & ~(XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE)) - return -EINVAL; - if (flags == (XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE)) - return -EINVAL; - - /* - * Validate the cursor. - */ - if (copy_from_user(&context.cursor, ucursor, sizeof(context.cursor))) - return -EFAULT; - if (context.cursor.pad1 || context.cursor.pad2) - return -EINVAL; - if (!context.cursor.initted && - (context.cursor.hashval || context.cursor.blkno || - context.cursor.offset)) - return -EINVAL; - - buffer = kvzalloc(bufsize, GFP_KERNEL); - if (!buffer) - return -ENOMEM; - - /* - * Initialize the output buffer. 
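xfs_ioc_attr_put_listent() above packs the fixed header and the al_offset[] array from the front of the buffer while the variable-size entries grow downward from the end (firstu). A userspace-style sketch of walking the result; the walker itself is hypothetical:

#include <stdio.h>

static void
example_walk_attrlist(struct xfs_attrlist *alist)
{
	int	i;

	for (i = 0; i < alist->al_count; i++) {
		struct xfs_attrlist_ent	*aep =
			(struct xfs_attrlist_ent *)
				((char *)alist + alist->al_offset[i]);

		printf("%s (%u value bytes)\n", aep->a_name, aep->a_valuelen);
	}
	/* al_more set means: repeat the call with the updated cursor */
}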
- */ - context.dp = dp; - context.resynch = 1; - context.attr_filter = xfs_attr_filter(flags); - context.buffer = buffer; - context.bufsize = round_down(bufsize, sizeof(uint32_t)); - context.firstu = context.bufsize; - context.put_listent = xfs_ioc_attr_put_listent; - - alist = context.buffer; - alist->al_count = 0; - alist->al_more = 0; - alist->al_offset[0] = context.bufsize; - - error = xfs_attr_list(&context); - if (error) - goto out_free; - - if (copy_to_user(ubuf, buffer, bufsize) || - copy_to_user(ucursor, &context.cursor, sizeof(context.cursor))) - error = -EFAULT; -out_free: - kvfree(buffer); - return error; -} - -STATIC int -xfs_attrlist_by_handle( - struct file *parfilp, - struct xfs_fsop_attrlist_handlereq __user *p) -{ - struct xfs_fsop_attrlist_handlereq al_hreq; - struct dentry *dentry; - int error = -ENOMEM; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (copy_from_user(&al_hreq, p, sizeof(al_hreq))) - return -EFAULT; - - dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - - error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), al_hreq.buffer, - al_hreq.buflen, al_hreq.flags, &p->pos); - dput(dentry); - return error; -} - -static int -xfs_attrmulti_attr_get( - struct inode *inode, - unsigned char *name, - unsigned char __user *ubuf, - uint32_t *len, - uint32_t flags) -{ - struct xfs_da_args args = { - .dp = XFS_I(inode), - .attr_filter = xfs_attr_filter(flags), - .attr_flags = xfs_attr_flags(flags), - .name = name, - .namelen = strlen(name), - .valuelen = *len, - }; - int error; - - if (*len > XFS_XATTR_SIZE_MAX) - return -EINVAL; - - error = xfs_attr_get(&args); - if (error) - goto out_kfree; - - *len = args.valuelen; - if (copy_to_user(ubuf, args.value, args.valuelen)) - error = -EFAULT; - -out_kfree: - kvfree(args.value); - return error; -} - -static int -xfs_attrmulti_attr_set( - struct inode *inode, - unsigned char *name, - const unsigned char __user *ubuf, - uint32_t len, - uint32_t flags) -{ - struct xfs_da_args args = { - .dp = XFS_I(inode), - .attr_filter = xfs_attr_filter(flags), - .attr_flags = xfs_attr_flags(flags), - .name = name, - .namelen = strlen(name), - }; - int error; - - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - - if (ubuf) { - if (len > XFS_XATTR_SIZE_MAX) - return -EINVAL; - args.value = memdup_user(ubuf, len); - if (IS_ERR(args.value)) - return PTR_ERR(args.value); - args.valuelen = len; - } - - error = xfs_attr_change(&args); - if (!error && (flags & XFS_IOC_ATTR_ROOT)) - xfs_forget_acl(inode, name); - kfree(args.value); - return error; -} - -int -xfs_ioc_attrmulti_one( - struct file *parfilp, - struct inode *inode, - uint32_t opcode, - void __user *uname, - void __user *value, - uint32_t *len, - uint32_t flags) -{ - unsigned char *name; - int error; - - if ((flags & XFS_IOC_ATTR_ROOT) && (flags & XFS_IOC_ATTR_SECURE)) - return -EINVAL; - - name = strndup_user(uname, MAXNAMELEN); - if (IS_ERR(name)) - return PTR_ERR(name); - - switch (opcode) { - case ATTR_OP_GET: - error = xfs_attrmulti_attr_get(inode, name, value, len, flags); - break; - case ATTR_OP_REMOVE: - value = NULL; - *len = 0; - fallthrough; - case ATTR_OP_SET: - error = mnt_want_write_file(parfilp); - if (error) - break; - error = xfs_attrmulti_attr_set(inode, name, value, *len, flags); - mnt_drop_write_file(parfilp); - break; - default: - error = -EINVAL; - break; - } - - kfree(name); - return error; -} - -STATIC int -xfs_attrmulti_by_handle( - struct file *parfilp, - void __user *arg) -{ - int error; - 
xfs_attr_multiop_t *ops; - xfs_fsop_attrmulti_handlereq_t am_hreq; - struct dentry *dentry; - unsigned int i, size; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) - return -EFAULT; - - /* overflow check */ - if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t)) - return -E2BIG; - - dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - - error = -E2BIG; - size = am_hreq.opcount * sizeof(xfs_attr_multiop_t); - if (!size || size > 16 * PAGE_SIZE) - goto out_dput; - - ops = memdup_user(am_hreq.ops, size); - if (IS_ERR(ops)) { - error = PTR_ERR(ops); - goto out_dput; - } - - error = 0; - for (i = 0; i < am_hreq.opcount; i++) { - ops[i].am_error = xfs_ioc_attrmulti_one(parfilp, - d_inode(dentry), ops[i].am_opcode, - ops[i].am_attrname, ops[i].am_attrvalue, - &ops[i].am_length, ops[i].am_flags); - } - - if (copy_to_user(am_hreq.ops, ops, size)) - error = -EFAULT; - - kfree(ops); - out_dput: - dput(dentry); - return error; -} - /* Return 0 on success or positive error */ int xfs_fsbulkstat_one_fmt( @@ -1640,30 +1055,6 @@ out_free: return error; } -STATIC int -xfs_ioc_scrub_metadata( - struct file *file, - void __user *arg) -{ - struct xfs_scrub_metadata scrub; - int error; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&scrub, arg, sizeof(scrub))) - return -EFAULT; - - error = xfs_scrub_metadata(file, &scrub); - if (error) - return error; - - if (copy_to_user(arg, &scrub, sizeof(scrub))) - return -EFAULT; - - return 0; -} - int xfs_ioc_swapext( xfs_swapext_t *sxp) @@ -2010,7 +1401,10 @@ xfs_file_ioctl( case XFS_IOC_FSGETXATTRA: return xfs_ioc_fsgetxattra(ip, arg); - + case XFS_IOC_GETPARENTS: + return xfs_ioc_getparents(filp, arg); + case XFS_IOC_GETPARENTS_BY_HANDLE: + return xfs_ioc_getparents_by_handle(filp, arg); case XFS_IOC_GETBMAP: case XFS_IOC_GETBMAPA: case XFS_IOC_GETBMAPX: @@ -2019,6 +1413,8 @@ xfs_file_ioctl( case FS_IOC_GETFSMAP: return xfs_ioc_getfsmap(ip, arg); + case XFS_IOC_SCRUBV_METADATA: + return xfs_ioc_scrubv_metadata(filp, arg); case XFS_IOC_SCRUB_METADATA: return xfs_ioc_scrub_metadata(filp, arg); @@ -2169,6 +1565,9 @@ xfs_file_ioctl( return error; } + case XFS_IOC_EXCHANGE_RANGE: + return xfs_ioc_exchange_range(filp, arg); + default: return -ENOTTY; } diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 38be600b5e1e..12124946f347 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -15,34 +15,6 @@ xfs_ioc_swapext( xfs_swapext_t *sxp); extern int -xfs_find_handle( - unsigned int cmd, - xfs_fsop_handlereq_t *hreq); - -extern int -xfs_open_by_handle( - struct file *parfilp, - xfs_fsop_handlereq_t *hreq); - -extern int -xfs_readlink_by_handle( - struct file *parfilp, - xfs_fsop_handlereq_t *hreq); - -int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode, - uint32_t opcode, void __user *uname, void __user *value, - uint32_t *len, uint32_t flags); -int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, - size_t bufsize, int flags, - struct xfs_attrlist_cursor __user *ucursor); - -extern struct dentry * -xfs_handle_to_dentry( - struct file *parfilp, - void __user *uhandle, - u32 hlen); - -extern int xfs_fileattr_get( struct dentry *dentry, struct fileattr *fa); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index ee35eea1ecce..b64785dc4354 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -24,6 +24,7 @@ #include "xfs_ioctl32.h" #include "xfs_trace.h" 
#include "xfs_sb.h" +#include "xfs_handle.h" #define _NATIVE_IOC(cmd, type) \ _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 4087af7f3c9f..378342673925 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -28,6 +28,7 @@ #include "xfs_dquot.h" #include "xfs_reflink.h" #include "xfs_health.h" +#include "xfs_rtbitmap.h" #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) @@ -298,9 +299,7 @@ xfs_iomap_write_direct( if (error) return error; - error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, nr_exts); + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, nr_exts); if (error) goto out_trans_cancel; @@ -321,14 +320,6 @@ xfs_iomap_write_direct( if (error) goto out_unlock; - /* - * Copy any maps to caller's array and return any error. - */ - if (nimaps == 0) { - error = -ENOSPC; - goto out_unlock; - } - if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = xfs_alert_fsblock_zero(ip, imap); @@ -404,6 +395,29 @@ xfs_quota_calc_throttle( } } +static int64_t +xfs_iomap_freesp( + struct percpu_counter *counter, + uint64_t low_space[XFS_LOWSP_MAX], + int *shift) +{ + int64_t freesp; + + freesp = percpu_counter_read_positive(counter); + if (freesp < low_space[XFS_LOWSP_5_PCNT]) { + *shift = 2; + if (freesp < low_space[XFS_LOWSP_4_PCNT]) + (*shift)++; + if (freesp < low_space[XFS_LOWSP_3_PCNT]) + (*shift)++; + if (freesp < low_space[XFS_LOWSP_2_PCNT]) + (*shift)++; + if (freesp < low_space[XFS_LOWSP_1_PCNT]) + (*shift)++; + } + return freesp; +} + /* * If we don't have a user specified preallocation size, dynamically increase * the preallocation size as the size of the file grows. Cap the maximum size @@ -486,18 +500,13 @@ xfs_iomap_prealloc_size( alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN), alloc_blocks); - freesp = percpu_counter_read_positive(&mp->m_fdblocks); - if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { - shift = 2; - if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) - shift++; - } + if (unlikely(XFS_IS_REALTIME_INODE(ip))) + freesp = xfs_rtx_to_rtb(mp, + xfs_iomap_freesp(&mp->m_frextents, + mp->m_low_rtexts, &shift)); + else + freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space, + &shift); /* * Check each quota to cap the prealloc size, provide a shift value to @@ -606,11 +615,8 @@ xfs_iomap_write_unwritten( if (error) return error; - error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_WRITE_UNWRITTEN_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, - XFS_IEXT_WRITE_UNWRITTEN_CNT); if (error) goto error_on_bmapi_transaction; @@ -982,8 +988,6 @@ xfs_buffered_write_iomap_begin( return xfs_direct_write_iomap_begin(inode, offset, count, flags, iomap, srcmap); - ASSERT(!XFS_IS_REALTIME_INODE(ip)); - error = xfs_qm_dqattach(ip); if (error) return error; @@ -1023,6 +1027,24 @@ xfs_buffered_write_iomap_begin( } /* + * For zeroing, trim a delalloc extent that extends beyond the EOF + * block. If it starts beyond the EOF block, convert it to an + * unwritten extent. 
+ */ + if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb && + isnullstartblock(imap.br_startblock)) { + xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); + + if (offset_fsb >= eof_fsb) + goto convert_delay; + if (end_fsb > eof_fsb) { + end_fsb = eof_fsb; + xfs_trim_extent(&imap, offset_fsb, + end_fsb - offset_fsb); + } + } + + /* * Search the COW fork extent list even if we did not find a data fork * extent. This serves two purposes: first this implements the * speculative preallocation using cowextsize, so that we also unshare @@ -1158,15 +1180,26 @@ retry: * them out if the write happens to fail. */ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); found_imap: seq = xfs_iomap_inode_sequence(ip, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); +convert_delay: + xfs_iunlock(ip, lockmode); + truncate_pagecache(inode, offset); + error = xfs_bmapi_convert_delalloc(ip, XFS_DATA_FORK, offset, + iomap, NULL); + if (error) + return error; + + trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &imap); + return 0; + found_cow: seq = xfs_iomap_inode_sequence(ip, 0); if (imap.br_startoff <= offset_fsb) { @@ -1174,17 +1207,17 @@ found_cow: if (error) goto out_unlock; seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); } xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return error; } @@ -1194,8 +1227,8 @@ xfs_buffered_write_delalloc_punch( loff_t offset, loff_t length) { - return xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, - offset + length); + xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, offset + length); + return 0; } static int diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 66f8c47642e8..ff222827e550 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -25,6 +25,7 @@ #include "xfs_error.h" #include "xfs_ioctl.h" #include "xfs_xattr.h" +#include "xfs_file.h" #include <linux/posix_acl.h> #include <linux/security.h> @@ -62,7 +63,7 @@ xfs_initxattrs( .value = xattr->value, .valuelen = xattr->value_len, }; - error = xfs_attr_change(&args); + error = xfs_attr_change(&args, XFS_ATTRUPDATE_UPSERT); if (error < 0) break; } @@ -156,6 +157,8 @@ xfs_create_need_xattr( if (dir->i_sb->s_security) return true; #endif + if (xfs_has_parent(XFS_I(dir)->i_mount)) + return true; return false; } @@ -200,7 +203,18 @@ xfs_generic_create( xfs_create_need_xattr(dir, default_acl, acl), &ip); } else { - error = xfs_create_tmpfile(idmap, XFS_I(dir), mode, &ip); + bool init_xattrs = false; + + /* + * If this temporary file will be linkable, set up the file + * with an attr fork to receive a parent pointer. 
+ */ + if (!(tmpfile->f_flags & O_EXCL) && + xfs_has_parent(XFS_I(dir)->i_mount)) + init_xattrs = true; + + error = xfs_create_tmpfile(idmap, XFS_I(dir), mode, + init_xattrs, &ip); } if (unlikely(error)) goto out_free_acl; @@ -364,6 +378,9 @@ xfs_vn_link( if (unlikely(error)) return error; + if (IS_PRIVATE(inode)) + return -EPERM; + error = xfs_link(XFS_I(dir), XFS_I(inode), &name); if (unlikely(error)) return error; @@ -521,7 +538,7 @@ xfs_stat_blksize( * always return the realtime extent size. */ if (XFS_IS_REALTIME_INODE(ip)) - return XFS_FSB_TO_B(mp, xfs_get_extsz_hint(ip)); + return XFS_FSB_TO_B(mp, xfs_get_extsz_hint(ip) ? : 1); /* * Allow large block sizes to be reported to userspace programs if the diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 7f84a0843b24..3c1a2605ffd2 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -8,9 +8,6 @@ struct xfs_inode; -extern const struct file_operations xfs_file_operations; -extern const struct file_operations xfs_dir_file_operations; - extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); int xfs_vn_setattr_size(struct mnt_idmap *idmap, @@ -19,4 +16,8 @@ int xfs_vn_setattr_size(struct mnt_idmap *idmap, int xfs_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr); +extern void xfs_setup_inode(struct xfs_inode *ip); +extern void xfs_setup_iops(struct xfs_inode *ip); +extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); + #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 95fc31b9f87d..c0757ab99495 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -97,6 +97,14 @@ xfs_bulkstat_one_int( vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid = i_gid_into_vfsgid(idmap, inode); + /* If this is a private inode, don't leak its details to userspace. */ + if (IS_PRIVATE(inode)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + xfs_irele(ip); + error = -EINVAL; + goto out_advance; + } + /* xfs_iget returns the following without needing * further change. */ diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 01b55f03a102..730c8d48da28 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -268,7 +268,7 @@ xfs_iwalk_ag_start( /* Set up a fresh cursor and empty the inobt cache. */ iwag->nr_recs = 0; - error = xfs_ialloc_read_agi(pag, tp, agi_bpp); + error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp); if (error) return error; *curpp = xfs_inobt_init_cursor(pag, tp, *agi_bpp); @@ -386,7 +386,7 @@ xfs_iwalk_run_callbacks( } /* ...and recreate the cursor just past where we left off. */ - error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, agi_bpp); + error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, 0, agi_bpp); if (error) return error; *curpp = xfs_inobt_init_cursor(iwag->pag, iwag->tp, *agi_bpp); diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 8f07c9f6157f..ac355328121a 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -198,6 +198,11 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) return x; } +static inline bool isaligned_64(uint64_t x, uint32_t y) +{ + return do_div(x, y) == 0; +} + /* If @b is a power of 2, return log2(b). Else return -1. */ static inline int8_t log2_if_power2(unsigned long b) { diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 5004f23d344e..416c15494983 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1448,7 +1448,7 @@ xfs_log_work_queue( * Clear the log incompat flags if we have the opportunity. 
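An aside on the isaligned_64() helper added to xfs_linux.h above: do_div() divides in place and returns the 32-bit remainder, which avoids pulling a 64-bit modulo out of libgcc on 32-bit builds. A hypothetical user (not in this patch):

static bool
example_rtb_aligned(
	struct xfs_mount	*mp,
	xfs_rtblock_t		rtbno)
{
	/* true if rtbno sits on a realtime extent boundary */
	return isaligned_64(rtbno, mp->m_sb.sb_rextsize);
}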
* * This only happens if we're about to log the second dummy transaction as part - * of covering the log and we can get the log incompat feature usage lock. + * of covering the log. */ static inline void xlog_clear_incompat( @@ -1463,11 +1463,7 @@ xlog_clear_incompat( if (log->l_covered_state != XLOG_STATE_COVER_DONE2) return; - if (!down_write_trylock(&log->l_incompat_users)) - return; - xfs_clear_incompat_log_features(mp); - up_write(&log->l_incompat_users); } /* @@ -1585,8 +1581,6 @@ xlog_alloc_log( } log->l_sectBBsize = 1 << log2_size; - init_rwsem(&log->l_incompat_users); - xlog_get_iclog_buffer_size(mp, log); spin_lock_init(&log->l_icloglock); @@ -3871,23 +3865,3 @@ xfs_log_check_lsn( return valid; } - -/* - * Notify the log that we're about to start using a feature that is protected - * by a log incompat feature flag. This will prevent log covering from - * clearing those flags. - */ -void -xlog_use_incompat_feat( - struct xlog *log) -{ - down_read(&log->l_incompat_users); -} - -/* Notify the log that we've finished using log incompat features. */ -void -xlog_drop_incompat_feat( - struct xlog *log) -{ - up_read(&log->l_incompat_users); -} diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 2728886c2963..d69acf881153 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -159,8 +159,6 @@ bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes); bool xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags); -void xlog_use_incompat_feat(struct xlog *log); -void xlog_drop_incompat_feat(struct xlog *log); int xfs_attr_use_log_assist(struct xfs_mount *mp); #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 73f5b7f628f4..f51cbc6405c1 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -1378,7 +1378,7 @@ out_abort_free_ticket: */ static void xlog_cil_push_background( - struct xlog *log) __releases(cil->xc_ctx_lock) + struct xlog *log) { struct xfs_cil *cil = log->l_cilp; int space_used = atomic_read(&cil->xc_ctx->space_used); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index e30c06ec20e3..40e22ec0fbe6 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -450,9 +450,6 @@ struct xlog { xfs_lsn_t l_recovery_lsn; uint32_t l_iclog_roundoff;/* padding roundoff */ - - /* Users of log incompat features should take a read lock. */ - struct rw_semaphore l_incompat_users; }; /* @@ -623,7 +620,8 @@ xlog_wait( remove_wait_queue(wq, &wait); } -int xlog_wait_on_iclog(struct xlog_in_core *iclog); +int xlog_wait_on_iclog(struct xlog_in_core *iclog) + __releases(iclog->ic_log->l_icloglock); /* * The LSN is valid so long as it is behind the current LSN. If it isn't, this @@ -683,7 +681,7 @@ xlog_valid_lsn( * flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() * will do direct reclaim and compaction in the slow path, both of which are * horrendously expensive. We just want kmalloc to fail fast and fall back to - * vmalloc if it can't get somethign straight away from the free lists or + * vmalloc if it can't get something straight away from the free lists or buddy allocator. Hence we have to open code kvmalloc ourselves here.
* * This assumes that the caller uses memalloc_nofs_save task context here, so diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 13f1d2e91540..4fe627991e86 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1767,6 +1767,37 @@ xlog_recover_iget( return 0; } +/* + * Get an inode so that we can recover a log operation. + * + * Log intent items that target inodes effectively contain a file handle. + * Check that the generation number matches the intent item like we do for + * other file handles. Log intent items defined after this validation weakness + * was identified must use this function. + */ +int +xlog_recover_iget_handle( + struct xfs_mount *mp, + xfs_ino_t ino, + uint32_t gen, + struct xfs_inode **ipp) +{ + struct xfs_inode *ip; + int error; + + error = xlog_recover_iget(mp, ino, &ip); + if (error) + return error; + + if (VFS_I(ip)->i_generation != gen) { + xfs_irele(ip); + return -EFSCORRUPTED; + } + + *ipp = ip; + return 0; +} + /****************************************************************************** * * Log recover routines @@ -1789,6 +1820,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = { &xlog_bud_item_ops, &xlog_attri_item_ops, &xlog_attrd_item_ops, + &xlog_xmi_item_ops, + &xlog_xmd_item_ops, }; static const struct xlog_recover_item_ops * @@ -2656,7 +2689,7 @@ xlog_recover_clear_agi_bucket( if (error) goto out_error; - error = xfs_read_agi(pag, tp, &agibp); + error = xfs_read_agi(pag, tp, 0, &agibp); if (error) goto out_abort; @@ -2772,7 +2805,7 @@ xlog_recover_iunlink_ag( int bucket; int error; - error = xfs_read_agi(pag, NULL, &agibp); + error = xfs_read_agi(pag, NULL, 0, &agibp); if (error) { /* * AGI is b0rked. Don't process it. @@ -2966,7 +2999,7 @@ xlog_do_recovery_pass( int error = 0, h_size, h_len; int error2 = 0; int bblks, split_bblks; - int hblks, split_hblks, wrapped_hblks; + int hblks = 1, split_hblks, wrapped_hblks; int i; struct hlist_head rhash[XLOG_RHASH_SIZE]; LIST_HEAD (buffer_list); @@ -2977,6 +3010,10 @@ xlog_do_recovery_pass( for (i = 0; i < XLOG_RHASH_SIZE; i++) INIT_HLIST_HEAD(&rhash[i]); + hbp = xlog_alloc_buffer(log, hblks); + if (!hbp) + return -ENOMEM; + /* * Read the header of the tail block and get the iclog buffer size from * h_size. Use this to tell how many sectors make up the log header. @@ -2987,10 +3024,6 @@ xlog_do_recovery_pass( * iclog header and extract the header size from it. Get a * new hbp that is the correct size. */ - hbp = xlog_alloc_buffer(log, 1); - if (!hbp) - return -ENOMEM; - error = xlog_bread(log, tail_blk, 1, hbp, &offset); if (error) goto bread_err1; @@ -3022,20 +3055,27 @@ xlog_do_recovery_pass( if (error) goto bread_err1; - hblks = xlog_logrec_hblks(log, rhead); - if (hblks != 1) { - kvfree(hbp); - hbp = xlog_alloc_buffer(log, hblks); + /* + * This open codes xlog_logrec_hblks so that we can reuse the + * fixed up h_size value calculated above. Without that we'd + * still allocate the buffer based on the incorrect on-disk + * size. 
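A hypothetical recovery-side caller of xlog_recover_iget_handle() above: an intent item that logged both the inode number and the generation can reject a handle that now points at a reused inode. Sketch only; the work function is made up:

static int
example_recover_work(
	struct xfs_mount	*mp,
	xfs_ino_t		ino,
	uint32_t		gen)
{
	struct xfs_inode	*ip;
	int			error;

	error = xlog_recover_iget_handle(mp, ino, gen, &ip);
	if (error)
		return error;	/* -EFSCORRUPTED on generation mismatch */

	/* ... replay the logged operation against ip ... */

	xfs_irele(ip);
	return 0;
}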
+ */ + if (h_size > XLOG_HEADER_CYCLE_SIZE && + (rhead->h_version & cpu_to_be32(XLOG_VERSION_2))) { + hblks = DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE); + if (hblks > 1) { + kvfree(hbp); + hbp = xlog_alloc_buffer(log, hblks); + if (!hbp) + return -ENOMEM; + } } } else { ASSERT(log->l_sectBBsize == 1); - hblks = 1; - hbp = xlog_alloc_buffer(log, 1); h_size = XLOG_BIG_RECORD_BSIZE; } - if (!hbp) - return -ENOMEM; dbp = xlog_alloc_buffer(log, BTOBB(h_size)); if (!dbp) { kvfree(hbp); @@ -3496,21 +3536,6 @@ xlog_recover_finish( */ xfs_log_force(log->l_mp, XFS_LOG_SYNC); - /* - * Now that we've recovered the log and all the intents, we can clear - * the log incompat feature bits in the superblock because there's no - * longer anything to protect. We rely on the AIL push to write out the - * updated superblock after everything else. - */ - if (xfs_clear_incompat_log_features(log->l_mp)) { - error = xfs_sync_sb(log->l_mp, false); - if (error < 0) { - xfs_alert(log->l_mp, - "Failed to clear log incompat features on recovery"); - goto out_error; - } - } - xlog_recover_process_iunlinks(log); /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index df370eb5dc15..09eef1721ef4 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -34,6 +34,7 @@ #include "xfs_health.h" #include "xfs_trace.h" #include "xfs_ag.h" +#include "xfs_rtbitmap.h" #include "scrub/stats.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); @@ -230,6 +231,13 @@ reread: mp->m_features |= xfs_sb_version_to_features(sbp); xfs_reinit_percpu_counters(mp); + /* + * If logged xattrs are enabled after log recovery finishes, then set + * the opstate so that log recovery will work properly. + */ + if (xfs_sb_version_haslogxattrs(&mp->m_sb)) + xfs_set_using_logged_xattrs(mp); + /* no need to be quiet anymore, so reset the buf ops */ bp->b_ops = &xfs_sb_buf_ops; @@ -828,6 +836,15 @@ xfs_mountfs( goto out_inodegc_shrinker; } + /* + * If logged xattrs are still enabled after log recovery finishes, then + * they'll be available until unmount. Otherwise, turn them off. + */ + if (xfs_sb_version_haslogxattrs(&mp->m_sb)) + xfs_set_using_logged_xattrs(mp); + else + xfs_clear_using_logged_xattrs(mp); + /* Enable background inode inactivation workers. */ xfs_inodegc_start(mp); xfs_blockgc_start(mp); @@ -1095,6 +1112,11 @@ xfs_unmountfs( "Freespace may not be correct on next mount."); xfs_unmount_check(mp); + /* + * Indicate that it's ok to clear log incompat bits before cleaning + * the log and writing the unmount record. + */ + xfs_set_done_with_log_incompat(mp); xfs_log_unmount(mp); xfs_da_unmount(mp); xfs_uuid_unmount(mp); @@ -1131,16 +1153,44 @@ xfs_fs_writable( return true; } -/* Adjust m_fdblocks or m_frextents. */ +void +xfs_add_freecounter( + struct xfs_mount *mp, + struct percpu_counter *counter, + uint64_t delta) +{ + bool has_resv_pool = (counter == &mp->m_fdblocks); + uint64_t res_used; + + /* + * If the reserve pool is depleted, put blocks back into it first. + * Most of the time the pool is full. 
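A pure-arithmetic restatement of the refill that xfs_add_freecounter() performs above, useful for checking the math (not from this patch): with resblks = 8192, avail = 8000 and delta = 500, the first 192 blocks top up the reserve pool and only the remaining 308 reach the percpu counter.

static uint64_t
example_refill_reserve(
	uint64_t	resblks,
	uint64_t	*resblks_avail,
	uint64_t	delta)
{
	uint64_t	res_used = resblks - *resblks_avail;

	if (res_used > delta) {
		*resblks_avail += delta;	/* all absorbed by the pool */
		return 0;
	}
	*resblks_avail = resblks;		/* pool is full again */
	return delta - res_used;		/* excess goes to the counter */
}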
+ */ + if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) { + percpu_counter_add(counter, delta); + return; + } + + spin_lock(&mp->m_sb_lock); + res_used = mp->m_resblks - mp->m_resblks_avail; + if (res_used > delta) { + mp->m_resblks_avail += delta; + } else { + delta -= res_used; + mp->m_resblks_avail = mp->m_resblks; + percpu_counter_add(counter, delta); + } + spin_unlock(&mp->m_sb_lock); +} + int -xfs_mod_freecounter( +xfs_dec_freecounter( struct xfs_mount *mp, struct percpu_counter *counter, - int64_t delta, + uint64_t delta, bool rsvd) { int64_t lcounter; - long long res_used; uint64_t set_aside = 0; s32 batch; bool has_resv_pool; @@ -1150,31 +1200,6 @@ xfs_mod_freecounter( if (rsvd) ASSERT(has_resv_pool); - if (delta > 0) { - /* - * If the reserve pool is depleted, put blocks back into it - * first. Most of the time the pool is full. - */ - if (likely(!has_resv_pool || - mp->m_resblks == mp->m_resblks_avail)) { - percpu_counter_add(counter, delta); - return 0; - } - - spin_lock(&mp->m_sb_lock); - res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); - - if (res_used > delta) { - mp->m_resblks_avail += delta; - } else { - delta -= res_used; - mp->m_resblks_avail = mp->m_resblks; - percpu_counter_add(counter, delta); - } - spin_unlock(&mp->m_sb_lock); - return 0; - } - /* * Taking blocks away, need to be more accurate the closer we * are to zero. @@ -1202,7 +1227,7 @@ xfs_mod_freecounter( */ if (has_resv_pool) set_aside = xfs_fdblocks_unavailable(mp); - percpu_counter_add_batch(counter, delta, batch); + percpu_counter_add_batch(counter, -((int64_t)delta), batch); if (__percpu_counter_compare(counter, set_aside, XFS_FDBLOCKS_BATCH) >= 0) { /* we had space! */ @@ -1214,11 +1239,11 @@ xfs_mod_freecounter( * that took us to ENOSPC. */ spin_lock(&mp->m_sb_lock); - percpu_counter_add(counter, -delta); + percpu_counter_add(counter, delta); if (!has_resv_pool || !rsvd) goto fdblocks_enospc; - lcounter = (long long)mp->m_resblks_avail + delta; + lcounter = (long long)mp->m_resblks_avail - delta; if (lcounter >= 0) { mp->m_resblks_avail = lcounter; spin_unlock(&mp->m_sb_lock); @@ -1364,7 +1389,8 @@ xfs_clear_incompat_log_features( if (!xfs_has_crc(mp) || !xfs_sb_has_incompat_log_feature(&mp->m_sb, XFS_SB_FEAT_INCOMPAT_LOG_ALL) || - xfs_is_shutdown(mp)) + xfs_is_shutdown(mp) || + !xfs_is_done_with_log_incompat(mp)) return false; /* @@ -1399,9 +1425,20 @@ xfs_clear_incompat_log_features( #define XFS_DELALLOC_BATCH (4096) void xfs_mod_delalloc( - struct xfs_mount *mp, - int64_t delta) + struct xfs_inode *ip, + int64_t data_delta, + int64_t ind_delta) { - percpu_counter_add_batch(&mp->m_delalloc_blks, delta, + struct xfs_mount *mp = ip->i_mount; + + if (XFS_IS_REALTIME_INODE(ip)) { + percpu_counter_add_batch(&mp->m_delalloc_rtextents, + xfs_rtb_to_rtx(mp, data_delta), + XFS_DELALLOC_BATCH); + if (!ind_delta) + return; + data_delta = 0; + } + percpu_counter_add_batch(&mp->m_delalloc_blks, data_delta + ind_delta, XFS_DELALLOC_BATCH); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index e880aa48de68..d0567dfbc036 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -195,6 +195,12 @@ typedef struct xfs_mount { * extents or anything related to the rt device. */ struct percpu_counter m_delalloc_blks; + + /* + * RT version of the above. + */ + struct percpu_counter m_delalloc_rtextents; + /* * Global count of allocation btree blocks in use across all AGs. Only * used when perag reservation is enabled. 
Helps prevent block @@ -292,6 +298,7 @@ typedef struct xfs_mount { #define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */ #define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ #define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ +#define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */ /* Mount features */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ @@ -331,19 +338,10 @@ static inline void xfs_add_ ## name (struct xfs_mount *mp) \ __XFS_ADD_FEAT(attr, ATTR) __XFS_HAS_FEAT(nlink, NLINK) __XFS_ADD_FEAT(quota, QUOTA) -__XFS_HAS_FEAT(align, ALIGN) __XFS_HAS_FEAT(dalign, DALIGN) -__XFS_HAS_FEAT(logv2, LOGV2) __XFS_HAS_FEAT(sector, SECTOR) -__XFS_HAS_FEAT(extflg, EXTFLG) __XFS_HAS_FEAT(asciici, ASCIICI) -__XFS_HAS_FEAT(lazysbcount, LAZYSBCOUNT) -__XFS_ADD_FEAT(attr2, ATTR2) __XFS_HAS_FEAT(parent, PARENT) -__XFS_ADD_FEAT(projid32, PROJID32) -__XFS_HAS_FEAT(crc, CRC) -__XFS_HAS_FEAT(v3inodes, V3INODES) -__XFS_HAS_FEAT(pquotino, PQUOTINO) __XFS_HAS_FEAT(ftype, FTYPE) __XFS_HAS_FEAT(finobt, FINOBT) __XFS_HAS_FEAT(rmapbt, RMAPBT) @@ -355,6 +353,38 @@ __XFS_HAS_FEAT(inobtcounts, INOBTCNT) __XFS_HAS_FEAT(bigtime, BIGTIME) __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) __XFS_HAS_FEAT(large_extent_counts, NREXT64) +__XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE) + +/* + * Some features are always on for v5 file systems, allow the compiler to + * eliminate dead code when building without v4 support. + */ +#define __XFS_HAS_V4_FEAT(name, NAME) \ +static inline bool xfs_has_ ## name (struct xfs_mount *mp) \ +{ \ + return !IS_ENABLED(CONFIG_XFS_SUPPORT_V4) || \ + (mp->m_features & XFS_FEAT_ ## NAME); \ +} + +#define __XFS_ADD_V4_FEAT(name, NAME) \ + __XFS_HAS_V4_FEAT(name, NAME); \ +static inline void xfs_add_ ## name (struct xfs_mount *mp) \ +{ \ + if (IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) { \ + mp->m_features |= XFS_FEAT_ ## NAME; \ + xfs_sb_version_add ## name(&mp->m_sb); \ + } \ +} + +__XFS_HAS_V4_FEAT(align, ALIGN) +__XFS_HAS_V4_FEAT(logv2, LOGV2) +__XFS_HAS_V4_FEAT(extflg, EXTFLG) +__XFS_HAS_V4_FEAT(lazysbcount, LAZYSBCOUNT) +__XFS_ADD_V4_FEAT(attr2, ATTR2) +__XFS_ADD_V4_FEAT(projid32, PROJID32) +__XFS_HAS_V4_FEAT(v3inodes, V3INODES) +__XFS_HAS_V4_FEAT(crc, CRC) +__XFS_HAS_V4_FEAT(pquotino, PQUOTINO) /* * Mount features @@ -412,6 +442,10 @@ __XFS_HAS_FEAT(nouuid, NOUUID) #define XFS_OPSTATE_WARNED_LARP 9 /* Mount time quotacheck is running */ #define XFS_OPSTATE_QUOTACHECK_RUNNING 10 +/* Do we want to clear log incompat flags?
*/ +#define XFS_OPSTATE_UNSET_LOG_INCOMPAT 11 +/* Filesystem can use logged extended attributes */ +#define XFS_OPSTATE_USE_LARP 12 #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ @@ -439,6 +473,8 @@ __XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING) #else # define xfs_is_quotacheck_running(mp) (false) #endif +__XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT) +__XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP) static inline bool xfs_should_warn(struct xfs_mount *mp, long nr) @@ -457,7 +493,9 @@ xfs_should_warn(struct xfs_mount *mp, long nr) { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \ { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \ { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \ - { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" } + { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }, \ + { (1UL << XFS_OPSTATE_UNSET_LOG_INCOMPAT), "unset_log_incompat" }, \ + { (1UL << XFS_OPSTATE_USE_LARP), "logged_xattrs" } /* * Max and min values for mount-option defined I/O @@ -534,19 +572,30 @@ xfs_fdblocks_unavailable( return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); } -int xfs_mod_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, - int64_t delta, bool rsvd); +int xfs_dec_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, + uint64_t delta, bool rsvd); +void xfs_add_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, + uint64_t delta); + +static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta, + bool reserved) +{ + return xfs_dec_freecounter(mp, &mp->m_fdblocks, delta, reserved); +} + +static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta) +{ + xfs_add_freecounter(mp, &mp->m_fdblocks, delta); +} -static inline int -xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved) +static inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta) { - return xfs_mod_freecounter(mp, &mp->m_fdblocks, delta, reserved); + return xfs_dec_freecounter(mp, &mp->m_frextents, delta, false); } -static inline int -xfs_mod_frextents(struct xfs_mount *mp, int64_t delta) +static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta) { - return xfs_mod_freecounter(mp, &mp->m_frextents, delta, false); + xfs_add_freecounter(mp, &mp->m_frextents, delta); } extern int xfs_readsb(xfs_mount_t *, int); @@ -566,6 +615,7 @@ struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp, void xfs_force_summary_recalc(struct xfs_mount *mp); int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature); bool xfs_clear_incompat_log_features(struct xfs_mount *mp); -void xfs_mod_delalloc(struct xfs_mount *mp, int64_t delta); +void xfs_mod_delalloc(struct xfs_inode *ip, int64_t data_delta, + int64_t ind_delta); #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 0f4cf4170c35..47120b745c47 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -836,8 +836,10 @@ xfs_qm_qino_alloc( ASSERT(xfs_is_shutdown(mp)); xfs_alert(mp, "%s failed (error %d)!", __func__, error); } - if (need_alloc) + if (need_alloc) { + xfs_iunlock(*ipp, XFS_ILOCK_EXCL); xfs_finish_inode_setup(*ipp); + } return error; } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index f5993012bf98..6e09dfcd13e2 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -136,7 +136,7 @@ enum { XFS_QM_TRANS_PRJ, XFS_QM_TRANS_DQTYPES }; -#define XFS_QM_TRANS_MAXDQS 2 +#define XFS_QM_TRANS_MAXDQS 5 struct xfs_dquot_acct { struct xfs_dqtrx 
dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS]; }; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 85a4ae1a17f6..23d71a55bbc0 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -123,12 +123,6 @@ extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *); extern void xfs_qm_mount_quotas(struct xfs_mount *); extern void xfs_qm_unmount(struct xfs_mount *); extern void xfs_qm_unmount_quotas(struct xfs_mount *); - -static inline int -xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) -{ - return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false); -} bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type); # ifdef CONFIG_XFS_LIVE_HOOKS @@ -188,12 +182,6 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, } static inline int -xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) -{ - return 0; -} - -static inline int xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, int64_t dblocks) { @@ -222,9 +210,16 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, #endif /* CONFIG_XFS_QUOTA */ static inline int -xfs_quota_unreserve_blkres(struct xfs_inode *ip, int64_t blocks) +xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) +{ + return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false); +} + +static inline void +xfs_quota_unreserve_blkres(struct xfs_inode *ip, uint64_t blocks) { - return xfs_quota_reserve_blkres(ip, -blocks); + /* don't return an error as unreserving quotas can't fail */ + xfs_quota_reserve_blkres(ip, -(int64_t)blocks); } extern int xfs_mount_reset_sbqflags(struct xfs_mount *); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 7da0e8f961d3..063a2e00d169 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -430,13 +430,6 @@ xfs_reflink_fill_cow_hole( if (error) return error; - /* - * Allocation succeeded but the requested range was not even partially - * satisfied? Bail out! - */ - if (nimaps == 0) - return -ENOSPC; - convert: return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); @@ -499,13 +492,6 @@ xfs_reflink_fill_delalloc( error = xfs_trans_commit(tp); if (error) return error; - - /* - * Allocation succeeded but the requested range was not even - * partially satisfied? Bail out! - */ - if (nimaps == 0) - return -ENOSPC; } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff); return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); @@ -606,10 +592,8 @@ xfs_reflink_cancel_cow_blocks( trace_xfs_reflink_cancel_cow(ip, &del); if (isnullstartblock(del.br_startblock)) { - error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, - &icur, &got, &del); - if (error) - break; + xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &got, + &del); } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); @@ -632,10 +616,7 @@ xfs_reflink_cancel_cow_blocks( xfs_bmap_del_extent_cow(ip, &icur, &got, &del); /* Remove the quota reservation */ - error = xfs_quota_unreserve_blkres(ip, - del.br_blockcount); - if (error) - break; + xfs_quota_unreserve_blkres(ip, del.br_blockcount); } else { /* Didn't do anything, push cursor back. */ xfs_iext_prev(ifp, &icur); @@ -731,12 +712,6 @@ xfs_reflink_end_cow_extent( int nmaps; int error; - /* No COW extents? That's easy! 
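A caller-side sketch of the now-void unreserve wrapper above (hypothetical): unreserving is implemented as a negative reservation, and giving blocks back to a dquot cannot fail, so there is deliberately no error to propagate.

static void
example_cancel_quota_resv(
	struct xfs_inode	*ip,
	uint64_t		blocks)
{
	xfs_quota_unreserve_blkres(ip, blocks);
	/* no return value to check */
}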
*/ - if (ifp->if_bytes == 0) { - *offset_fsb = end_fsb; - return 0; - } - resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, XFS_TRANS_RESERVE, &tp); @@ -751,14 +726,6 @@ xfs_reflink_end_cow_extent( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, - XFS_IEXT_REFLINK_END_COW_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, - XFS_IEXT_REFLINK_END_COW_CNT); - if (error) - goto out_cancel; - /* * In case of racing, overlapping AIO writes no COW extents might be * left by the time I/O completes for the loser of the race. In that @@ -787,6 +754,11 @@ xfs_reflink_end_cow_extent( del = got; xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb); + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, + XFS_IEXT_REFLINK_END_COW_CNT); + if (error) + goto out_cancel; + /* Grab the corresponding mapping in the data fork. */ nmaps = 1; error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data, @@ -1283,9 +1255,7 @@ xfs_reflink_remap_extent( if (dmap_written) ++iext_delta; - error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, iext_delta); + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, iext_delta); if (error) goto out_cancel; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index e66f9bd5de5c..5a7ddfed1bb8 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -695,11 +695,8 @@ xfs_growfs_rt_alloc( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, - XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto out_trans_cancel; @@ -709,8 +706,6 @@ xfs_growfs_rt_alloc( nmap = 1; error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, XFS_BMAPI_METADATA, 0, &map, &nmap); - if (!error && nmap < 1) - error = -ENOSPC; if (error) goto out_trans_cancel; /* @@ -957,10 +952,10 @@ xfs_growfs_rt( nargs.tp = tp; /* - * Lock out other callers by grabbing the bitmap inode lock. + * Lock out other callers by grabbing the bitmap and summary + * inode locks and joining them to the transaction. */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); - xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_rtbitmap_lock(tp, mp); /* * Update the bitmap inode's size ondisk and incore. We need * to update the incore size so that inode inactivation won't @@ -971,11 +966,6 @@ xfs_growfs_rt( i_size_write(VFS_I(mp->m_rbmip), mp->m_rbmip->i_disk_size); xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); /* - * Get the summary inode into the transaction. - */ - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); - xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); - /* * Update the summary inode's size. We need to update the * incore size so that inode inactivation won't punch what it * thinks are "posteof" blocks. 
@@ -1142,10 +1132,10 @@ xfs_rtalloc_reinit_frextents( uint64_t val = 0; int error; - xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP); error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent, &val); - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); if (error) return error; @@ -1346,6 +1336,8 @@ xfs_bmap_rtalloc( int error; align = xfs_get_extsz_hint(ap->ip); + if (!align) + align = 1; retry: error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 1, ap->eof, 0, @@ -1382,10 +1374,7 @@ retry: * Lock out modifications to both the RT bitmap and summary inodes */ if (!rtlocked) { - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); - xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); - xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_rtbitmap_lock(ap->tp, mp); rtlocked = true; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bce020374c5e..27e9f749c4c7 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -43,6 +43,8 @@ #include "xfs_iunlink_item.h" #include "xfs_dahash_test.h" #include "xfs_rtbitmap.h" +#include "xfs_exchmaps_item.h" +#include "xfs_parent.h" #include "scrub/stats.h" #include "scrub/rcbag_btree.h" @@ -1051,12 +1053,18 @@ xfs_init_percpu_counters( if (error) goto free_fdblocks; - error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL); + error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL); if (error) goto free_delalloc; + error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL); + if (error) + goto free_delalloc_rt; + return 0; +free_delalloc_rt: + percpu_counter_destroy(&mp->m_delalloc_rtextents); free_delalloc: percpu_counter_destroy(&mp->m_delalloc_blks); free_fdblocks: @@ -1086,6 +1094,9 @@ xfs_destroy_percpu_counters( percpu_counter_destroy(&mp->m_ifree); percpu_counter_destroy(&mp->m_fdblocks); ASSERT(xfs_is_shutdown(mp) || + percpu_counter_sum(&mp->m_delalloc_rtextents) == 0); + percpu_counter_destroy(&mp->m_delalloc_rtextents); + ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_blks) == 0); percpu_counter_destroy(&mp->m_delalloc_blks); percpu_counter_destroy(&mp->m_frextents); @@ -1579,17 +1590,21 @@ xfs_fs_fill_super( if (error) goto out_free_sb; - /* V4 support is undergoing deprecation. */ - if (!xfs_has_crc(mp)) { -#ifdef CONFIG_XFS_SUPPORT_V4 + /* + * V4 support is undergoing deprecation. + * + * Note: this has to use an open coded m_features check as xfs_has_crc + * always returns false for !CONFIG_XFS_SUPPORT_V4. + */ + if (!(mp->m_features & XFS_FEAT_CRC)) { + if (!IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) { + xfs_warn(mp, + "Deprecated V4 format (crc=0) not supported by kernel."); + error = -EINVAL; + goto out_free_sb; + } xfs_warn_once(mp, "Deprecated V4 format (crc=0) will not be supported after September 2030."); -#else - xfs_warn(mp, - "Deprecated V4 format (crc=0) not supported by kernel."); - error = -EINVAL; - goto out_free_sb; -#endif } /* ASCII case insensitivity is undergoing deprecation. */ @@ -1727,6 +1742,14 @@ xfs_fs_fill_super( goto out_filestream_unmount; } + if (xfs_has_exchange_range(mp)) + xfs_warn(mp, + "EXPERIMENTAL exchange-range feature enabled. Use at your own risk!"); + + if (xfs_has_parent(mp)) + xfs_warn(mp, + "EXPERIMENTAL parent pointer feature enabled. 
Use at your own risk!"); + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; @@ -1873,11 +1896,7 @@ xfs_remount_ro( xfs_inodegc_stop(mp); /* Free the per-AG metadata reservation pool. */ - error = xfs_fs_unreserve_ag_blocks(mp); - if (error) { - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return error; - } + xfs_fs_unreserve_ag_blocks(mp); /* * Before we sync the metadata, we need to free up the reserve block @@ -2185,8 +2204,32 @@ xfs_init_caches(void) if (!xfs_iunlink_cache) goto out_destroy_attri_cache; + xfs_xmd_cache = kmem_cache_create("xfs_xmd_item", + sizeof(struct xfs_xmd_log_item), + 0, 0, NULL); + if (!xfs_xmd_cache) + goto out_destroy_iul_cache; + + xfs_xmi_cache = kmem_cache_create("xfs_xmi_item", + sizeof(struct xfs_xmi_log_item), + 0, 0, NULL); + if (!xfs_xmi_cache) + goto out_destroy_xmd_cache; + + xfs_parent_args_cache = kmem_cache_create("xfs_parent_args", + sizeof(struct xfs_parent_args), + 0, 0, NULL); + if (!xfs_parent_args_cache) + goto out_destroy_xmi_cache; + return 0; + out_destroy_xmi_cache: + kmem_cache_destroy(xfs_xmi_cache); + out_destroy_xmd_cache: + kmem_cache_destroy(xfs_xmd_cache); + out_destroy_iul_cache: + kmem_cache_destroy(xfs_iunlink_cache); out_destroy_attri_cache: kmem_cache_destroy(xfs_attri_cache); out_destroy_attrd_cache: @@ -2243,6 +2286,9 @@ xfs_destroy_caches(void) * destroy caches. */ rcu_barrier(); + kmem_cache_destroy(xfs_parent_args_cache); + kmem_cache_destroy(xfs_xmd_cache); + kmem_cache_destroy(xfs_xmi_cache); kmem_cache_destroy(xfs_iunlink_cache); kmem_cache_destroy(xfs_attri_cache); kmem_cache_destroy(xfs_attrd_cache); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 3e376d24c7c1..17aee806ec2e 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -25,6 +25,8 @@ #include "xfs_error.h" #include "xfs_health.h" #include "xfs_symlink_remote.h" +#include "xfs_parent.h" +#include "xfs_defer.h" int xfs_readlink( @@ -100,6 +102,7 @@ xfs_symlink( struct xfs_dquot *pdqp = NULL; uint resblks; xfs_ino_t ino; + struct xfs_parent_args *ppargs; *ipp = NULL; @@ -130,18 +133,24 @@ xfs_symlink( /* * The symlink will fit into the inode data fork? - * There can't be any attributes so we get the whole variable part. + * If there are no parent pointers, then there wont't be any attributes. + * So we get the whole variable part, and do not need to reserve extra + * blocks. Otherwise, we need to reserve the blocks. */ - if (pathlen <= XFS_LITINO(mp)) + if (pathlen <= XFS_LITINO(mp) && !xfs_has_parent(mp)) fs_blocks = 0; else fs_blocks = xfs_symlink_blocks(mp, pathlen); - resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); + resblks = xfs_symlink_space_res(mp, link_name->len, fs_blocks); + + error = xfs_parent_start(mp, &ppargs); + if (error) + goto out_release_dquots; error = xfs_trans_alloc_icreate(mp, &M_RES(mp)->tr_symlink, udqp, gdqp, pdqp, resblks, &tp); if (error) - goto out_release_dquots; + goto out_parent; xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; @@ -161,7 +170,7 @@ xfs_symlink( if (!error) error = xfs_init_new_inode(idmap, tp, dp, ino, S_IFLNK | (mode & ~S_IFMT), 1, 0, prid, - false, &ip); + xfs_has_parent(mp), &ip); if (error) goto out_trans_cancel; @@ -172,8 +181,7 @@ xfs_symlink( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - unlock_dp_on_error = false; + xfs_trans_ijoin(tp, dp, 0); /* * Also attach the dquot(s) to it, if applicable. 
@@ -181,8 +189,8 @@ xfs_symlink( xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); resblks -= XFS_IALLOC_SPACE_RES(mp); - error = xfs_symlink_write_target(tp, ip, target_path, pathlen, - fs_blocks, resblks); + error = xfs_symlink_write_target(tp, ip, ip->i_ino, target_path, + pathlen, fs_blocks, resblks); if (error) goto out_trans_cancel; resblks -= fs_blocks; @@ -196,6 +204,14 @@ xfs_symlink( goto out_trans_cancel; xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + /* Add parent pointer for the new symlink. */ + if (ppargs) { + error = xfs_parent_addname(tp, ppargs, dp, link_name, ip); + if (error) + goto out_trans_cancel; + } + xfs_dir_update_hook(dp, ip, 1, link_name); /* @@ -215,6 +231,9 @@ xfs_symlink( xfs_qm_dqrele(pdqp); *ipp = ip; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + xfs_parent_finish(mp, ppargs); return 0; out_trans_cancel: @@ -226,9 +245,12 @@ out_release_inode: * transactions and deadlocks from xfs_inactive. */ if (ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_finish_inode_setup(ip); xfs_irele(ip); } +out_parent: + xfs_parent_finish(mp, ppargs); out_release_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -250,19 +272,12 @@ out_release_dquots: */ STATIC int xfs_inactive_symlink_rmt( - struct xfs_inode *ip) + struct xfs_inode *ip) { - struct xfs_buf *bp; - int done; - int error; - int i; - xfs_mount_t *mp; - xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS]; - int nmaps; - int size; - xfs_trans_t *tp; - - mp = ip->i_mount; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + ASSERT(!xfs_need_iread_extents(&ip->i_df)); /* * We're freeing a symlink that has some @@ -286,44 +301,14 @@ xfs_inactive_symlink_rmt( * locked for the second transaction. In the error paths we need it * held so the cancel won't rele it, see below. */ - size = (int)ip->i_disk_size; ip->i_disk_size = 0; VFS_I(ip)->i_mode = (VFS_I(ip)->i_mode & ~S_IFMT) | S_IFREG; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - /* - * Find the block(s) so we can inval and unmap them. - */ - done = 0; - nmaps = ARRAY_SIZE(mval); - error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size), - mval, &nmaps, 0); - if (error) - goto error_trans_cancel; - /* - * Invalidate the block(s). No validation is done. - */ - for (i = 0; i < nmaps; i++) { - error = xfs_trans_get_buf(tp, mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), - XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0, - &bp); - if (error) - goto error_trans_cancel; - xfs_trans_binval(tp, bp); - } - /* - * Unmap the dead block(s) to the dfops. - */ - error = xfs_bunmapi(tp, ip, 0, size, 0, nmaps, &done); + + error = xfs_symlink_remote_truncate(tp, ip); if (error) goto error_trans_cancel; - ASSERT(done); - /* - * Commit the transaction. This first logs the EFI and the inode, then - * rolls and commits the transaction that frees the extents. 
- */ - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_trans_commit(tp); if (error) { ASSERT(xfs_is_shutdown(mp)); diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 1a963382e5e9..9c7fbaae2717 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -39,6 +39,9 @@ #include "xfs_buf_mem.h" #include "xfs_btree_mem.h" #include "xfs_bmap.h" +#include "xfs_exchmaps.h" +#include "xfs_exchrange.h" +#include "xfs_parent.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index aea97fc074f8..05cb59bd0b80 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -31,6 +31,8 @@ * pos: file offset, in bytes * bytecount: number of bytes * + * dablk: directory or xattr block offset, in filesystem blocks + * * disize: ondisk file size, in bytes * isize: incore file size, in bytes * @@ -82,11 +84,18 @@ struct xfs_perag; struct xfbtree; struct xfs_btree_ops; struct xfs_bmap_intent; +struct xfs_exchmaps_intent; +struct xfs_exchmaps_req; +struct xfs_exchrange; +struct xfs_getparents; +struct xfs_parent_irec; +struct xfs_attrlist_cursor_kern; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ { XFS_ATTR_SECURE, "SECURE" }, \ - { XFS_ATTR_INCOMPLETE, "INCOMPLETE" } + { XFS_ATTR_INCOMPLETE, "INCOMPLETE" }, \ + { XFS_ATTR_PARENT, "PARENT" } DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -1654,7 +1663,6 @@ DEFINE_EVENT(xfs_extent_busy_class, name, \ xfs_agblock_t agbno, xfs_extlen_t len), \ TP_ARGS(mp, agno, agbno, len)) DEFINE_BUSY_EVENT(xfs_extent_busy); -DEFINE_BUSY_EVENT(xfs_extent_busy_enomem); DEFINE_BUSY_EVENT(xfs_extent_busy_force); DEFINE_BUSY_EVENT(xfs_extent_busy_reuse); DEFINE_BUSY_EVENT(xfs_extent_busy_clear); @@ -1928,6 +1936,7 @@ DECLARE_EVENT_CLASS(xfs_da_class, __field(xfs_dahash_t, hashval) __field(xfs_ino_t, inumber) __field(uint32_t, op_flags) + __field(xfs_ino_t, owner) ), TP_fast_assign( __entry->dev = VFS_I(args->dp)->i_sb->s_dev; @@ -1938,9 +1947,10 @@ DECLARE_EVENT_CLASS(xfs_da_class, __entry->hashval = args->hashval; __entry->inumber = args->inumber; __entry->op_flags = args->op_flags; + __entry->owner = args->owner; ), TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x " - "inumber 0x%llx op_flags %s", + "inumber 0x%llx op_flags %s owner 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->namelen, @@ -1948,7 +1958,8 @@ DECLARE_EVENT_CLASS(xfs_da_class, __entry->namelen, __entry->hashval, __entry->inumber, - __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), + __entry->owner) ) #define DEFINE_DIR2_EVENT(name) \ @@ -1992,7 +2003,6 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __field(int, valuelen) __field(xfs_dahash_t, hashval) __field(unsigned int, attr_filter) - __field(unsigned int, attr_flags) __field(uint32_t, op_flags) ), TP_fast_assign( @@ -2004,11 +2014,10 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __entry->valuelen = args->valuelen; __entry->hashval = args->hashval; __entry->attr_filter = args->attr_filter; - __entry->attr_flags = args->attr_flags; __entry->op_flags = args->op_flags; ), TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d " - "hashval 0x%x filter %s flags %s op_flags %s", + "hashval 0x%x filter %s op_flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->namelen, @@ -2018,9 +2027,6 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __entry->hashval, __print_flags(__entry->attr_filter, "|", 
XFS_ATTR_FILTER_FLAGS), - __print_flags(__entry->attr_flags, "|", - { XATTR_CREATE, "CREATE" }, - { XATTR_REPLACE, "REPLACE" }), __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) ) @@ -3062,7 +3068,6 @@ DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent); DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical); DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed); -DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error); DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error); /* refcount tracepoint classes */ @@ -4770,6 +4775,419 @@ DEFINE_XFBTREE_FREESP_EVENT(xfbtree_alloc_block); DEFINE_XFBTREE_FREESP_EVENT(xfbtree_free_block); #endif /* CONFIG_XFS_BTREE_IN_MEM */ +/* exchmaps tracepoints */ +#define XFS_EXCHMAPS_STRINGS \ + { XFS_EXCHMAPS_ATTR_FORK, "ATTRFORK" }, \ + { XFS_EXCHMAPS_SET_SIZES, "SETSIZES" }, \ + { XFS_EXCHMAPS_INO1_WRITTEN, "INO1_WRITTEN" }, \ + { XFS_EXCHMAPS_CLEAR_INO1_REFLINK, "CLEAR_INO1_REFLINK" }, \ + { XFS_EXCHMAPS_CLEAR_INO2_REFLINK, "CLEAR_INO2_REFLINK" }, \ + { __XFS_EXCHMAPS_INO2_SHORTFORM, "INO2_SF" } + +DEFINE_INODE_IREC_EVENT(xfs_exchmaps_mapping1_skip); +DEFINE_INODE_IREC_EVENT(xfs_exchmaps_mapping1); +DEFINE_INODE_IREC_EVENT(xfs_exchmaps_mapping2); +DEFINE_ITRUNC_EVENT(xfs_exchmaps_update_inode_size); + +#define XFS_EXCHRANGE_INODES \ + { 1, "file1" }, \ + { 2, "file2" } + +DECLARE_EVENT_CLASS(xfs_exchrange_inode_class, + TP_PROTO(struct xfs_inode *ip, int whichfile), + TP_ARGS(ip, whichfile), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, whichfile) + __field(xfs_ino_t, ino) + __field(int, format) + __field(xfs_extnum_t, nex) + __field(int, broot_size) + __field(int, fork_off) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->whichfile = whichfile; + __entry->ino = ip->i_ino; + __entry->format = ip->i_df.if_format; + __entry->nex = ip->i_df.if_nextents; + __entry->fork_off = xfs_inode_fork_boff(ip); + ), + TP_printk("dev %d:%d ino 0x%llx whichfile %s format %s num_extents %llu forkoff 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfile, XFS_EXCHRANGE_INODES), + __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), + __entry->nex, + __entry->fork_off) +) + +#define DEFINE_EXCHRANGE_INODE_EVENT(name) \ +DEFINE_EVENT(xfs_exchrange_inode_class, name, \ + TP_PROTO(struct xfs_inode *ip, int whichfile), \ + TP_ARGS(ip, whichfile)) + +DEFINE_EXCHRANGE_INODE_EVENT(xfs_exchrange_before); +DEFINE_EXCHRANGE_INODE_EVENT(xfs_exchrange_after); +DEFINE_INODE_ERROR_EVENT(xfs_exchrange_error); + +#define XFS_EXCHANGE_RANGE_FLAGS_STRS \ + { XFS_EXCHANGE_RANGE_TO_EOF, "TO_EOF" }, \ + { XFS_EXCHANGE_RANGE_DSYNC , "DSYNC" }, \ + { XFS_EXCHANGE_RANGE_DRY_RUN, "DRY_RUN" }, \ + { XFS_EXCHANGE_RANGE_FILE1_WRITTEN, "F1_WRITTEN" }, \ + { __XFS_EXCHANGE_RANGE_UPD_CMTIME1, "CMTIME1" }, \ + { __XFS_EXCHANGE_RANGE_UPD_CMTIME2, "CMTIME2" } + +/* file exchange-range tracepoint class */ +DECLARE_EVENT_CLASS(xfs_exchrange_class, + TP_PROTO(const struct xfs_exchrange *fxr, struct xfs_inode *ip1, + struct xfs_inode *ip2), + TP_ARGS(fxr, ip1, ip2), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ip1_ino) + __field(loff_t, ip1_isize) + __field(loff_t, ip1_disize) + __field(xfs_ino_t, ip2_ino) + __field(loff_t, ip2_isize) + __field(loff_t, ip2_disize) + + __field(loff_t, file1_offset) + __field(loff_t, file2_offset) + __field(unsigned long long, length) + __field(unsigned long long, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip1)->i_sb->s_dev; + __entry->ip1_ino = ip1->i_ino; + __entry->ip1_isize = VFS_I(ip1)->i_size; + 
__entry->ip1_disize = ip1->i_disk_size; + __entry->ip2_ino = ip2->i_ino; + __entry->ip2_isize = VFS_I(ip2)->i_size; + __entry->ip2_disize = ip2->i_disk_size; + + __entry->file1_offset = fxr->file1_offset; + __entry->file2_offset = fxr->file2_offset; + __entry->length = fxr->length; + __entry->flags = fxr->flags; + ), + TP_printk("dev %d:%d flags %s bytecount 0x%llx " + "ino1 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx -> " + "ino2 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_flags_u64(__entry->flags, "|", XFS_EXCHANGE_RANGE_FLAGS_STRS), + __entry->length, + __entry->ip1_ino, + __entry->ip1_isize, + __entry->ip1_disize, + __entry->file1_offset, + __entry->ip2_ino, + __entry->ip2_isize, + __entry->ip2_disize, + __entry->file2_offset) +) + +#define DEFINE_EXCHRANGE_EVENT(name) \ +DEFINE_EVENT(xfs_exchrange_class, name, \ + TP_PROTO(const struct xfs_exchrange *fxr, struct xfs_inode *ip1, \ + struct xfs_inode *ip2), \ + TP_ARGS(fxr, ip1, ip2)) +DEFINE_EXCHRANGE_EVENT(xfs_exchrange_prep); +DEFINE_EXCHRANGE_EVENT(xfs_exchrange_flush); +DEFINE_EXCHRANGE_EVENT(xfs_exchrange_mappings); + +TRACE_EVENT(xfs_exchmaps_overhead, + TP_PROTO(struct xfs_mount *mp, unsigned long long bmbt_blocks, + unsigned long long rmapbt_blocks), + TP_ARGS(mp, bmbt_blocks, rmapbt_blocks), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long long, bmbt_blocks) + __field(unsigned long long, rmapbt_blocks) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->bmbt_blocks = bmbt_blocks; + __entry->rmapbt_blocks = rmapbt_blocks; + ), + TP_printk("dev %d:%d bmbt_blocks 0x%llx rmapbt_blocks 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->bmbt_blocks, + __entry->rmapbt_blocks) +); + +DECLARE_EVENT_CLASS(xfs_exchmaps_estimate_class, + TP_PROTO(const struct xfs_exchmaps_req *req), + TP_ARGS(req), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino1) + __field(xfs_ino_t, ino2) + __field(xfs_fileoff_t, startoff1) + __field(xfs_fileoff_t, startoff2) + __field(xfs_filblks_t, blockcount) + __field(uint64_t, flags) + __field(xfs_filblks_t, ip1_bcount) + __field(xfs_filblks_t, ip2_bcount) + __field(xfs_filblks_t, ip1_rtbcount) + __field(xfs_filblks_t, ip2_rtbcount) + __field(unsigned long long, resblks) + __field(unsigned long long, nr_exchanges) + ), + TP_fast_assign( + __entry->dev = req->ip1->i_mount->m_super->s_dev; + __entry->ino1 = req->ip1->i_ino; + __entry->ino2 = req->ip2->i_ino; + __entry->startoff1 = req->startoff1; + __entry->startoff2 = req->startoff2; + __entry->blockcount = req->blockcount; + __entry->flags = req->flags; + __entry->ip1_bcount = req->ip1_bcount; + __entry->ip2_bcount = req->ip2_bcount; + __entry->ip1_rtbcount = req->ip1_rtbcount; + __entry->ip2_rtbcount = req->ip2_rtbcount; + __entry->resblks = req->resblks; + __entry->nr_exchanges = req->nr_exchanges; + ), + TP_printk("dev %d:%d ino1 0x%llx fileoff1 0x%llx ino2 0x%llx fileoff2 0x%llx fsbcount 0x%llx flags (%s) bcount1 0x%llx rtbcount1 0x%llx bcount2 0x%llx rtbcount2 0x%llx resblks 0x%llx nr_exchanges %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino1, __entry->startoff1, + __entry->ino2, __entry->startoff2, + __entry->blockcount, + __print_flags_u64(__entry->flags, "|", XFS_EXCHMAPS_STRINGS), + __entry->ip1_bcount, + __entry->ip1_rtbcount, + __entry->ip2_bcount, + __entry->ip2_rtbcount, + __entry->resblks, + __entry->nr_exchanges) +); + +#define DEFINE_EXCHMAPS_ESTIMATE_EVENT(name) \ +DEFINE_EVENT(xfs_exchmaps_estimate_class, 
name, \ + TP_PROTO(const struct xfs_exchmaps_req *req), \ + TP_ARGS(req)) +DEFINE_EXCHMAPS_ESTIMATE_EVENT(xfs_exchmaps_initial_estimate); +DEFINE_EXCHMAPS_ESTIMATE_EVENT(xfs_exchmaps_final_estimate); + +DECLARE_EVENT_CLASS(xfs_exchmaps_intent_class, + TP_PROTO(struct xfs_mount *mp, const struct xfs_exchmaps_intent *xmi), + TP_ARGS(mp, xmi), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino1) + __field(xfs_ino_t, ino2) + __field(uint64_t, flags) + __field(xfs_fileoff_t, startoff1) + __field(xfs_fileoff_t, startoff2) + __field(xfs_filblks_t, blockcount) + __field(xfs_fsize_t, isize1) + __field(xfs_fsize_t, isize2) + __field(xfs_fsize_t, new_isize1) + __field(xfs_fsize_t, new_isize2) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino1 = xmi->xmi_ip1->i_ino; + __entry->ino2 = xmi->xmi_ip2->i_ino; + __entry->flags = xmi->xmi_flags; + __entry->startoff1 = xmi->xmi_startoff1; + __entry->startoff2 = xmi->xmi_startoff2; + __entry->blockcount = xmi->xmi_blockcount; + __entry->isize1 = xmi->xmi_ip1->i_disk_size; + __entry->isize2 = xmi->xmi_ip2->i_disk_size; + __entry->new_isize1 = xmi->xmi_isize1; + __entry->new_isize2 = xmi->xmi_isize2; + ), + TP_printk("dev %d:%d ino1 0x%llx fileoff1 0x%llx ino2 0x%llx fileoff2 0x%llx fsbcount 0x%llx flags (%s) isize1 0x%llx newisize1 0x%llx isize2 0x%llx newisize2 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino1, __entry->startoff1, + __entry->ino2, __entry->startoff2, + __entry->blockcount, + __print_flags_u64(__entry->flags, "|", XFS_EXCHMAPS_STRINGS), + __entry->isize1, __entry->new_isize1, + __entry->isize2, __entry->new_isize2) +); + +#define DEFINE_EXCHMAPS_INTENT_EVENT(name) \ +DEFINE_EVENT(xfs_exchmaps_intent_class, name, \ + TP_PROTO(struct xfs_mount *mp, const struct xfs_exchmaps_intent *xmi), \ + TP_ARGS(mp, xmi)) +DEFINE_EXCHMAPS_INTENT_EVENT(xfs_exchmaps_defer); +DEFINE_EXCHMAPS_INTENT_EVENT(xfs_exchmaps_recover); + +TRACE_EVENT(xfs_exchmaps_delta_nextents_step, + TP_PROTO(struct xfs_mount *mp, + const struct xfs_bmbt_irec *left, + const struct xfs_bmbt_irec *curr, + const struct xfs_bmbt_irec *new, + const struct xfs_bmbt_irec *right, + int delta, unsigned int state), + TP_ARGS(mp, left, curr, new, right, delta, state), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_fileoff_t, loff) + __field(xfs_fsblock_t, lstart) + __field(xfs_filblks_t, lcount) + __field(xfs_fileoff_t, coff) + __field(xfs_fsblock_t, cstart) + __field(xfs_filblks_t, ccount) + __field(xfs_fileoff_t, noff) + __field(xfs_fsblock_t, nstart) + __field(xfs_filblks_t, ncount) + __field(xfs_fileoff_t, roff) + __field(xfs_fsblock_t, rstart) + __field(xfs_filblks_t, rcount) + __field(int, delta) + __field(unsigned int, state) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->loff = left->br_startoff; + __entry->lstart = left->br_startblock; + __entry->lcount = left->br_blockcount; + __entry->coff = curr->br_startoff; + __entry->cstart = curr->br_startblock; + __entry->ccount = curr->br_blockcount; + __entry->noff = new->br_startoff; + __entry->nstart = new->br_startblock; + __entry->ncount = new->br_blockcount; + __entry->roff = right->br_startoff; + __entry->rstart = right->br_startblock; + __entry->rcount = right->br_blockcount; + __entry->delta = delta; + __entry->state = state; + ), + TP_printk("dev %d:%d left 0x%llx:0x%llx:0x%llx; curr 0x%llx:0x%llx:0x%llx <- new 0x%llx:0x%llx:0x%llx; right 0x%llx:0x%llx:0x%llx delta %d state 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->loff, 
__entry->lstart, __entry->lcount, + __entry->coff, __entry->cstart, __entry->ccount, + __entry->noff, __entry->nstart, __entry->ncount, + __entry->roff, __entry->rstart, __entry->rcount, + __entry->delta, __entry->state) +); + +TRACE_EVENT(xfs_exchmaps_delta_nextents, + TP_PROTO(const struct xfs_exchmaps_req *req, int64_t d_nexts1, + int64_t d_nexts2), + TP_ARGS(req, d_nexts1, d_nexts2), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino1) + __field(xfs_ino_t, ino2) + __field(xfs_extnum_t, nexts1) + __field(xfs_extnum_t, nexts2) + __field(int64_t, d_nexts1) + __field(int64_t, d_nexts2) + ), + TP_fast_assign( + int whichfork = xfs_exchmaps_reqfork(req); + + __entry->dev = req->ip1->i_mount->m_super->s_dev; + __entry->ino1 = req->ip1->i_ino; + __entry->ino2 = req->ip2->i_ino; + __entry->nexts1 = xfs_ifork_ptr(req->ip1, whichfork)->if_nextents; + __entry->nexts2 = xfs_ifork_ptr(req->ip2, whichfork)->if_nextents; + __entry->d_nexts1 = d_nexts1; + __entry->d_nexts2 = d_nexts2; + ), + TP_printk("dev %d:%d ino1 0x%llx nexts %llu ino2 0x%llx nexts %llu delta1 %lld delta2 %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino1, __entry->nexts1, + __entry->ino2, __entry->nexts2, + __entry->d_nexts1, __entry->d_nexts2) +); + +DECLARE_EVENT_CLASS(xfs_getparents_rec_class, + TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi, + const struct xfs_attr_list_context *context, + const struct xfs_getparents_rec *pptr), + TP_ARGS(ip, ppi, context, pptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, firstu) + __field(unsigned short, reclen) + __field(unsigned int, bufsize) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, parent_gen) + __string(name, pptr->gpr_name) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->firstu = context->firstu; + __entry->reclen = pptr->gpr_reclen; + __entry->bufsize = ppi->gp_bufsize; + __entry->parent_ino = pptr->gpr_parent.ha_fid.fid_ino; + __entry->parent_gen = pptr->gpr_parent.ha_fid.fid_gen; + __assign_str(name, pptr->gpr_name); + ), + TP_printk("dev %d:%d ino 0x%llx firstu %u reclen %u bufsize %u parent_ino 0x%llx parent_gen 0x%x name '%s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->firstu, + __entry->reclen, + __entry->bufsize, + __entry->parent_ino, + __entry->parent_gen, + __get_str(name)) +) +#define DEFINE_XFS_GETPARENTS_REC_EVENT(name) \ +DEFINE_EVENT(xfs_getparents_rec_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi, \ + const struct xfs_attr_list_context *context, \ + const struct xfs_getparents_rec *pptr), \ + TP_ARGS(ip, ppi, context, pptr)) +DEFINE_XFS_GETPARENTS_REC_EVENT(xfs_getparents_put_listent); +DEFINE_XFS_GETPARENTS_REC_EVENT(xfs_getparents_expand_lastrec); + +DECLARE_EVENT_CLASS(xfs_getparents_class, + TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi, + const struct xfs_attrlist_cursor_kern *cur), + TP_ARGS(ip, ppi, cur), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned short, iflags) + __field(unsigned short, oflags) + __field(unsigned int, bufsize) + __field(unsigned int, hashval) + __field(unsigned int, blkno) + __field(unsigned int, offset) + __field(int, initted) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->iflags = ppi->gp_iflags; + __entry->oflags = ppi->gp_oflags; + __entry->bufsize = ppi->gp_bufsize; + __entry->hashval = 
cur->hashval; + __entry->blkno = cur->blkno; + __entry->offset = cur->offset; + __entry->initted = cur->initted; + ), + TP_printk("dev %d:%d ino 0x%llx iflags 0x%x oflags 0x%x bufsize %u cur_init? %d hashval 0x%x blkno %u offset %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->iflags, + __entry->oflags, + __entry->bufsize, + __entry->initted, + __entry->hashval, + __entry->blkno, + __entry->offset) +) +#define DEFINE_XFS_GETPARENTS_EVENT(name) \ +DEFINE_EVENT(xfs_getparents_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct xfs_getparents *ppi, \ + const struct xfs_attrlist_cursor_kern *cur), \ + TP_ARGS(ip, ppi, cur)) +DEFINE_XFS_GETPARENTS_EVENT(xfs_getparents_begin); +DEFINE_XFS_GETPARENTS_EVENT(xfs_getparents_end); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 7350640059cc..828da4ac4316 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -163,7 +163,7 @@ xfs_trans_reserve( * fail if the count would go below zero. */ if (blocks > 0) { - error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd); + error = xfs_dec_fdblocks(mp, blocks, rsvd); if (error != 0) return -ENOSPC; tp->t_blk_res += blocks; @@ -210,7 +210,7 @@ xfs_trans_reserve( * fail if the count would go below zero. */ if (rtextents > 0) { - error = xfs_mod_frextents(mp, -((int64_t)rtextents)); + error = xfs_dec_frextents(mp, rtextents); if (error) { error = -ENOSPC; goto undo_log; @@ -234,7 +234,7 @@ undo_log: undo_blocks: if (blocks > 0) { - xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd); + xfs_add_fdblocks(mp, blocks); tp->t_blk_res = 0; } return error; @@ -593,38 +593,44 @@ xfs_trans_unreserve_and_mod_sb( struct xfs_trans *tp) { struct xfs_mount *mp = tp->t_mountp; - bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; - int64_t blkdelta = 0; - int64_t rtxdelta = 0; + int64_t blkdelta = tp->t_blk_res; + int64_t rtxdelta = tp->t_rtx_res; int64_t idelta = 0; int64_t ifreedelta = 0; - int error; - /* calculate deltas */ - if (tp->t_blk_res > 0) - blkdelta = tp->t_blk_res; - if ((tp->t_fdblocks_delta != 0) && - (xfs_has_lazysbcount(mp) || - (tp->t_flags & XFS_TRANS_SB_DIRTY))) + /* + * Calculate the deltas. + * + * t_fdblocks_delta and t_frextents_delta can be positive or negative: + * + * - positive values indicate blocks freed in the transaction. + * - negative values indicate blocks allocated in the transaction. + * + * Negative values can only happen if the transaction has a block + * reservation that covers the allocated block. The end result is + * that the calculated delta values must always be positive and we + * can only put back previously allocated or reserved blocks here.
+ */ + ASSERT(tp->t_blk_res || tp->t_fdblocks_delta >= 0); + if (xfs_has_lazysbcount(mp) || (tp->t_flags & XFS_TRANS_SB_DIRTY)) { blkdelta += tp->t_fdblocks_delta; + ASSERT(blkdelta >= 0); + } - if (tp->t_rtx_res > 0) - rtxdelta = tp->t_rtx_res; - if ((tp->t_frextents_delta != 0) && - (tp->t_flags & XFS_TRANS_SB_DIRTY)) + ASSERT(tp->t_rtx_res || tp->t_frextents_delta >= 0); + if (tp->t_flags & XFS_TRANS_SB_DIRTY) { rtxdelta += tp->t_frextents_delta; + ASSERT(rtxdelta >= 0); + } - if (xfs_has_lazysbcount(mp) || - (tp->t_flags & XFS_TRANS_SB_DIRTY)) { + if (xfs_has_lazysbcount(mp) || (tp->t_flags & XFS_TRANS_SB_DIRTY)) { idelta = tp->t_icount_delta; ifreedelta = tp->t_ifree_delta; } /* apply the per-cpu counters */ - if (blkdelta) { - error = xfs_mod_fdblocks(mp, blkdelta, rsvd); - ASSERT(!error); - } + if (blkdelta) + xfs_add_fdblocks(mp, blkdelta); if (idelta) percpu_counter_add_batch(&mp->m_icount, idelta, @@ -633,10 +639,8 @@ xfs_trans_unreserve_and_mod_sb( if (ifreedelta) percpu_counter_add(&mp->m_ifree, ifreedelta); - if (rtxdelta) { - error = xfs_mod_frextents(mp, rtxdelta); - ASSERT(!error); - } + if (rtxdelta) + xfs_add_frextents(mp, rtxdelta); if (!(tp->t_flags & XFS_TRANS_SB_DIRTY)) return; @@ -672,7 +676,6 @@ xfs_trans_unreserve_and_mod_sb( */ ASSERT(mp->m_sb.sb_imax_pct >= 0); ASSERT(mp->m_sb.sb_rextslog >= 0); - return; } /* Add the given log item to the transaction's list of log items. */ @@ -1291,9 +1294,9 @@ xfs_trans_reserve_more_inode( return 0; /* Quota failed, give back the new reservation. */ - xfs_mod_fdblocks(mp, dblocks, tp->t_flags & XFS_TRANS_RESERVE); + xfs_add_fdblocks(mp, dblocks); tp->t_blk_res -= dblocks; - xfs_mod_frextents(mp, rtx); + xfs_add_frextents(mp, rtx); tp->t_rtx_res -= rtx; return error; } @@ -1430,6 +1433,8 @@ out_cancel: * The caller must ensure that the on-disk dquots attached to this inode have * already been allocated and initialized. The ILOCKs will be dropped when the * transaction is committed or cancelled. + * + * Caller is responsible for unlocking the inodes manually upon return */ int xfs_trans_alloc_dir( @@ -1460,8 +1465,8 @@ retry: xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, 0); + xfs_trans_ijoin(tp, ip, 0); error = xfs_qm_dqattach_locked(dp, false); if (error) { @@ -1484,6 +1489,9 @@ retry: if (error == -EDQUOT || error == -ENOSPC) { if (!retried) { xfs_trans_cancel(tp); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + if (dp != ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_blockgc_free_quota(dp, 0); retried = true; goto retry; diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 577b535a595c..b368e13424c4 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -379,24 +379,29 @@ xfs_trans_mod_dquot( /* * Given an array of dqtrx structures, lock all the dquots associated and join - * them to the transaction, provided they have been modified. We know that the - * highest number of dquots of one type - usr, grp and prj - involved in a - * transaction is 3 so we don't need to make this very generic. + * them to the transaction, provided they have been modified. 
*/ STATIC void xfs_trans_dqlockedjoin( struct xfs_trans *tp, struct xfs_dqtrx *q) { + unsigned int i; ASSERT(q[0].qt_dquot != NULL); if (q[1].qt_dquot == NULL) { xfs_dqlock(q[0].qt_dquot); xfs_trans_dqjoin(tp, q[0].qt_dquot); - } else { - ASSERT(XFS_QM_TRANS_MAXDQS == 2); + } else if (q[2].qt_dquot == NULL) { xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot); xfs_trans_dqjoin(tp, q[0].qt_dquot); xfs_trans_dqjoin(tp, q[1].qt_dquot); + } else { + xfs_dqlockn(q); + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + if (q[i].qt_dquot == NULL) + break; + xfs_trans_dqjoin(tp, q[i].qt_dquot); + } } } diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 364104e1b38a..ab3d22f662f2 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -17,15 +17,13 @@ #include "xfs_acl.h" #include "xfs_log.h" #include "xfs_xattr.h" +#include "xfs_quota.h" #include <linux/posix_acl_xattr.h> /* * Get permission to use log-assisted atomic exchange of file extents. - * - * Callers must not be running any transactions or hold any inode locks, and - * they must release the permission by calling xlog_drop_incompat_feat - * when they're done. + * Callers must not be running any transactions or hold any ILOCKs. */ static inline int xfs_attr_grab_log_assist( @@ -33,17 +31,8 @@ xfs_attr_grab_log_assist( { int error = 0; - /* - * Protect ourselves from an idle log clearing the logged xattrs log - * incompat feature bit. - */ - xlog_use_incompat_feat(mp->m_log); - - /* - * If log-assisted xattrs are already enabled, the caller can use the - * log assisted swap functions with the log-incompat reference we got. - */ - if (xfs_sb_version_haslogxattrs(&mp->m_sb)) + /* xattr update log intent items are already enabled */ + if (xfs_is_using_logged_xattrs(mp)) return 0; /* @@ -52,31 +41,20 @@ xfs_attr_grab_log_assist( * a V5 filesystem for the superblock field, but we'll require rmap * or reflink to avoid having to deal with really old kernels. */ - if (!xfs_has_reflink(mp) && !xfs_has_rmapbt(mp)) { - error = -EOPNOTSUPP; - goto drop_incompat; - } + if (!xfs_has_reflink(mp) && !xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; /* Enable log-assisted xattrs. */ error = xfs_add_incompat_log_feature(mp, XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); if (error) - goto drop_incompat; + return error; + xfs_set_using_logged_xattrs(mp); xfs_warn_mount(mp, XFS_OPSTATE_WARNED_LARP, "EXPERIMENTAL logged extended attributes feature in use. Use at your own risk!"); return 0; -drop_incompat: - xlog_drop_incompat_feat(mp->m_log); - return error; -} - -static inline void -xfs_attr_rele_log_assist( - struct xfs_mount *mp) -{ - xlog_drop_incompat_feat(mp->m_log); } static inline bool @@ -93,17 +71,31 @@ xfs_attr_want_log_assist( /* * Set or remove an xattr, having grabbed the appropriate logging resources - * prior to calling libxfs. + * prior to calling libxfs. Callers of this function are only required to + * initialize the inode, attr_filter, name, namelen, value, and valuelen fields + * of @args. */ int xfs_attr_change( - struct xfs_da_args *args) + struct xfs_da_args *args, + enum xfs_attr_update op) { struct xfs_mount *mp = args->dp->i_mount; - bool use_logging = false; int error; - ASSERT(!(args->op_flags & XFS_DA_OP_LOGGED)); + if (xfs_is_shutdown(mp)) + return -EIO; + + error = xfs_qm_dqattach(args->dp); + if (error) + return error; + + /* + * We have no control over the attribute names that userspace passes us + * to remove, so we have to allow the name lookup prior to attribute + * removal to fail as well. 
+ */ + args->op_flags = XFS_DA_OP_OKNOENT; if (xfs_attr_want_log_assist(mp)) { error = xfs_attr_grab_log_assist(mp); @@ -111,14 +103,14 @@ xfs_attr_change( return error; args->op_flags |= XFS_DA_OP_LOGGED; - use_logging = true; } - error = xfs_attr_set(args); + args->owner = args->dp->i_ino; + args->geo = mp->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + xfs_attr_sethash(args); - if (use_logging) - xfs_attr_rele_log_assist(mp); - return error; + return xfs_attr_set(args, op, args->attr_filter & XFS_ATTR_ROOT); } @@ -145,6 +137,20 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, return args.valuelen; } +static inline enum xfs_attr_update +xfs_xattr_flags_to_op( + int flags, + const void *value) +{ + if (!value) + return XFS_ATTRUPDATE_REMOVE; + if (flags & XATTR_CREATE) + return XFS_ATTRUPDATE_CREATE; + if (flags & XATTR_REPLACE) + return XFS_ATTRUPDATE_REPLACE; + return XFS_ATTRUPDATE_UPSERT; +} + static int xfs_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, @@ -154,7 +160,6 @@ xfs_xattr_set(const struct xattr_handler *handler, struct xfs_da_args args = { .dp = XFS_I(inode), .attr_filter = handler->flags, - .attr_flags = flags, .name = name, .namelen = strlen(name), .value = (void *)value, @@ -162,7 +167,7 @@ xfs_xattr_set(const struct xattr_handler *handler, }; int error; - error = xfs_attr_change(&args); + error = xfs_attr_change(&args, xfs_xattr_flags_to_op(flags, value)); if (!error && (handler->flags & XFS_ATTR_ROOT)) xfs_forget_acl(inode, name); return error; @@ -237,6 +242,7 @@ xfs_xattr_put_listent( int flags, unsigned char *name, int namelen, + void *value, int valuelen) { char *prefix; @@ -244,6 +250,10 @@ xfs_xattr_put_listent( ASSERT(context->count >= 0); + /* Don't expose private xattr namespaces. */ + if (flags & XFS_ATTR_PRIVATE_NSP_MASK) + return; + if (flags & XFS_ATTR_ROOT) { #ifdef CONFIG_XFS_POSIX_ACL if (namelen == SGI_ACL_FILE_SIZE && diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h index cec766cad26c..c3eb858fb59e 100644 --- a/fs/xfs/xfs_xattr.h +++ b/fs/xfs/xfs_xattr.h @@ -6,7 +6,8 @@ #ifndef __XFS_XATTR_H__ #define __XFS_XATTR_H__ -int xfs_attr_change(struct xfs_da_args *args); +enum xfs_attr_update; +int xfs_attr_change(struct xfs_da_args *args, enum xfs_attr_update op); extern const struct xattr_handler * const xfs_xattr_handlers[];
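The new xfs_xattr_flags_to_op() helper above reduces the setxattr(2) flags to one of four explicit update modes that xfs_attr_set() now takes as an argument. From userspace the four cases correspond to the standard xattr calls; a small sketch of how each mode is reached (the mount path and attribute value are hypothetical):

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/testfile";	/* hypothetical XFS test file */
	const char *val = "demo";

	/* flags == 0 maps to XFS_ATTRUPDATE_UPSERT: create or replace. */
	if (setxattr(path, "user.demo", val, strlen(val), 0))
		perror("upsert");

	/* XATTR_CREATE maps to XFS_ATTRUPDATE_CREATE: fails with EEXIST if the attr exists. */
	if (setxattr(path, "user.demo", val, strlen(val), XATTR_CREATE))
		perror("create");

	/* XATTR_REPLACE maps to XFS_ATTRUPDATE_REPLACE: fails with ENODATA if it doesn't. */
	if (setxattr(path, "user.demo", val, strlen(val), XATTR_REPLACE))
		perror("replace");

	/* No value at all (removexattr) maps to XFS_ATTRUPDATE_REMOVE. */
	if (removexattr(path, "user.demo"))
		perror("remove");

	return 0;
}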