summaryrefslogtreecommitdiffstats
path: root/fs/xfs/xfs_inode.c
diff options
context:
space:
mode:
authorJiri Kosina <jkosina@suse.cz>2020-09-01 14:19:48 +0200
committerJiri Kosina <jkosina@suse.cz>2020-09-01 14:19:48 +0200
commitead5d1f4d877e92c051e1a1ade623d0d30e71619 (patch)
treecb9db5698a546e7b96f7d5bef5ce544629dd37a2 /fs/xfs/xfs_inode.c
parentscif: Fix spelling of EACCES (diff)
parentMerge tag 'docs-5.9-3' of git://git.lwn.net/linux (diff)
downloadlinux-ead5d1f4d877e92c051e1a1ade623d0d30e71619.tar.xz
linux-ead5d1f4d877e92c051e1a1ade623d0d30e71619.zip
Merge branch 'master' into for-next
Sync with Linus' branch in order to be able to apply fixups of more recent patches.
Diffstat (limited to 'fs/xfs/xfs_inode.c')
-rw-r--r--fs/xfs/xfs_inode.c1062
1 files changed, 466 insertions, 596 deletions
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c5077e6326c7..c06129cffba9 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -44,7 +44,6 @@ kmem_zone_t *xfs_inode_zone;
*/
#define XFS_ITRUNC_MAX_EXTENTS 2
-STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
@@ -112,7 +111,7 @@ xfs_ilock_data_map_shared(
{
uint lock_mode = XFS_ILOCK_SHARED;
- if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+ if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE &&
(ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
lock_mode = XFS_ILOCK_EXCL;
xfs_ilock(ip, lock_mode);
@@ -125,7 +124,8 @@ xfs_ilock_attr_map_shared(
{
uint lock_mode = XFS_ILOCK_SHARED;
- if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
+ if (ip->i_afp &&
+ ip->i_afp->if_format == XFS_DINODE_FMT_BTREE &&
(ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
lock_mode = XFS_ILOCK_EXCL;
xfs_ilock(ip, lock_mode);
@@ -144,17 +144,17 @@ xfs_ilock_attr_map_shared(
*
* i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
*
- * mmap_sem locking order:
+ * mmap_lock locking order:
*
- * i_rwsem -> page lock -> mmap_sem
- * mmap_sem -> i_mmap_lock -> page_lock
+ * i_rwsem -> page lock -> mmap_lock
+ * mmap_lock -> i_mmap_lock -> page_lock
*
- * The difference in mmap_sem locking order mean that we cannot hold the
+ * The difference in mmap_lock locking order mean that we cannot hold the
* i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
- * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
+ * fault in pages during copy in/out (for buffered IO) or require the mmap_lock
* in get_user_pages() to map the user pages into the kernel address space for
* direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
- * page faults already hold the mmap_sem.
+ * page faults already hold the mmap_lock.
*
* Hence to serialise fully against both syscall and mmap based IO, we need to
* take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
@@ -451,7 +451,7 @@ xfs_lock_inodes(
/*
* Currently supports between 2 and 5 inodes with exclusive locking. We
* support an arbitrary depth of locking here, but absolute limits on
- * inodes depend on the the type of locking and the limits placed by
+ * inodes depend on the type of locking and the limits placed by
* lockdep annotations in xfs_lock_inumorder. These are all checked by
* the asserts.
*/
@@ -801,26 +801,18 @@ xfs_ialloc(
return error;
ASSERT(ip != NULL);
inode = VFS_I(ip);
-
- /*
- * We always convert v1 inodes to v2 now - we only support filesystems
- * with >= v2 inode capability, so there is no reason for ever leaving
- * an inode in v1 format.
- */
- if (ip->i_d.di_version == 1)
- ip->i_d.di_version = 2;
-
inode->i_mode = mode;
set_nlink(inode, nlink);
- ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
- ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
+ inode->i_uid = current_fsuid();
inode->i_rdev = rdev;
ip->i_d.di_projid = prid;
if (pip && XFS_INHERIT_GID(pip)) {
- ip->i_d.di_gid = pip->i_d.di_gid;
+ inode->i_gid = VFS_I(pip)->i_gid;
if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
inode->i_mode |= S_ISGID;
+ } else {
+ inode->i_gid = current_fsgid();
}
/*
@@ -828,13 +820,12 @@ xfs_ialloc(
* ID or one of the supplementary group IDs, the S_ISGID bit is cleared
* (and only if the irix_sgid_inherit compatibility variable is set).
*/
- if ((irix_sgid_inherit) &&
- (inode->i_mode & S_ISGID) &&
- (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid))))
+ if (irix_sgid_inherit &&
+ (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid))
inode->i_mode &= ~S_ISGID;
ip->i_d.di_size = 0;
- ip->i_d.di_nextents = 0;
+ ip->i_df.if_nextents = 0;
ASSERT(ip->i_d.di_nblocks == 0);
tv = current_time(inode);
@@ -847,21 +838,20 @@ xfs_ialloc(
ip->i_d.di_dmstate = 0;
ip->i_d.di_flags = 0;
- if (ip->i_d.di_version == 3) {
+ if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
inode_set_iversion(inode, 1);
ip->i_d.di_flags2 = 0;
ip->i_d.di_cowextsize = 0;
ip->i_d.di_crtime = tv;
}
-
flags = XFS_ILOG_CORE;
switch (mode & S_IFMT) {
case S_IFIFO:
case S_IFCHR:
case S_IFBLK:
case S_IFSOCK:
- ip->i_d.di_format = XFS_DINODE_FMT_DEV;
+ ip->i_df.if_format = XFS_DINODE_FMT_DEV;
ip->i_df.if_flags = 0;
flags |= XFS_ILOG_DEV;
break;
@@ -907,24 +897,17 @@ xfs_ialloc(
ip->i_d.di_flags |= di_flags;
}
- if (pip &&
- (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) &&
- pip->i_d.di_version == 3 &&
- ip->i_d.di_version == 3) {
- uint64_t di_flags2 = 0;
-
+ if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) {
if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
- di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
}
if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
- di_flags2 |= XFS_DIFLAG2_DAX;
-
- ip->i_d.di_flags2 |= di_flags2;
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX;
}
/* FALLTHROUGH */
case S_IFLNK:
- ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
+ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
ip->i_df.if_flags = XFS_IFEXTENTS;
ip->i_df.if_bytes = 0;
ip->i_df.if_u1.if_root = NULL;
@@ -932,11 +915,6 @@ xfs_ialloc(
default:
ASSERT(0);
}
- /*
- * Attribute fork settings for new inode.
- */
- ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
- ip->i_d.di_anextents = 0;
/*
* Log the new values stuffed into the inode.
@@ -1122,7 +1100,6 @@ xfs_bumplink(
{
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
- ASSERT(ip->i_d.di_version > 1);
inc_nlink(VFS_I(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -1158,8 +1135,7 @@ xfs_create(
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
- xfs_kgid_to_gid(current_fsgid()), prid,
+ error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
@@ -1219,8 +1195,7 @@ xfs_create(
unlock_dp_on_error = false;
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
- resblks ?
- resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+ resblks - XFS_IALLOC_SPACE_RES(mp));
if (error) {
ASSERT(error != -ENOSPC);
goto out_trans_cancel;
@@ -1309,8 +1284,7 @@ xfs_create_tmpfile(
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
- xfs_kgid_to_gid(current_fsgid()), prid,
+ error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
@@ -1655,7 +1629,7 @@ xfs_release(
return 0;
/*
* If we can't get the iolock just skip truncating the blocks
- * past EOF because we could deadlock with the mmap_sem
+ * past EOF because we could deadlock with the mmap_lock
* otherwise. We'll get another chance to drop them once the
* last reference to the inode is dropped, so we'll never leak
* blocks permanently.
@@ -1707,7 +1681,7 @@ xfs_inactive_truncate(
if (error)
goto error_trans_cancel;
- ASSERT(ip->i_d.di_nextents == 0);
+ ASSERT(ip->i_df.if_nextents == 0);
error = xfs_trans_commit(tp);
if (error)
@@ -1765,10 +1739,31 @@ xfs_inactive_ifree(
return error;
}
+ /*
+ * We do not hold the inode locked across the entire rolling transaction
+ * here. We only need to hold it for the first transaction that
+ * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
+ * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
+ * here breaks the relationship between cluster buffer invalidation and
+ * stale inode invalidation on cluster buffer item journal commit
+ * completion, and can result in leaving dirty stale inodes hanging
+ * around in memory.
+ *
+ * We have no need for serialising this inode operation against other
+ * operations - we freed the inode and hence reallocation is required
+ * and that will serialise on reallocating the space the deferops need
+ * to free. Hence we can unlock the inode on the first commit of
+ * the transaction rather than roll it right through the deferops. This
+ * avoids relogging the XFS_ISTALE inode.
+ *
+ * We check that xfs_ifree() hasn't grown an internal transaction roll
+ * by asserting that the inode is still locked when it returns.
+ */
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
error = xfs_ifree(tp, ip);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
if (error) {
/*
* If we fail to free the inode, shut down. The cancel
@@ -1781,7 +1776,6 @@ xfs_inactive_ifree(
xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
}
xfs_trans_cancel(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -1799,7 +1793,6 @@ xfs_inactive_ifree(
xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
__func__, error);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
return 0;
}
@@ -1857,7 +1850,7 @@ xfs_inactive(
if (S_ISREG(VFS_I(ip)->i_mode) &&
(ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
- ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
+ ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
truncate = 1;
error = xfs_qm_dqattach(ip);
@@ -1883,7 +1876,6 @@ xfs_inactive(
}
ASSERT(!ip->i_afp);
- ASSERT(ip->i_d.di_anextents == 0);
ASSERT(ip->i_d.di_forkoff == 0);
/*
@@ -2119,7 +2111,7 @@ xfs_iunlink_update_bucket(
unsigned int bucket_index,
xfs_agino_t new_agino)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
+ struct xfs_agi *agi = agibp->b_addr;
xfs_agino_t old_value;
int offset;
@@ -2135,7 +2127,7 @@ xfs_iunlink_update_bucket(
* head of the list.
*/
if (old_value == new_agino) {
- xfs_buf_corruption_error(agibp);
+ xfs_buf_mark_corrupt(agibp);
return -EFSCORRUPTED;
}
@@ -2173,7 +2165,6 @@ xfs_iunlink_update_dinode(
xfs_dinode_calc_crc(mp, dip);
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
- xfs_inobp_check(mp, ibp);
}
/* Set an in-core inode's unlinked pointer and return the old value. */
@@ -2193,7 +2184,7 @@ xfs_iunlink_update_inode(
ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0);
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0);
if (error)
return error;
@@ -2259,7 +2250,7 @@ xfs_iunlink(
error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
return error;
- agi = XFS_BUF_TO_AGI(agibp);
+ agi = agibp->b_addr;
/*
* Get the index into the agi hash table for the list this inode will
@@ -2269,12 +2260,11 @@ xfs_iunlink(
next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
if (next_agino == agino ||
!xfs_verify_agino_or_null(mp, agno, next_agino)) {
- xfs_buf_corruption_error(agibp);
+ xfs_buf_mark_corrupt(agibp);
return -EFSCORRUPTED;
}
if (next_agino != NULLAGINO) {
- struct xfs_perag *pag;
xfs_agino_t old_agino;
/*
@@ -2291,9 +2281,7 @@ xfs_iunlink(
* agino has been unlinked, add a backref from the next inode
* back to agino.
*/
- pag = xfs_perag_get(mp, agno);
- error = xfs_iunlink_add_backref(pag, agino, next_agino);
- xfs_perag_put(pag);
+ error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino);
if (error)
return error;
}
@@ -2323,7 +2311,7 @@ xfs_iunlink_map_ino(
return error;
}
- error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
+ error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0);
if (error) {
xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
__func__, error);
@@ -2429,7 +2417,6 @@ xfs_iunlink_remove(
struct xfs_buf *agibp;
struct xfs_buf *last_ibp;
struct xfs_dinode *last_dip = NULL;
- struct xfs_perag *pag = NULL;
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
xfs_agino_t next_agino;
@@ -2443,7 +2430,7 @@ xfs_iunlink_remove(
error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
return error;
- agi = XFS_BUF_TO_AGI(agibp);
+ agi = agibp->b_addr;
/*
* Get the index into the agi hash table for the list this inode will
@@ -2473,32 +2460,22 @@ xfs_iunlink_remove(
* this inode's backref to point from the next inode.
*/
if (next_agino != NULLAGINO) {
- pag = xfs_perag_get(mp, agno);
- error = xfs_iunlink_change_backref(pag, next_agino,
+ error = xfs_iunlink_change_backref(agibp->b_pag, next_agino,
NULLAGINO);
if (error)
- goto out;
+ return error;
}
- if (head_agino == agino) {
- /* Point the head of the list to the next unlinked inode. */
- error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
- next_agino);
- if (error)
- goto out;
- } else {
+ if (head_agino != agino) {
struct xfs_imap imap;
xfs_agino_t prev_agino;
- if (!pag)
- pag = xfs_perag_get(mp, agno);
-
/* We need to search the list for the inode being freed. */
error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
&prev_agino, &imap, &last_dip, &last_ibp,
- pag);
+ agibp->b_pag);
if (error)
- goto out;
+ return error;
/* Point the previous inode on the list to the next inode. */
xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
@@ -2512,15 +2489,106 @@ xfs_iunlink_remove(
* change_backref takes care of deleting the backref if
* next_agino is NULLAGINO.
*/
- error = xfs_iunlink_change_backref(pag, agino, next_agino);
- if (error)
- goto out;
+ return xfs_iunlink_change_backref(agibp->b_pag, agino,
+ next_agino);
}
-out:
- if (pag)
- xfs_perag_put(pag);
- return error;
+ /* Point the head of the list to the next unlinked inode. */
+ return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
+ next_agino);
+}
+
+/*
+ * Look up the inode number specified and if it is not already marked XFS_ISTALE
+ * mark it stale. We should only find clean inodes in this lookup that aren't
+ * already stale.
+ */
+static void
+xfs_ifree_mark_inode_stale(
+ struct xfs_buf *bp,
+ struct xfs_inode *free_ip,
+ xfs_ino_t inum)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_perag *pag = bp->b_pag;
+ struct xfs_inode_log_item *iip;
+ struct xfs_inode *ip;
+
+retry:
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
+
+ /* Inode not in memory, nothing to do */
+ if (!ip) {
+ rcu_read_unlock();
+ return;
+ }
+
+ /*
+ * because this is an RCU protected lookup, we could find a recently
+ * freed or even reallocated inode during the lookup. We need to check
+ * under the i_flags_lock for a valid inode here. Skip it if it is not
+ * valid, the wrong inode or stale.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) {
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ return;
+ }
+
+ /*
+ * Don't try to lock/unlock the current inode, but we _cannot_ skip the
+ * other inodes that we did not find in the list attached to the buffer
+ * and are not already marked stale. If we can't lock it, back off and
+ * retry.
+ */
+ if (ip != free_ip) {
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ delay(1);
+ goto retry;
+ }
+ }
+ ip->i_flags |= XFS_ISTALE;
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+
+ /*
+ * If we can't get the flush lock, the inode is already attached. All
+ * we needed to do here is mark the inode stale so buffer IO completion
+ * will remove it from the AIL.
+ */
+ iip = ip->i_itemp;
+ if (!xfs_iflock_nowait(ip)) {
+ ASSERT(!list_empty(&iip->ili_item.li_bio_list));
+ ASSERT(iip->ili_last_fields);
+ goto out_iunlock;
+ }
+
+ /*
+ * Inodes not attached to the buffer can be released immediately.
+ * Everything else has to go through xfs_iflush_abort() on journal
+ * commit as the flock synchronises removal of the inode from the
+ * cluster buffer against inode reclaim.
+ */
+ if (!iip || list_empty(&iip->ili_item.li_bio_list)) {
+ xfs_ifunlock(ip);
+ goto out_iunlock;
+ }
+
+ /* we have a dirty inode in memory that has not yet been flushed. */
+ spin_lock(&iip->ili_lock);
+ iip->ili_last_fields = iip->ili_fields;
+ iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
+ spin_unlock(&iip->ili_lock);
+ ASSERT(iip->ili_last_fields);
+
+out_iunlock:
+ if (ip != free_ip)
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
/*
@@ -2530,26 +2598,20 @@ out:
*/
STATIC int
xfs_ifree_cluster(
- xfs_inode_t *free_ip,
- xfs_trans_t *tp,
+ struct xfs_inode *free_ip,
+ struct xfs_trans *tp,
struct xfs_icluster *xic)
{
- xfs_mount_t *mp = free_ip->i_mount;
+ struct xfs_mount *mp = free_ip->i_mount;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ struct xfs_buf *bp;
+ xfs_daddr_t blkno;
+ xfs_ino_t inum = xic->first_ino;
int nbufs;
int i, j;
int ioffset;
- xfs_daddr_t blkno;
- xfs_buf_t *bp;
- xfs_inode_t *ip;
- xfs_inode_log_item_t *iip;
- struct xfs_log_item *lip;
- struct xfs_perag *pag;
- struct xfs_ino_geometry *igeo = M_IGEO(mp);
- xfs_ino_t inum;
int error;
- inum = xic->first_ino;
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
@@ -2593,148 +2655,20 @@ xfs_ifree_cluster(
bp->b_ops = &xfs_inode_buf_ops;
/*
- * Walk the inodes already attached to the buffer and mark them
- * stale. These will all have the flush locks held, so an
- * in-memory inode walk can't lock them. By marking them all
- * stale first, we will not attempt to lock them in the loop
- * below as the XFS_ISTALE flag will be set.
+ * Now we need to set all the cached clean inodes as XFS_ISTALE,
+ * too. This requires lookups, and will skip inodes that we've
+ * already marked XFS_ISTALE.
*/
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
- if (lip->li_type == XFS_LI_INODE) {
- iip = (xfs_inode_log_item_t *)lip;
- ASSERT(iip->ili_logged == 1);
- lip->li_cb = xfs_istale_done;
- xfs_trans_ail_copy_lsn(mp->m_ail,
- &iip->ili_flush_lsn,
- &iip->ili_item.li_lsn);
- xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
- }
- }
-
-
- /*
- * For each inode in memory attempt to add it to the inode
- * buffer and set it up for being staled on buffer IO
- * completion. This is safe as we've locked out tail pushing
- * and flushing by locking the buffer.
- *
- * We have already marked every inode that was part of a
- * transaction stale above, which means there is no point in
- * even trying to lock them.
- */
- for (i = 0; i < igeo->inodes_per_cluster; i++) {
-retry:
- rcu_read_lock();
- ip = radix_tree_lookup(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, (inum + i)));
-
- /* Inode not in memory, nothing to do */
- if (!ip) {
- rcu_read_unlock();
- continue;
- }
-
- /*
- * because this is an RCU protected lookup, we could
- * find a recently freed or even reallocated inode
- * during the lookup. We need to check under the
- * i_flags_lock for a valid inode here. Skip it if it
- * is not valid, the wrong inode or stale.
- */
- spin_lock(&ip->i_flags_lock);
- if (ip->i_ino != inum + i ||
- __xfs_iflags_test(ip, XFS_ISTALE)) {
- spin_unlock(&ip->i_flags_lock);
- rcu_read_unlock();
- continue;
- }
- spin_unlock(&ip->i_flags_lock);
-
- /*
- * Don't try to lock/unlock the current inode, but we
- * _cannot_ skip the other inodes that we did not find
- * in the list attached to the buffer and are not
- * already marked stale. If we can't lock it, back off
- * and retry.
- */
- if (ip != free_ip) {
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
- rcu_read_unlock();
- delay(1);
- goto retry;
- }
-
- /*
- * Check the inode number again in case we're
- * racing with freeing in xfs_reclaim_inode().
- * See the comments in that function for more
- * information as to why the initial check is
- * not sufficient.
- */
- if (ip->i_ino != inum + i) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- rcu_read_unlock();
- continue;
- }
- }
- rcu_read_unlock();
-
- xfs_iflock(ip);
- xfs_iflags_set(ip, XFS_ISTALE);
-
- /*
- * we don't need to attach clean inodes or those only
- * with unlogged changes (which we throw away, anyway).
- */
- iip = ip->i_itemp;
- if (!iip || xfs_inode_clean(ip)) {
- ASSERT(ip != free_ip);
- xfs_ifunlock(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- continue;
- }
-
- iip->ili_last_fields = iip->ili_fields;
- iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
- iip->ili_logged = 1;
- xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
- &iip->ili_item.li_lsn);
-
- xfs_buf_attach_iodone(bp, xfs_istale_done,
- &iip->ili_item);
-
- if (ip != free_ip)
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
+ for (i = 0; i < igeo->inodes_per_cluster; i++)
+ xfs_ifree_mark_inode_stale(bp, free_ip, inum + i);
xfs_trans_stale_inode_buf(tp, bp);
xfs_trans_binval(tp, bp);
}
-
- xfs_perag_put(pag);
return 0;
}
/*
- * Free any local-format buffers sitting around before we reset to
- * extents format.
- */
-static inline void
-xfs_ifree_local_data(
- struct xfs_inode *ip,
- int whichfork)
-{
- struct xfs_ifork *ifp;
-
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
- return;
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
-}
-
-/*
* This is called to return an inode to the inode free list.
* The inode should already be truncated to 0 length and have
* no pages associated with it. This routine also assumes that
@@ -2751,11 +2685,11 @@ xfs_ifree(
{
int error;
struct xfs_icluster xic = { 0 };
+ struct xfs_inode_log_item *iip = ip->i_itemp;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(VFS_I(ip)->i_nlink == 0);
- ASSERT(ip->i_d.di_nextents == 0);
- ASSERT(ip->i_d.di_anextents == 0);
+ ASSERT(ip->i_df.if_nextents == 0);
ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
ASSERT(ip->i_d.di_nblocks == 0);
@@ -2770,19 +2704,28 @@ xfs_ifree(
if (error)
return error;
- xfs_ifree_local_data(ip, XFS_DATA_FORK);
- xfs_ifree_local_data(ip, XFS_ATTR_FORK);
+ /*
+ * Free any local-format data sitting around before we reset the
+ * data fork to extents format. Note that the attr fork data has
+ * already been freed by xfs_attr_inactive.
+ */
+ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ kmem_free(ip->i_df.if_u1.if_data);
+ ip->i_df.if_u1.if_data = NULL;
+ ip->i_df.if_bytes = 0;
+ }
VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
ip->i_d.di_flags = 0;
ip->i_d.di_flags2 = 0;
ip->i_d.di_dmevmask = 0;
ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
- ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
- ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
/* Don't attempt to replay owner changes for a deleted inode */
- ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER);
+ spin_lock(&iip->ili_lock);
+ iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
+ spin_unlock(&iip->ili_lock);
/*
* Bump the generation count so no one will be confused
@@ -3162,7 +3105,7 @@ out_trans_abort:
/*
* xfs_rename_alloc_whiteout()
*
- * Return a referenced, unlinked, unlocked inode that that can be used as a
+ * Return a referenced, unlinked, unlocked inode that can be used as a
* whiteout in a rename transaction. We use a tmpfile inode here so that if we
* crash between allocating the inode and linking it into the rename transaction
* recovery will free the inode and we won't leak it.
@@ -3489,374 +3432,76 @@ out_release_wip:
return error;
}
-STATIC int
-xfs_iflush_cluster(
- struct xfs_inode *ip,
- struct xfs_buf *bp)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_perag *pag;
- unsigned long first_index, mask;
- int cilist_size;
- struct xfs_inode **cilist;
- struct xfs_inode *cip;
- struct xfs_ino_geometry *igeo = M_IGEO(mp);
- int nr_found;
- int clcount = 0;
- int i;
-
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-
- cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *);
- cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
- if (!cilist)
- goto out_put;
-
- mask = ~(igeo->inodes_per_cluster - 1);
- first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
- rcu_read_lock();
- /* really need a gang lookup range call here */
- nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
- first_index, igeo->inodes_per_cluster);
- if (nr_found == 0)
- goto out_free;
-
- for (i = 0; i < nr_found; i++) {
- cip = cilist[i];
- if (cip == ip)
- continue;
-
- /*
- * because this is an RCU protected lookup, we could find a
- * recently freed or even reallocated inode during the lookup.
- * We need to check under the i_flags_lock for a valid inode
- * here. Skip it if it is not valid or the wrong inode.
- */
- spin_lock(&cip->i_flags_lock);
- if (!cip->i_ino ||
- __xfs_iflags_test(cip, XFS_ISTALE)) {
- spin_unlock(&cip->i_flags_lock);
- continue;
- }
-
- /*
- * Once we fall off the end of the cluster, no point checking
- * any more inodes in the list because they will also all be
- * outside the cluster.
- */
- if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
- spin_unlock(&cip->i_flags_lock);
- break;
- }
- spin_unlock(&cip->i_flags_lock);
-
- /*
- * Do an un-protected check to see if the inode is dirty and
- * is a candidate for flushing. These checks will be repeated
- * later after the appropriate locks are acquired.
- */
- if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
- continue;
-
- /*
- * Try to get locks. If any are unavailable or it is pinned,
- * then this inode cannot be flushed and is skipped.
- */
-
- if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
- continue;
- if (!xfs_iflock_nowait(cip)) {
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- continue;
- }
- if (xfs_ipincount(cip)) {
- xfs_ifunlock(cip);
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- continue;
- }
-
-
- /*
- * Check the inode number again, just to be certain we are not
- * racing with freeing in xfs_reclaim_inode(). See the comments
- * in that function for more information as to why the initial
- * check is not sufficient.
- */
- if (!cip->i_ino) {
- xfs_ifunlock(cip);
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- continue;
- }
-
- /*
- * arriving here means that this inode can be flushed. First
- * re-check that it's dirty before flushing.
- */
- if (!xfs_inode_clean(cip)) {
- int error;
- error = xfs_iflush_int(cip, bp);
- if (error) {
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- goto cluster_corrupt_out;
- }
- clcount++;
- } else {
- xfs_ifunlock(cip);
- }
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- }
-
- if (clcount) {
- XFS_STATS_INC(mp, xs_icluster_flushcnt);
- XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
- }
-
-out_free:
- rcu_read_unlock();
- kmem_free(cilist);
-out_put:
- xfs_perag_put(pag);
- return 0;
-
-
-cluster_corrupt_out:
- /*
- * Corruption detected in the clustering loop. Invalidate the
- * inode buffer and shut down the filesystem.
- */
- rcu_read_unlock();
-
- /*
- * We'll always have an inode attached to the buffer for completion
- * process by the time we are called from xfs_iflush(). Hence we have
- * always need to do IO completion processing to abort the inodes
- * attached to the buffer. handle them just like the shutdown case in
- * xfs_buf_submit().
- */
- ASSERT(bp->b_iodone);
- bp->b_flags |= XBF_ASYNC;
- bp->b_flags &= ~XBF_DONE;
- xfs_buf_stale(bp);
- xfs_buf_ioerror(bp, -EIO);
- xfs_buf_ioend(bp);
-
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-
- /* abort the corrupt inode, as it was not attached to the buffer */
- xfs_iflush_abort(cip, false);
- kmem_free(cilist);
- xfs_perag_put(pag);
- return -EFSCORRUPTED;
-}
-
-/*
- * Flush dirty inode metadata into the backing buffer.
- *
- * The caller must have the inode lock and the inode flush lock held. The
- * inode lock will still be held upon return to the caller, and the inode
- * flush lock will be released after the inode has reached the disk.
- *
- * The caller must write out the buffer returned in *bpp and release it.
- */
-int
+static int
xfs_iflush(
struct xfs_inode *ip,
- struct xfs_buf **bpp)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_buf *bp = NULL;
- struct xfs_dinode *dip;
- int error;
-
- XFS_STATS_INC(mp, xs_iflush_count);
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(xfs_isiflocked(ip));
- ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
- ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
-
- *bpp = NULL;
-
- xfs_iunpin_wait(ip);
-
- /*
- * For stale inodes we cannot rely on the backing buffer remaining
- * stale in cache for the remaining life of the stale inode and so
- * xfs_imap_to_bp() below may give us a buffer that no longer contains
- * inodes below. We have to check this after ensuring the inode is
- * unpinned so that it is safe to reclaim the stale inode after the
- * flush call.
- */
- if (xfs_iflags_test(ip, XFS_ISTALE)) {
- xfs_ifunlock(ip);
- return 0;
- }
-
- /*
- * This may have been unpinned because the filesystem is shutting
- * down forcibly. If that's the case we must not write this inode
- * to disk, because the log record didn't make it to disk.
- *
- * We also have to remove the log item from the AIL in this case,
- * as we wait for an empty AIL as part of the unmount process.
- */
- if (XFS_FORCED_SHUTDOWN(mp)) {
- error = -EIO;
- goto abort_out;
- }
-
- /*
- * Get the buffer containing the on-disk inode. We are doing a try-lock
- * operation here, so we may get an EAGAIN error. In that case, we
- * simply want to return with the inode still dirty.
- *
- * If we get any other error, we effectively have a corruption situation
- * and we cannot flush the inode, so we treat it the same as failing
- * xfs_iflush_int().
- */
- error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
- 0);
- if (error == -EAGAIN) {
- xfs_ifunlock(ip);
- return error;
- }
- if (error)
- goto corrupt_out;
-
- /*
- * First flush out the inode that xfs_iflush was called with.
- */
- error = xfs_iflush_int(ip, bp);
- if (error)
- goto corrupt_out;
-
- /*
- * If the buffer is pinned then push on the log now so we won't
- * get stuck waiting in the write for too long.
- */
- if (xfs_buf_ispinned(bp))
- xfs_log_force(mp, 0);
-
- /*
- * inode clustering: try to gather other inodes into this write
- *
- * Note: Any error during clustering will result in the filesystem
- * being shut down and completion callbacks run on the cluster buffer.
- * As we have already flushed and attached this inode to the buffer,
- * it has already been aborted and released by xfs_iflush_cluster() and
- * so we have no further error handling to do here.
- */
- error = xfs_iflush_cluster(ip, bp);
- if (error)
- return error;
-
- *bpp = bp;
- return 0;
-
-corrupt_out:
- if (bp)
- xfs_buf_relse(bp);
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-abort_out:
- /* abort the corrupt inode, as it was not attached to the buffer */
- xfs_iflush_abort(ip, false);
- return error;
-}
-
-/*
- * If there are inline format data / attr forks attached to this inode,
- * make sure they're not corrupt.
- */
-bool
-xfs_inode_verify_forks(
- struct xfs_inode *ip)
-{
- struct xfs_ifork *ifp;
- xfs_failaddr_t fa;
-
- fa = xfs_ifork_verify_data(ip, &xfs_default_ifork_ops);
- if (fa) {
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
- ifp->if_u1.if_data, ifp->if_bytes, fa);
- return false;
- }
-
- fa = xfs_ifork_verify_attr(ip, &xfs_default_ifork_ops);
- if (fa) {
- ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
- ifp ? ifp->if_u1.if_data : NULL,
- ifp ? ifp->if_bytes : 0, fa);
- return false;
- }
- return true;
-}
-
-STATIC int
-xfs_iflush_int(
- struct xfs_inode *ip,
struct xfs_buf *bp)
{
struct xfs_inode_log_item *iip = ip->i_itemp;
struct xfs_dinode *dip;
struct xfs_mount *mp = ip->i_mount;
+ int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
ASSERT(xfs_isiflocked(ip));
- ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
- ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
- ASSERT(iip != NULL && iip->ili_fields != 0);
- ASSERT(ip->i_d.di_version > 1);
+ ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
+ ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
+ ASSERT(iip->ili_item.li_buf == bp);
- /* set *dip = inode's place in the buffer */
dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
+ /*
+ * We don't flush the inode if any of the following checks fail, but we
+ * do still update the log item and attach to the backing buffer as if
+ * the flush happened. This is a formality to facilitate predictable
+ * error handling as the caller will shutdown and fail the buffer.
+ */
+ error = -EFSCORRUPTED;
if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
mp, XFS_ERRTAG_IFLUSH_1)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT,
__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
- goto corrupt_out;
+ goto flush_out;
}
if (S_ISREG(VFS_I(ip)->i_mode)) {
if (XFS_TEST_ERROR(
- (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
- (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
+ ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
mp, XFS_ERRTAG_IFLUSH_3)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad regular inode %Lu, ptr "PTR_FMT,
__func__, ip->i_ino, ip);
- goto corrupt_out;
+ goto flush_out;
}
} else if (S_ISDIR(VFS_I(ip)->i_mode)) {
if (XFS_TEST_ERROR(
- (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
- (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
- (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
+ ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
+ ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
mp, XFS_ERRTAG_IFLUSH_4)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad directory inode %Lu, ptr "PTR_FMT,
__func__, ip->i_ino, ip);
- goto corrupt_out;
+ goto flush_out;
}
}
- if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
+ if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) >
ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: detected corrupt incore inode %Lu, "
"total extents = %d, nblocks = %Ld, ptr "PTR_FMT,
__func__, ip->i_ino,
- ip->i_d.di_nextents + ip->i_d.di_anextents,
+ ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp),
ip->i_d.di_nblocks, ip);
- goto corrupt_out;
+ goto flush_out;
}
if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
mp, XFS_ERRTAG_IFLUSH_6)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT,
__func__, ip->i_ino, ip->i_d.di_forkoff, ip);
- goto corrupt_out;
+ goto flush_out;
}
/*
@@ -3868,12 +3513,19 @@ xfs_iflush_int(
* backwards compatibility with old kernels that predate logging all
* inode changes.
*/
- if (ip->i_d.di_version < 3)
+ if (!xfs_sb_version_has_v3inode(&mp->m_sb))
ip->i_d.di_flushiter++;
- /* Check the inline fork data before we write out. */
- if (!xfs_inode_verify_forks(ip))
- goto corrupt_out;
+ /*
+ * If there are inline format data / attr forks attached to this inode,
+ * make sure they are not corrupt.
+ */
+ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL &&
+ xfs_ifork_verify_local_data(ip))
+ goto flush_out;
+ if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL &&
+ xfs_ifork_verify_local_attr(ip))
+ goto flush_out;
/*
* Copy the dirty parts of the inode into the on-disk inode. We always
@@ -3889,7 +3541,6 @@ xfs_iflush_int(
xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
if (XFS_IFORK_Q(ip))
xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
- xfs_inobp_check(mp, bp);
/*
* We've recorded everything logged in the inode, so we'd like to clear
@@ -3906,41 +3557,148 @@ xfs_iflush_int(
* know that the information those bits represent is permanently on
* disk. As long as the flush completes before the inode is logged
* again, then both ili_fields and ili_last_fields will be cleared.
- *
- * We can play with the ili_fields bits here, because the inode lock
- * must be held exclusively in order to set bits there and the flush
- * lock protects the ili_last_fields bits. Set ili_logged so the flush
- * done routine can tell whether or not to look in the AIL. Also, store
- * the current LSN of the inode so that we can tell whether the item has
- * moved in the AIL from xfs_iflush_done(). In order to read the lsn we
- * need the AIL lock, because it is a 64 bit value that cannot be read
- * atomically.
*/
+ error = 0;
+flush_out:
+ spin_lock(&iip->ili_lock);
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
iip->ili_fsync_fields = 0;
- iip->ili_logged = 1;
+ spin_unlock(&iip->ili_lock);
+ /*
+ * Store the current LSN of the inode so that we can tell whether the
+ * item has moved in the AIL from xfs_iflush_done().
+ */
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
&iip->ili_item.li_lsn);
+ /* generate the checksum. */
+ xfs_dinode_calc_crc(mp, dip);
+ return error;
+}
+
+/*
+ * Non-blocking flush of dirty inode metadata into the backing buffer.
+ *
+ * The caller must have a reference to the inode and hold the cluster buffer
+ * locked. The function will walk across all the inodes on the cluster buffer it
+ * can find and lock without blocking, and flush them to the cluster buffer.
+ *
+ * On successful flushing of at least one inode, the caller must write out the
+ * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
+ * the caller needs to release the buffer. On failure, the filesystem will be
+ * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED
+ * will be returned.
+ */
+int
+xfs_iflush_cluster(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_log_item *lip, *n;
+ struct xfs_inode *ip;
+ struct xfs_inode_log_item *iip;
+ int clcount = 0;
+ int error = 0;
+
/*
- * Attach the function xfs_iflush_done to the inode's
- * buffer. This will remove the inode from the AIL
- * and unlock the inode's flush lock when the inode is
- * completely written to disk.
+ * We must use the safe variant here as on shutdown xfs_iflush_abort()
+ * can remove itself from the list.
*/
- xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
+ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
+ iip = (struct xfs_inode_log_item *)lip;
+ ip = iip->ili_inode;
- /* generate the checksum. */
- xfs_dinode_calc_crc(mp, dip);
+ /*
+ * Quick and dirty check to avoid locks if possible.
+ */
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
+ continue;
+ if (xfs_ipincount(ip))
+ continue;
+
+ /*
+ * The inode is still attached to the buffer, which means it is
+ * dirty but reclaim might try to grab it. Check carefully for
+ * that, and grab the ilock while still holding the i_flags_lock
+ * to guarantee reclaim will not be able to reclaim this inode
+ * once we drop the i_flags_lock.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) {
+ spin_unlock(&ip->i_flags_lock);
+ continue;
+ }
+
+ /*
+ * ILOCK will pin the inode against reclaim and prevent
+ * concurrent transactions modifying the inode while we are
+ * flushing the inode.
+ */
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+ spin_unlock(&ip->i_flags_lock);
+ continue;
+ }
+ spin_unlock(&ip->i_flags_lock);
- ASSERT(!list_empty(&bp->b_li_list));
- ASSERT(bp->b_iodone != NULL);
+ /*
+ * Skip inodes that are already flush locked as they have
+ * already been written to the buffer.
+ */
+ if (!xfs_iflock_nowait(ip)) {
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ continue;
+ }
+
+ /*
+ * Abort flushing this inode if we are shut down because the
+ * inode may not currently be in the AIL. This can occur when
+ * log I/O failure unpins the inode without inserting into the
+ * AIL, leaving a dirty/unpinned inode attached to the buffer
+ * that otherwise looks like it should be flushed.
+ */
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_iunpin_wait(ip);
+ /* xfs_iflush_abort() drops the flush lock */
+ xfs_iflush_abort(ip);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ error = -EIO;
+ continue;
+ }
+
+ /* don't block waiting on a log force to unpin dirty inodes */
+ if (xfs_ipincount(ip)) {
+ xfs_ifunlock(ip);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ continue;
+ }
+
+ if (!xfs_inode_clean(ip))
+ error = xfs_iflush(ip, bp);
+ else
+ xfs_ifunlock(ip);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ if (error)
+ break;
+ clcount++;
+ }
+
+ if (error) {
+ bp->b_flags |= XBF_ASYNC;
+ xfs_buf_ioend_fail(bp);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+ }
+
+ if (!clcount)
+ return -EAGAIN;
+
+ XFS_STATS_INC(mp, xs_icluster_flushcnt);
+ XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
return 0;
-corrupt_out:
- return -EFSCORRUPTED;
}
/* Release an inode. */
@@ -3951,3 +3709,115 @@ xfs_irele(
trace_xfs_irele(ip, _RET_IP_);
iput(VFS_I(ip));
}
+
+/*
+ * Ensure all commited transactions touching the inode are written to the log.
+ */
+int
+xfs_log_force_inode(
+ struct xfs_inode *ip)
+{
+ xfs_lsn_t lsn = 0;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (xfs_ipincount(ip))
+ lsn = ip->i_itemp->ili_last_lsn;
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!lsn)
+ return 0;
+ return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL);
+}
+
+/*
+ * Grab the exclusive iolock for a data copy from src to dest, making sure to
+ * abide vfs locking order (lowest pointer value goes first) and breaking the
+ * layout leases before proceeding. The loop is needed because we cannot call
+ * the blocking break_layout() with the iolocks held, and therefore have to
+ * back out both locks.
+ */
+static int
+xfs_iolock_two_inodes_and_break_layout(
+ struct inode *src,
+ struct inode *dest)
+{
+ int error;
+
+ if (src > dest)
+ swap(src, dest);
+
+retry:
+ /* Wait to break both inodes' layouts before we start locking. */
+ error = break_layout(src, true);
+ if (error)
+ return error;
+ if (src != dest) {
+ error = break_layout(dest, true);
+ if (error)
+ return error;
+ }
+
+ /* Lock one inode and make sure nobody got in and leased it. */
+ inode_lock(src);
+ error = break_layout(src, false);
+ if (error) {
+ inode_unlock(src);
+ if (error == -EWOULDBLOCK)
+ goto retry;
+ return error;
+ }
+
+ if (src == dest)
+ return 0;
+
+ /* Lock the other inode and make sure nobody got in and leased it. */
+ inode_lock_nested(dest, I_MUTEX_NONDIR2);
+ error = break_layout(dest, false);
+ if (error) {
+ inode_unlock(src);
+ inode_unlock(dest);
+ if (error == -EWOULDBLOCK)
+ goto retry;
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
+ * mmap activity.
+ */
+int
+xfs_ilock2_io_mmap(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ int ret;
+
+ ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
+ if (ret)
+ return ret;
+ if (ip1 == ip2)
+ xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
+ else
+ xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL,
+ ip2, XFS_MMAPLOCK_EXCL);
+ return 0;
+}
+
+/* Unlock both inodes to allow IO and mmap activity. */
+void
+xfs_iunlock2_io_mmap(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ bool same_inode = (ip1 == ip2);
+
+ xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+ if (!same_inode)
+ xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+ inode_unlock(VFS_I(ip2));
+ if (!same_inode)
+ inode_unlock(VFS_I(ip1));
+}