diff options
author | Jiri Kosina <jkosina@suse.cz> | 2020-09-01 14:19:48 +0200 |
---|---|---|
committer | Jiri Kosina <jkosina@suse.cz> | 2020-09-01 14:19:48 +0200 |
commit | ead5d1f4d877e92c051e1a1ade623d0d30e71619 (patch) | |
tree | cb9db5698a546e7b96f7d5bef5ce544629dd37a2 /fs/xfs/xfs_inode.c | |
parent | scif: Fix spelling of EACCES (diff) | |
parent | Merge tag 'docs-5.9-3' of git://git.lwn.net/linux (diff) | |
download | linux-ead5d1f4d877e92c051e1a1ade623d0d30e71619.tar.xz linux-ead5d1f4d877e92c051e1a1ade623d0d30e71619.zip |
Merge branch 'master' into for-next
Sync with Linus' branch in order to be able to apply fixups
of more recent patches.
Diffstat (limited to 'fs/xfs/xfs_inode.c')
-rw-r--r-- | fs/xfs/xfs_inode.c | 1062 |
1 files changed, 466 insertions, 596 deletions
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c5077e6326c7..c06129cffba9 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -44,7 +44,6 @@ kmem_zone_t *xfs_inode_zone; */ #define XFS_ITRUNC_MAX_EXTENTS 2 -STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *); STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *); @@ -112,7 +111,7 @@ xfs_ilock_data_map_shared( { uint lock_mode = XFS_ILOCK_SHARED; - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE && (ip->i_df.if_flags & XFS_IFEXTENTS) == 0) lock_mode = XFS_ILOCK_EXCL; xfs_ilock(ip, lock_mode); @@ -125,7 +124,8 @@ xfs_ilock_attr_map_shared( { uint lock_mode = XFS_ILOCK_SHARED; - if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE && + if (ip->i_afp && + ip->i_afp->if_format == XFS_DINODE_FMT_BTREE && (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0) lock_mode = XFS_ILOCK_EXCL; xfs_ilock(ip, lock_mode); @@ -144,17 +144,17 @@ xfs_ilock_attr_map_shared( * * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock * - * mmap_sem locking order: + * mmap_lock locking order: * - * i_rwsem -> page lock -> mmap_sem - * mmap_sem -> i_mmap_lock -> page_lock + * i_rwsem -> page lock -> mmap_lock + * mmap_lock -> i_mmap_lock -> page_lock * - * The difference in mmap_sem locking order mean that we cannot hold the + * The difference in mmap_lock locking order mean that we cannot hold the * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can - * fault in pages during copy in/out (for buffered IO) or require the mmap_sem + * fault in pages during copy in/out (for buffered IO) or require the mmap_lock * in get_user_pages() to map the user pages into the kernel address space for * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because - * page faults already hold the mmap_sem. + * page faults already hold the mmap_lock. * * Hence to serialise fully against both syscall and mmap based IO, we need to * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both @@ -451,7 +451,7 @@ xfs_lock_inodes( /* * Currently supports between 2 and 5 inodes with exclusive locking. We * support an arbitrary depth of locking here, but absolute limits on - * inodes depend on the the type of locking and the limits placed by + * inodes depend on the type of locking and the limits placed by * lockdep annotations in xfs_lock_inumorder. These are all checked by * the asserts. */ @@ -801,26 +801,18 @@ xfs_ialloc( return error; ASSERT(ip != NULL); inode = VFS_I(ip); - - /* - * We always convert v1 inodes to v2 now - we only support filesystems - * with >= v2 inode capability, so there is no reason for ever leaving - * an inode in v1 format. - */ - if (ip->i_d.di_version == 1) - ip->i_d.di_version = 2; - inode->i_mode = mode; set_nlink(inode, nlink); - ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); - ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); + inode->i_uid = current_fsuid(); inode->i_rdev = rdev; ip->i_d.di_projid = prid; if (pip && XFS_INHERIT_GID(pip)) { - ip->i_d.di_gid = pip->i_d.di_gid; + inode->i_gid = VFS_I(pip)->i_gid; if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode)) inode->i_mode |= S_ISGID; + } else { + inode->i_gid = current_fsgid(); } /* @@ -828,13 +820,12 @@ xfs_ialloc( * ID or one of the supplementary group IDs, the S_ISGID bit is cleared * (and only if the irix_sgid_inherit compatibility variable is set). */ - if ((irix_sgid_inherit) && - (inode->i_mode & S_ISGID) && - (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) + if (irix_sgid_inherit && + (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid)) inode->i_mode &= ~S_ISGID; ip->i_d.di_size = 0; - ip->i_d.di_nextents = 0; + ip->i_df.if_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); tv = current_time(inode); @@ -847,21 +838,20 @@ xfs_ialloc( ip->i_d.di_dmstate = 0; ip->i_d.di_flags = 0; - if (ip->i_d.di_version == 3) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { inode_set_iversion(inode, 1); ip->i_d.di_flags2 = 0; ip->i_d.di_cowextsize = 0; ip->i_d.di_crtime = tv; } - flags = XFS_ILOG_CORE; switch (mode & S_IFMT) { case S_IFIFO: case S_IFCHR: case S_IFBLK: case S_IFSOCK: - ip->i_d.di_format = XFS_DINODE_FMT_DEV; + ip->i_df.if_format = XFS_DINODE_FMT_DEV; ip->i_df.if_flags = 0; flags |= XFS_ILOG_DEV; break; @@ -907,24 +897,17 @@ xfs_ialloc( ip->i_d.di_flags |= di_flags; } - if (pip && - (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && - pip->i_d.di_version == 3 && - ip->i_d.di_version == 3) { - uint64_t di_flags2 = 0; - + if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) { if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { - di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; } if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) - di_flags2 |= XFS_DIFLAG2_DAX; - - ip->i_d.di_flags2 |= di_flags2; + ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX; } /* FALLTHROUGH */ case S_IFLNK: - ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; ip->i_df.if_flags = XFS_IFEXTENTS; ip->i_df.if_bytes = 0; ip->i_df.if_u1.if_root = NULL; @@ -932,11 +915,6 @@ xfs_ialloc( default: ASSERT(0); } - /* - * Attribute fork settings for new inode. - */ - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - ip->i_d.di_anextents = 0; /* * Log the new values stuffed into the inode. @@ -1122,7 +1100,6 @@ xfs_bumplink( { xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - ASSERT(ip->i_d.di_version > 1); inc_nlink(VFS_I(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -1158,8 +1135,7 @@ xfs_create( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), - xfs_kgid_to_gid(current_fsgid()), prid, + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1219,8 +1195,7 @@ xfs_create( unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, - resblks ? - resblks - XFS_IALLOC_SPACE_RES(mp) : 0); + resblks - XFS_IALLOC_SPACE_RES(mp)); if (error) { ASSERT(error != -ENOSPC); goto out_trans_cancel; @@ -1309,8 +1284,7 @@ xfs_create_tmpfile( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), - xfs_kgid_to_gid(current_fsgid()), prid, + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1655,7 +1629,7 @@ xfs_release( return 0; /* * If we can't get the iolock just skip truncating the blocks - * past EOF because we could deadlock with the mmap_sem + * past EOF because we could deadlock with the mmap_lock * otherwise. We'll get another chance to drop them once the * last reference to the inode is dropped, so we'll never leak * blocks permanently. @@ -1707,7 +1681,7 @@ xfs_inactive_truncate( if (error) goto error_trans_cancel; - ASSERT(ip->i_d.di_nextents == 0); + ASSERT(ip->i_df.if_nextents == 0); error = xfs_trans_commit(tp); if (error) @@ -1765,10 +1739,31 @@ xfs_inactive_ifree( return error; } + /* + * We do not hold the inode locked across the entire rolling transaction + * here. We only need to hold it for the first transaction that + * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the + * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode + * here breaks the relationship between cluster buffer invalidation and + * stale inode invalidation on cluster buffer item journal commit + * completion, and can result in leaving dirty stale inodes hanging + * around in memory. + * + * We have no need for serialising this inode operation against other + * operations - we freed the inode and hence reallocation is required + * and that will serialise on reallocating the space the deferops need + * to free. Hence we can unlock the inode on the first commit of + * the transaction rather than roll it right through the deferops. This + * avoids relogging the XFS_ISTALE inode. + * + * We check that xfs_ifree() hasn't grown an internal transaction roll + * by asserting that the inode is still locked when it returns. + */ xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); error = xfs_ifree(tp, ip); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (error) { /* * If we fail to free the inode, shut down. The cancel @@ -1781,7 +1776,6 @@ xfs_inactive_ifree( xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); } xfs_trans_cancel(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1799,7 +1793,6 @@ xfs_inactive_ifree( xfs_notice(mp, "%s: xfs_trans_commit returned error %d", __func__, error); - xfs_iunlock(ip, XFS_ILOCK_EXCL); return 0; } @@ -1857,7 +1850,7 @@ xfs_inactive( if (S_ISREG(VFS_I(ip)->i_mode) && (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || - ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) + ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) truncate = 1; error = xfs_qm_dqattach(ip); @@ -1883,7 +1876,6 @@ xfs_inactive( } ASSERT(!ip->i_afp); - ASSERT(ip->i_d.di_anextents == 0); ASSERT(ip->i_d.di_forkoff == 0); /* @@ -2119,7 +2111,7 @@ xfs_iunlink_update_bucket( unsigned int bucket_index, xfs_agino_t new_agino) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + struct xfs_agi *agi = agibp->b_addr; xfs_agino_t old_value; int offset; @@ -2135,7 +2127,7 @@ xfs_iunlink_update_bucket( * head of the list. */ if (old_value == new_agino) { - xfs_buf_corruption_error(agibp); + xfs_buf_mark_corrupt(agibp); return -EFSCORRUPTED; } @@ -2173,7 +2165,6 @@ xfs_iunlink_update_dinode( xfs_dinode_calc_crc(mp, dip); xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); - xfs_inobp_check(mp, ibp); } /* Set an in-core inode's unlinked pointer and return the old value. */ @@ -2193,7 +2184,7 @@ xfs_iunlink_update_inode( ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0); if (error) return error; @@ -2259,7 +2250,7 @@ xfs_iunlink( error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; /* * Get the index into the agi hash table for the list this inode will @@ -2269,12 +2260,11 @@ xfs_iunlink( next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); if (next_agino == agino || !xfs_verify_agino_or_null(mp, agno, next_agino)) { - xfs_buf_corruption_error(agibp); + xfs_buf_mark_corrupt(agibp); return -EFSCORRUPTED; } if (next_agino != NULLAGINO) { - struct xfs_perag *pag; xfs_agino_t old_agino; /* @@ -2291,9 +2281,7 @@ xfs_iunlink( * agino has been unlinked, add a backref from the next inode * back to agino. */ - pag = xfs_perag_get(mp, agno); - error = xfs_iunlink_add_backref(pag, agino, next_agino); - xfs_perag_put(pag); + error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino); if (error) return error; } @@ -2323,7 +2311,7 @@ xfs_iunlink_map_ino( return error; } - error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0); + error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0); if (error) { xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", __func__, error); @@ -2429,7 +2417,6 @@ xfs_iunlink_remove( struct xfs_buf *agibp; struct xfs_buf *last_ibp; struct xfs_dinode *last_dip = NULL; - struct xfs_perag *pag = NULL; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); xfs_agino_t next_agino; @@ -2443,7 +2430,7 @@ xfs_iunlink_remove( error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; /* * Get the index into the agi hash table for the list this inode will @@ -2473,32 +2460,22 @@ xfs_iunlink_remove( * this inode's backref to point from the next inode. */ if (next_agino != NULLAGINO) { - pag = xfs_perag_get(mp, agno); - error = xfs_iunlink_change_backref(pag, next_agino, + error = xfs_iunlink_change_backref(agibp->b_pag, next_agino, NULLAGINO); if (error) - goto out; + return error; } - if (head_agino == agino) { - /* Point the head of the list to the next unlinked inode. */ - error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, - next_agino); - if (error) - goto out; - } else { + if (head_agino != agino) { struct xfs_imap imap; xfs_agino_t prev_agino; - if (!pag) - pag = xfs_perag_get(mp, agno); - /* We need to search the list for the inode being freed. */ error = xfs_iunlink_map_prev(tp, agno, head_agino, agino, &prev_agino, &imap, &last_dip, &last_ibp, - pag); + agibp->b_pag); if (error) - goto out; + return error; /* Point the previous inode on the list to the next inode. */ xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp, @@ -2512,15 +2489,106 @@ xfs_iunlink_remove( * change_backref takes care of deleting the backref if * next_agino is NULLAGINO. */ - error = xfs_iunlink_change_backref(pag, agino, next_agino); - if (error) - goto out; + return xfs_iunlink_change_backref(agibp->b_pag, agino, + next_agino); } -out: - if (pag) - xfs_perag_put(pag); - return error; + /* Point the head of the list to the next unlinked inode. */ + return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, + next_agino); +} + +/* + * Look up the inode number specified and if it is not already marked XFS_ISTALE + * mark it stale. We should only find clean inodes in this lookup that aren't + * already stale. + */ +static void +xfs_ifree_mark_inode_stale( + struct xfs_buf *bp, + struct xfs_inode *free_ip, + xfs_ino_t inum) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_perag *pag = bp->b_pag; + struct xfs_inode_log_item *iip; + struct xfs_inode *ip; + +retry: + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); + + /* Inode not in memory, nothing to do */ + if (!ip) { + rcu_read_unlock(); + return; + } + + /* + * because this is an RCU protected lookup, we could find a recently + * freed or even reallocated inode during the lookup. We need to check + * under the i_flags_lock for a valid inode here. Skip it if it is not + * valid, the wrong inode or stale. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) { + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + return; + } + + /* + * Don't try to lock/unlock the current inode, but we _cannot_ skip the + * other inodes that we did not find in the list attached to the buffer + * and are not already marked stale. If we can't lock it, back off and + * retry. + */ + if (ip != free_ip) { + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + delay(1); + goto retry; + } + } + ip->i_flags |= XFS_ISTALE; + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + /* + * If we can't get the flush lock, the inode is already attached. All + * we needed to do here is mark the inode stale so buffer IO completion + * will remove it from the AIL. + */ + iip = ip->i_itemp; + if (!xfs_iflock_nowait(ip)) { + ASSERT(!list_empty(&iip->ili_item.li_bio_list)); + ASSERT(iip->ili_last_fields); + goto out_iunlock; + } + + /* + * Inodes not attached to the buffer can be released immediately. + * Everything else has to go through xfs_iflush_abort() on journal + * commit as the flock synchronises removal of the inode from the + * cluster buffer against inode reclaim. + */ + if (!iip || list_empty(&iip->ili_item.li_bio_list)) { + xfs_ifunlock(ip); + goto out_iunlock; + } + + /* we have a dirty inode in memory that has not yet been flushed. */ + spin_lock(&iip->ili_lock); + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; + iip->ili_fsync_fields = 0; + spin_unlock(&iip->ili_lock); + ASSERT(iip->ili_last_fields); + +out_iunlock: + if (ip != free_ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); } /* @@ -2530,26 +2598,20 @@ out: */ STATIC int xfs_ifree_cluster( - xfs_inode_t *free_ip, - xfs_trans_t *tp, + struct xfs_inode *free_ip, + struct xfs_trans *tp, struct xfs_icluster *xic) { - xfs_mount_t *mp = free_ip->i_mount; + struct xfs_mount *mp = free_ip->i_mount; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + struct xfs_buf *bp; + xfs_daddr_t blkno; + xfs_ino_t inum = xic->first_ino; int nbufs; int i, j; int ioffset; - xfs_daddr_t blkno; - xfs_buf_t *bp; - xfs_inode_t *ip; - xfs_inode_log_item_t *iip; - struct xfs_log_item *lip; - struct xfs_perag *pag; - struct xfs_ino_geometry *igeo = M_IGEO(mp); - xfs_ino_t inum; int error; - inum = xic->first_ino; - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster; for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) { @@ -2593,148 +2655,20 @@ xfs_ifree_cluster( bp->b_ops = &xfs_inode_buf_ops; /* - * Walk the inodes already attached to the buffer and mark them - * stale. These will all have the flush locks held, so an - * in-memory inode walk can't lock them. By marking them all - * stale first, we will not attempt to lock them in the loop - * below as the XFS_ISTALE flag will be set. + * Now we need to set all the cached clean inodes as XFS_ISTALE, + * too. This requires lookups, and will skip inodes that we've + * already marked XFS_ISTALE. */ - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - if (lip->li_type == XFS_LI_INODE) { - iip = (xfs_inode_log_item_t *)lip; - ASSERT(iip->ili_logged == 1); - lip->li_cb = xfs_istale_done; - xfs_trans_ail_copy_lsn(mp->m_ail, - &iip->ili_flush_lsn, - &iip->ili_item.li_lsn); - xfs_iflags_set(iip->ili_inode, XFS_ISTALE); - } - } - - - /* - * For each inode in memory attempt to add it to the inode - * buffer and set it up for being staled on buffer IO - * completion. This is safe as we've locked out tail pushing - * and flushing by locking the buffer. - * - * We have already marked every inode that was part of a - * transaction stale above, which means there is no point in - * even trying to lock them. - */ - for (i = 0; i < igeo->inodes_per_cluster; i++) { -retry: - rcu_read_lock(); - ip = radix_tree_lookup(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, (inum + i))); - - /* Inode not in memory, nothing to do */ - if (!ip) { - rcu_read_unlock(); - continue; - } - - /* - * because this is an RCU protected lookup, we could - * find a recently freed or even reallocated inode - * during the lookup. We need to check under the - * i_flags_lock for a valid inode here. Skip it if it - * is not valid, the wrong inode or stale. - */ - spin_lock(&ip->i_flags_lock); - if (ip->i_ino != inum + i || - __xfs_iflags_test(ip, XFS_ISTALE)) { - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - continue; - } - spin_unlock(&ip->i_flags_lock); - - /* - * Don't try to lock/unlock the current inode, but we - * _cannot_ skip the other inodes that we did not find - * in the list attached to the buffer and are not - * already marked stale. If we can't lock it, back off - * and retry. - */ - if (ip != free_ip) { - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - rcu_read_unlock(); - delay(1); - goto retry; - } - - /* - * Check the inode number again in case we're - * racing with freeing in xfs_reclaim_inode(). - * See the comments in that function for more - * information as to why the initial check is - * not sufficient. - */ - if (ip->i_ino != inum + i) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - rcu_read_unlock(); - continue; - } - } - rcu_read_unlock(); - - xfs_iflock(ip); - xfs_iflags_set(ip, XFS_ISTALE); - - /* - * we don't need to attach clean inodes or those only - * with unlogged changes (which we throw away, anyway). - */ - iip = ip->i_itemp; - if (!iip || xfs_inode_clean(ip)) { - ASSERT(ip != free_ip); - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } - - iip->ili_last_fields = iip->ili_fields; - iip->ili_fields = 0; - iip->ili_fsync_fields = 0; - iip->ili_logged = 1; - xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, - &iip->ili_item.li_lsn); - - xfs_buf_attach_iodone(bp, xfs_istale_done, - &iip->ili_item); - - if (ip != free_ip) - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + for (i = 0; i < igeo->inodes_per_cluster; i++) + xfs_ifree_mark_inode_stale(bp, free_ip, inum + i); xfs_trans_stale_inode_buf(tp, bp); xfs_trans_binval(tp, bp); } - - xfs_perag_put(pag); return 0; } /* - * Free any local-format buffers sitting around before we reset to - * extents format. - */ -static inline void -xfs_ifree_local_data( - struct xfs_inode *ip, - int whichfork) -{ - struct xfs_ifork *ifp; - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) - return; - - ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); -} - -/* * This is called to return an inode to the inode free list. * The inode should already be truncated to 0 length and have * no pages associated with it. This routine also assumes that @@ -2751,11 +2685,11 @@ xfs_ifree( { int error; struct xfs_icluster xic = { 0 }; + struct xfs_inode_log_item *iip = ip->i_itemp; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(VFS_I(ip)->i_nlink == 0); - ASSERT(ip->i_d.di_nextents == 0); - ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_df.if_nextents == 0); ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); ASSERT(ip->i_d.di_nblocks == 0); @@ -2770,19 +2704,28 @@ xfs_ifree( if (error) return error; - xfs_ifree_local_data(ip, XFS_DATA_FORK); - xfs_ifree_local_data(ip, XFS_ATTR_FORK); + /* + * Free any local-format data sitting around before we reset the + * data fork to extents format. Note that the attr fork data has + * already been freed by xfs_attr_inactive. + */ + if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { + kmem_free(ip->i_df.if_u1.if_data); + ip->i_df.if_u1.if_data = NULL; + ip->i_df.if_bytes = 0; + } VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; ip->i_d.di_flags2 = 0; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ - ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; /* Don't attempt to replay owner changes for a deleted inode */ - ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER); + spin_lock(&iip->ili_lock); + iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER); + spin_unlock(&iip->ili_lock); /* * Bump the generation count so no one will be confused @@ -3162,7 +3105,7 @@ out_trans_abort: /* * xfs_rename_alloc_whiteout() * - * Return a referenced, unlinked, unlocked inode that that can be used as a + * Return a referenced, unlinked, unlocked inode that can be used as a * whiteout in a rename transaction. We use a tmpfile inode here so that if we * crash between allocating the inode and linking it into the rename transaction * recovery will free the inode and we won't leak it. @@ -3489,374 +3432,76 @@ out_release_wip: return error; } -STATIC int -xfs_iflush_cluster( - struct xfs_inode *ip, - struct xfs_buf *bp) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - unsigned long first_index, mask; - int cilist_size; - struct xfs_inode **cilist; - struct xfs_inode *cip; - struct xfs_ino_geometry *igeo = M_IGEO(mp); - int nr_found; - int clcount = 0; - int i; - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - - cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *); - cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS); - if (!cilist) - goto out_put; - - mask = ~(igeo->inodes_per_cluster - 1); - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; - rcu_read_lock(); - /* really need a gang lookup range call here */ - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist, - first_index, igeo->inodes_per_cluster); - if (nr_found == 0) - goto out_free; - - for (i = 0; i < nr_found; i++) { - cip = cilist[i]; - if (cip == ip) - continue; - - /* - * because this is an RCU protected lookup, we could find a - * recently freed or even reallocated inode during the lookup. - * We need to check under the i_flags_lock for a valid inode - * here. Skip it if it is not valid or the wrong inode. - */ - spin_lock(&cip->i_flags_lock); - if (!cip->i_ino || - __xfs_iflags_test(cip, XFS_ISTALE)) { - spin_unlock(&cip->i_flags_lock); - continue; - } - - /* - * Once we fall off the end of the cluster, no point checking - * any more inodes in the list because they will also all be - * outside the cluster. - */ - if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) { - spin_unlock(&cip->i_flags_lock); - break; - } - spin_unlock(&cip->i_flags_lock); - - /* - * Do an un-protected check to see if the inode is dirty and - * is a candidate for flushing. These checks will be repeated - * later after the appropriate locks are acquired. - */ - if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0) - continue; - - /* - * Try to get locks. If any are unavailable or it is pinned, - * then this inode cannot be flushed and is skipped. - */ - - if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED)) - continue; - if (!xfs_iflock_nowait(cip)) { - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - if (xfs_ipincount(cip)) { - xfs_ifunlock(cip); - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - - - /* - * Check the inode number again, just to be certain we are not - * racing with freeing in xfs_reclaim_inode(). See the comments - * in that function for more information as to why the initial - * check is not sufficient. - */ - if (!cip->i_ino) { - xfs_ifunlock(cip); - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - - /* - * arriving here means that this inode can be flushed. First - * re-check that it's dirty before flushing. - */ - if (!xfs_inode_clean(cip)) { - int error; - error = xfs_iflush_int(cip, bp); - if (error) { - xfs_iunlock(cip, XFS_ILOCK_SHARED); - goto cluster_corrupt_out; - } - clcount++; - } else { - xfs_ifunlock(cip); - } - xfs_iunlock(cip, XFS_ILOCK_SHARED); - } - - if (clcount) { - XFS_STATS_INC(mp, xs_icluster_flushcnt); - XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); - } - -out_free: - rcu_read_unlock(); - kmem_free(cilist); -out_put: - xfs_perag_put(pag); - return 0; - - -cluster_corrupt_out: - /* - * Corruption detected in the clustering loop. Invalidate the - * inode buffer and shut down the filesystem. - */ - rcu_read_unlock(); - - /* - * We'll always have an inode attached to the buffer for completion - * process by the time we are called from xfs_iflush(). Hence we have - * always need to do IO completion processing to abort the inodes - * attached to the buffer. handle them just like the shutdown case in - * xfs_buf_submit(). - */ - ASSERT(bp->b_iodone); - bp->b_flags |= XBF_ASYNC; - bp->b_flags &= ~XBF_DONE; - xfs_buf_stale(bp); - xfs_buf_ioerror(bp, -EIO); - xfs_buf_ioend(bp); - - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - - /* abort the corrupt inode, as it was not attached to the buffer */ - xfs_iflush_abort(cip, false); - kmem_free(cilist); - xfs_perag_put(pag); - return -EFSCORRUPTED; -} - -/* - * Flush dirty inode metadata into the backing buffer. - * - * The caller must have the inode lock and the inode flush lock held. The - * inode lock will still be held upon return to the caller, and the inode - * flush lock will be released after the inode has reached the disk. - * - * The caller must write out the buffer returned in *bpp and release it. - */ -int +static int xfs_iflush( struct xfs_inode *ip, - struct xfs_buf **bpp) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_buf *bp = NULL; - struct xfs_dinode *dip; - int error; - - XFS_STATS_INC(mp, xs_iflush_count); - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); - ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - - *bpp = NULL; - - xfs_iunpin_wait(ip); - - /* - * For stale inodes we cannot rely on the backing buffer remaining - * stale in cache for the remaining life of the stale inode and so - * xfs_imap_to_bp() below may give us a buffer that no longer contains - * inodes below. We have to check this after ensuring the inode is - * unpinned so that it is safe to reclaim the stale inode after the - * flush call. - */ - if (xfs_iflags_test(ip, XFS_ISTALE)) { - xfs_ifunlock(ip); - return 0; - } - - /* - * This may have been unpinned because the filesystem is shutting - * down forcibly. If that's the case we must not write this inode - * to disk, because the log record didn't make it to disk. - * - * We also have to remove the log item from the AIL in this case, - * as we wait for an empty AIL as part of the unmount process. - */ - if (XFS_FORCED_SHUTDOWN(mp)) { - error = -EIO; - goto abort_out; - } - - /* - * Get the buffer containing the on-disk inode. We are doing a try-lock - * operation here, so we may get an EAGAIN error. In that case, we - * simply want to return with the inode still dirty. - * - * If we get any other error, we effectively have a corruption situation - * and we cannot flush the inode, so we treat it the same as failing - * xfs_iflush_int(). - */ - error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, - 0); - if (error == -EAGAIN) { - xfs_ifunlock(ip); - return error; - } - if (error) - goto corrupt_out; - - /* - * First flush out the inode that xfs_iflush was called with. - */ - error = xfs_iflush_int(ip, bp); - if (error) - goto corrupt_out; - - /* - * If the buffer is pinned then push on the log now so we won't - * get stuck waiting in the write for too long. - */ - if (xfs_buf_ispinned(bp)) - xfs_log_force(mp, 0); - - /* - * inode clustering: try to gather other inodes into this write - * - * Note: Any error during clustering will result in the filesystem - * being shut down and completion callbacks run on the cluster buffer. - * As we have already flushed and attached this inode to the buffer, - * it has already been aborted and released by xfs_iflush_cluster() and - * so we have no further error handling to do here. - */ - error = xfs_iflush_cluster(ip, bp); - if (error) - return error; - - *bpp = bp; - return 0; - -corrupt_out: - if (bp) - xfs_buf_relse(bp); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); -abort_out: - /* abort the corrupt inode, as it was not attached to the buffer */ - xfs_iflush_abort(ip, false); - return error; -} - -/* - * If there are inline format data / attr forks attached to this inode, - * make sure they're not corrupt. - */ -bool -xfs_inode_verify_forks( - struct xfs_inode *ip) -{ - struct xfs_ifork *ifp; - xfs_failaddr_t fa; - - fa = xfs_ifork_verify_data(ip, &xfs_default_ifork_ops); - if (fa) { - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork", - ifp->if_u1.if_data, ifp->if_bytes, fa); - return false; - } - - fa = xfs_ifork_verify_attr(ip, &xfs_default_ifork_ops); - if (fa) { - ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK); - xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", - ifp ? ifp->if_u1.if_data : NULL, - ifp ? ifp->if_bytes : 0, fa); - return false; - } - return true; -} - -STATIC int -xfs_iflush_int( - struct xfs_inode *ip, struct xfs_buf *bp) { struct xfs_inode_log_item *iip = ip->i_itemp; struct xfs_dinode *dip; struct xfs_mount *mp = ip->i_mount; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); - ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - ASSERT(iip != NULL && iip->ili_fields != 0); - ASSERT(ip->i_d.di_version > 1); + ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || + ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); + ASSERT(iip->ili_item.li_buf == bp); - /* set *dip = inode's place in the buffer */ dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); + /* + * We don't flush the inode if any of the following checks fail, but we + * do still update the log item and attach to the backing buffer as if + * the flush happened. This is a formality to facilitate predictable + * error handling as the caller will shutdown and fail the buffer. + */ + error = -EFSCORRUPTED; if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); - goto corrupt_out; + goto flush_out; } if (S_ISREG(VFS_I(ip)->i_mode)) { if (XFS_TEST_ERROR( - (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && - (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), + ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE, mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad regular inode %Lu, ptr "PTR_FMT, __func__, ip->i_ino, ip); - goto corrupt_out; + goto flush_out; } } else if (S_ISDIR(VFS_I(ip)->i_mode)) { if (XFS_TEST_ERROR( - (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && - (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && - (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), + ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE && + ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad directory inode %Lu, ptr "PTR_FMT, __func__, ip->i_ino, ip); - goto corrupt_out; + goto flush_out; } } - if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > + if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) > ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: detected corrupt incore inode %Lu, " "total extents = %d, nblocks = %Ld, ptr "PTR_FMT, __func__, ip->i_ino, - ip->i_d.di_nextents + ip->i_d.di_anextents, + ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp), ip->i_d.di_nblocks, ip); - goto corrupt_out; + goto flush_out; } if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_d.di_forkoff, ip); - goto corrupt_out; + goto flush_out; } /* @@ -3868,12 +3513,19 @@ xfs_iflush_int( * backwards compatibility with old kernels that predate logging all * inode changes. */ - if (ip->i_d.di_version < 3) + if (!xfs_sb_version_has_v3inode(&mp->m_sb)) ip->i_d.di_flushiter++; - /* Check the inline fork data before we write out. */ - if (!xfs_inode_verify_forks(ip)) - goto corrupt_out; + /* + * If there are inline format data / attr forks attached to this inode, + * make sure they are not corrupt. + */ + if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL && + xfs_ifork_verify_local_data(ip)) + goto flush_out; + if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL && + xfs_ifork_verify_local_attr(ip)) + goto flush_out; /* * Copy the dirty parts of the inode into the on-disk inode. We always @@ -3889,7 +3541,6 @@ xfs_iflush_int( xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); if (XFS_IFORK_Q(ip)) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); - xfs_inobp_check(mp, bp); /* * We've recorded everything logged in the inode, so we'd like to clear @@ -3906,41 +3557,148 @@ xfs_iflush_int( * know that the information those bits represent is permanently on * disk. As long as the flush completes before the inode is logged * again, then both ili_fields and ili_last_fields will be cleared. - * - * We can play with the ili_fields bits here, because the inode lock - * must be held exclusively in order to set bits there and the flush - * lock protects the ili_last_fields bits. Set ili_logged so the flush - * done routine can tell whether or not to look in the AIL. Also, store - * the current LSN of the inode so that we can tell whether the item has - * moved in the AIL from xfs_iflush_done(). In order to read the lsn we - * need the AIL lock, because it is a 64 bit value that cannot be read - * atomically. */ + error = 0; +flush_out: + spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; - iip->ili_logged = 1; + spin_unlock(&iip->ili_lock); + /* + * Store the current LSN of the inode so that we can tell whether the + * item has moved in the AIL from xfs_iflush_done(). + */ xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); + /* generate the checksum. */ + xfs_dinode_calc_crc(mp, dip); + return error; +} + +/* + * Non-blocking flush of dirty inode metadata into the backing buffer. + * + * The caller must have a reference to the inode and hold the cluster buffer + * locked. The function will walk across all the inodes on the cluster buffer it + * can find and lock without blocking, and flush them to the cluster buffer. + * + * On successful flushing of at least one inode, the caller must write out the + * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and + * the caller needs to release the buffer. On failure, the filesystem will be + * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED + * will be returned. + */ +int +xfs_iflush_cluster( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_log_item *lip, *n; + struct xfs_inode *ip; + struct xfs_inode_log_item *iip; + int clcount = 0; + int error = 0; + /* - * Attach the function xfs_iflush_done to the inode's - * buffer. This will remove the inode from the AIL - * and unlock the inode's flush lock when the inode is - * completely written to disk. + * We must use the safe variant here as on shutdown xfs_iflush_abort() + * can remove itself from the list. */ - xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); + list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { + iip = (struct xfs_inode_log_item *)lip; + ip = iip->ili_inode; - /* generate the checksum. */ - xfs_dinode_calc_crc(mp, dip); + /* + * Quick and dirty check to avoid locks if possible. + */ + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) + continue; + if (xfs_ipincount(ip)) + continue; + + /* + * The inode is still attached to the buffer, which means it is + * dirty but reclaim might try to grab it. Check carefully for + * that, and grab the ilock while still holding the i_flags_lock + * to guarantee reclaim will not be able to reclaim this inode + * once we drop the i_flags_lock. + */ + spin_lock(&ip->i_flags_lock); + ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) { + spin_unlock(&ip->i_flags_lock); + continue; + } + + /* + * ILOCK will pin the inode against reclaim and prevent + * concurrent transactions modifying the inode while we are + * flushing the inode. + */ + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + spin_unlock(&ip->i_flags_lock); + continue; + } + spin_unlock(&ip->i_flags_lock); - ASSERT(!list_empty(&bp->b_li_list)); - ASSERT(bp->b_iodone != NULL); + /* + * Skip inodes that are already flush locked as they have + * already been written to the buffer. + */ + if (!xfs_iflock_nowait(ip)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + continue; + } + + /* + * Abort flushing this inode if we are shut down because the + * inode may not currently be in the AIL. This can occur when + * log I/O failure unpins the inode without inserting into the + * AIL, leaving a dirty/unpinned inode attached to the buffer + * that otherwise looks like it should be flushed. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + xfs_iunpin_wait(ip); + /* xfs_iflush_abort() drops the flush lock */ + xfs_iflush_abort(ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + error = -EIO; + continue; + } + + /* don't block waiting on a log force to unpin dirty inodes */ + if (xfs_ipincount(ip)) { + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + continue; + } + + if (!xfs_inode_clean(ip)) + error = xfs_iflush(ip, bp); + else + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (error) + break; + clcount++; + } + + if (error) { + bp->b_flags |= XBF_ASYNC; + xfs_buf_ioend_fail(bp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + + if (!clcount) + return -EAGAIN; + + XFS_STATS_INC(mp, xs_icluster_flushcnt); + XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); return 0; -corrupt_out: - return -EFSCORRUPTED; } /* Release an inode. */ @@ -3951,3 +3709,115 @@ xfs_irele( trace_xfs_irele(ip, _RET_IP_); iput(VFS_I(ip)); } + +/* + * Ensure all commited transactions touching the inode are written to the log. + */ +int +xfs_log_force_inode( + struct xfs_inode *ip) +{ + xfs_lsn_t lsn = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!lsn) + return 0; + return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL); +} + +/* + * Grab the exclusive iolock for a data copy from src to dest, making sure to + * abide vfs locking order (lowest pointer value goes first) and breaking the + * layout leases before proceeding. The loop is needed because we cannot call + * the blocking break_layout() with the iolocks held, and therefore have to + * back out both locks. + */ +static int +xfs_iolock_two_inodes_and_break_layout( + struct inode *src, + struct inode *dest) +{ + int error; + + if (src > dest) + swap(src, dest); + +retry: + /* Wait to break both inodes' layouts before we start locking. */ + error = break_layout(src, true); + if (error) + return error; + if (src != dest) { + error = break_layout(dest, true); + if (error) + return error; + } + + /* Lock one inode and make sure nobody got in and leased it. */ + inode_lock(src); + error = break_layout(src, false); + if (error) { + inode_unlock(src); + if (error == -EWOULDBLOCK) + goto retry; + return error; + } + + if (src == dest) + return 0; + + /* Lock the other inode and make sure nobody got in and leased it. */ + inode_lock_nested(dest, I_MUTEX_NONDIR2); + error = break_layout(dest, false); + if (error) { + inode_unlock(src); + inode_unlock(dest); + if (error == -EWOULDBLOCK) + goto retry; + return error; + } + + return 0; +} + +/* + * Lock two inodes so that userspace cannot initiate I/O via file syscalls or + * mmap activity. + */ +int +xfs_ilock2_io_mmap( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + int ret; + + ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); + if (ret) + return ret; + if (ip1 == ip2) + xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); + else + xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL, + ip2, XFS_MMAPLOCK_EXCL); + return 0; +} + +/* Unlock both inodes to allow IO and mmap activity. */ +void +xfs_iunlock2_io_mmap( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + bool same_inode = (ip1 == ip2); + + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); + if (!same_inode) + xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + inode_unlock(VFS_I(ip2)); + if (!same_inode) + inode_unlock(VFS_I(ip1)); +} |