author    | Linus Torvalds <torvalds@linux-foundation.org> | 2016-03-21 19:53:05 +0100
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-03-21 19:53:05 +0100
commit    | 53d2e6976bd4042672ed7b90dfbf4b31635b7dcf (patch)
tree      | 1d137e044c52433df6b4785fed1c7aafbb9333bc /fs
parent    | Merge tag 'for-f2fs-4.6' of git://git.kernel.org/pub/scm/linux/kernel/git/jae... (diff)
parent    | Merge branch 'xfs-misc-fixes-4.6-4' into for-next (diff)
download  | linux-53d2e6976bd4042672ed7b90dfbf4b31635b7dcf.tar.xz linux-53d2e6976bd4042672ed7b90dfbf4b31635b7dcf.zip
Merge tag 'xfs-for-linus-4.6-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs
Pull xfs updates from Dave Chinner:
"There's quite a lot in this request, and there's some cross-over with
ext4, dax and quota code due to the nature of the changes being made.
As for the rest of the XFS changes, there are lots of little things
all over the place, which add up to a lot of changes in the end.
The major changes are that we've reduced the size of the struct
xfs_inode by ~100 bytes (gives an inode cache footprint reduction of
>10%), the writepage code now only does a single set of mapping tree
lookups so uses less CPU, delayed allocation reservations won't
overrun under random write loads anymore, and we added compile time
verification for on-disk structure sizes so we find out when a commit
or platform/compiler change breaks the on-disk structure as early as
possible.
Change summary:
- error propagation fixes for direct IO failures, in both XFS and
  ext4
- new quota interfaces and XFS implementation for iterating all the
  quota IDs in the filesystem (a userspace sketch follows the commit
  list below)
- locking fixes for real-time device extent allocation
- reduction of duplicate information in the xfs and vfs inodes,
  saving roughly 100 bytes of memory per cached inode
- buffer flag cleanup
- rework of the writepage code to use the generic write clustering
mechanisms
- several fixes for inode flag based DAX enablement
- rework of remount option parsing
- compile time verification of on-disk format structure sizes (a
  short sketch of the idea follows this change summary)
- delayed allocation reservation overrun fixes
- lots of little error handling fixes
- small memory leak fixes
- enable xfsaild freezing again"
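The compile-time verification called out above boils down to static assertions on the size of every on-disk structure. A minimal sketch of the idea, assuming the kernel's BUILD_BUG_ON_MSG() from <linux/bug.h>; the macro and the two example checks shown here are illustrative rather than the exact set this series adds:

	/*
	 * Sketch: fail the build when an on-disk structure changes size.
	 * BUILD_BUG_ON_MSG() evaluates its condition at compile time, so a
	 * commit or compiler/platform change that alters a structure's
	 * layout breaks the build instead of corrupting filesystems.
	 */
	#include <linux/bug.h>

	#define XFS_CHECK_STRUCT_SIZE(structname, size) \
		BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \
			#structname ") is wrong, expected " #size)

	static inline void
	xfs_check_ondisk_structs(void)
	{
		/* example checks; the real list covers every on-disk structure */
		XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8);
		XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 264);
	}

Calling the helper once during mount is enough; the checks compile away entirely.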
* tag 'xfs-for-linus-4.6-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (66 commits)
xfs: always set rvalp in xfs_dir2_node_trim_free
xfs: ensure committed is initialized in xfs_trans_roll
xfs: borrow indirect blocks from freed extent when available
xfs: refactor delalloc indlen reservation split into helper
xfs: update freeblocks counter after extent deletion
xfs: debug mode forced buffered write failure
xfs: remove impossible condition
xfs: check sizes of XFS on-disk structures at compile time
xfs: ioends require logically contiguous file offsets
xfs: use named array initializers for log item dumping
xfs: fix computation of inode btree maxlevels
xfs: reinitialise per-AG structures if geometry changes during recovery
xfs: remove xfs_trans_get_block_res
xfs: fix up inode32/64 (re)mount handling
xfs: fix format specifier , should be %llx and not %llu
xfs: sanitize remount options
xfs: convert mount option parsing to tokens
xfs: fix two memory leaks in xfs_attr_list.c error paths
xfs: XFS_DIFLAG2_DAX limited by PAGE_SIZE
xfs: dynamically switch modes when XFS_DIFLAG2_DAX is set/cleared
...
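On how the new quota iteration interfaces above are meant to be consumed: userspace walks all active quotas with a quotactl() loop over Q_GETNEXTQUOTA. A hedged sketch, assuming a libc whose <sys/quota.h> exposes the Q_GETNEXTQUOTA command and struct if_nextdqblk from the kernel's <linux/quota.h>:

	/*
	 * Sketch: enumerate every active user quota on a filesystem via the
	 * Q_GETNEXTQUOTA command added by this series. The kernel returns
	 * the next active id >= the one passed in; ESRCH ends the walk.
	 */
	#include <stdio.h>
	#include <sys/quota.h>

	static void walk_user_quotas(const char *dev)
	{
		struct if_nextdqblk dq;
		unsigned int id = 0;

		while (quotactl(QCMD(Q_GETNEXTQUOTA, USRQUOTA), dev, id,
				(void *)&dq) == 0) {
			printf("uid %u: %llu bytes used\n", dq.dqb_id,
			       (unsigned long long)dq.dqb_curspace);
			id = dq.dqb_id + 1;	/* resume after the returned id */
		}
	}

On kernels without this series the call fails with EINVAL, so callers need the traditional per-id Q_GETQUOTA probing as a fallback.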
Diffstat (limited to 'fs')
66 files changed, 2178 insertions, 1590 deletions
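The first hunks below (dax_do_io() and dio_complete()) carry the direct IO error-propagation fix: the ->end_io completion callback now returns an int, and a non-zero return overrides the byte count reported to the caller. Distilled into a sketch (kernel types from <linux/fs.h>; the function and typedef names here are illustrative, not the kernel's):

	/*
	 * Distilled pattern from the hunks below: the completion callback
	 * returns int, and a completion error replaces the byte count that
	 * the direct IO path would otherwise return.
	 */
	#include <linux/fs.h>	/* struct kiocb, loff_t, ssize_t */

	typedef int (*dio_end_io_fn)(struct kiocb *iocb, loff_t pos,
				     ssize_t size, void *private);

	static ssize_t dio_complete_sketch(struct kiocb *iocb, loff_t pos,
					   ssize_t retval, void *private,
					   dio_end_io_fn end_io)
	{
		if (end_io) {
			int err = end_io(iocb, pos, retval, private);

			if (err)	/* completion error wins over byte count */
				retval = err;
		}
		return retval;
	}

This is why ext4_end_io_dio() and ocfs2_dio_end_io() below change their return type from void to int.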
@@ -286,8 +286,13 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) inode_unlock(inode); - if ((retval > 0) && end_io) - end_io(iocb, pos, retval, bh.b_private); + if (end_io) { + int err; + + err = end_io(iocb, pos, retval, bh.b_private); + if (err) + retval = err; + } if (!(flags & DIO_SKIP_DIO_COUNT)) inode_dio_end(inode); diff --git a/fs/direct-io.c b/fs/direct-io.c index 0a8d937c6775..476f1ecbd1f0 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -253,8 +253,13 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, if (ret == 0) ret = transferred; - if (dio->end_io && dio->result) - dio->end_io(dio->iocb, offset, transferred, dio->private); + if (dio->end_io) { + int err; + + err = dio->end_io(dio->iocb, offset, ret, dio->private); + if (err) + ret = err; + } if (!(dio->flags & DIO_SKIP_DIO_COUNT)) inode_dio_end(dio->inode); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 393689dfa1af..c04743519865 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1511,15 +1511,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } -static inline void ext4_set_io_unwritten_flag(struct inode *inode, - struct ext4_io_end *io_end) -{ - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_unwritten); - } -} - /* * Inode dynamic state flags */ @@ -3293,6 +3284,27 @@ extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; extern int ext4_resize_begin(struct super_block *sb); extern void ext4_resize_end(struct super_block *sb); +static inline void ext4_set_io_unwritten_flag(struct inode *inode, + struct ext4_io_end *io_end) +{ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_unwritten); + } +} + +static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) +{ + struct inode *inode = io_end->inode; + + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + io_end->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + wake_up_all(ext4_ioend_wq(inode)); + } +} + #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b2e9576450eb..dab84a2530ff 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3289,22 +3289,32 @@ out: } #endif -static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, +static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private) { ext4_io_end_t *io_end = private; /* if not async direct IO just return */ if (!io_end) - return; + return 0; ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", io_end, io_end->inode->i_ino, iocb, offset, size); + /* + * Error during AIO DIO. We cannot convert unwritten extents as the + * data was not written. Just clear the unwritten flag and drop io_end. 
+ */ + if (size <= 0) { + ext4_clear_io_unwritten_flag(io_end); + size = 0; + } io_end->offset = offset; io_end->size = size; ext4_put_io_end(io_end); + + return 0; } /* diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 349d7aa04fe7..d77d15f4b674 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -136,16 +136,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) kmem_cache_free(io_end_cachep, io_end); } -static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) -{ - struct inode *inode = io_end->inode; - - io_end->flag &= ~EXT4_IO_END_UNWRITTEN; - /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) - wake_up_all(ext4_ioend_wq(inode)); -} - /* * Check a range of space and convert unwritten extents to written. Note that * we are protected from truncate touching same part of extent tree by the diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index cda0361e95a4..043110e5212d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -620,7 +620,7 @@ bail: * particularly interested in the aio/dio case. We use the rw_lock DLM lock * to protect io on one node from truncation on another. */ -static void ocfs2_dio_end_io(struct kiocb *iocb, +static int ocfs2_dio_end_io(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private) @@ -628,6 +628,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, struct inode *inode = file_inode(iocb->ki_filp); int level; + if (bytes <= 0) + return 0; + /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); @@ -644,6 +647,8 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, level = ocfs2_iocb_rw_locked_level(iocb); ocfs2_rw_unlock(inode, level); } + + return 0; } static int ocfs2_releasepage(struct page *page, gfp_t wait) diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 3746367098fd..0ebc90496525 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -79,7 +79,7 @@ unsigned int qtype_enforce_flag(int type) return 0; } -static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, +static int quota_quotaon(struct super_block *sb, int type, qid_t id, struct path *path) { if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable) @@ -222,6 +222,34 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id, return 0; } +/* + * Return quota for next active quota >= this id, if any exists, + * otherwise return -ESRCH via ->get_nextdqblk + */ +static int quota_getnextquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct kqid qid; + struct qc_dqblk fdq; + struct if_nextdqblk idq; + int ret; + + if (!sb->s_qcop->get_nextdqblk) + return -ENOSYS; + qid = make_kqid(current_user_ns(), type, id); + if (!qid_valid(qid)) + return -EINVAL; + ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq); + if (ret) + return ret; + /* struct if_nextdqblk is a superset of struct if_dqblk */ + copy_to_if_dqblk((struct if_dqblk *)&idq, &fdq); + idq.dqb_id = from_kqid(current_user_ns(), qid); + if (copy_to_user(addr, &idq, sizeof(idq))) + return -EFAULT; + return 0; +} + static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src) { dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit); @@ -625,6 +653,34 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, return ret; } +/* + * Return quota for next active quota >= this id, if any exists, + * otherwise return -ESRCH via ->get_nextdqblk. 
+ */ +static int quota_getnextxquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct fs_disk_quota fdq; + struct qc_dqblk qdq; + struct kqid qid; + qid_t id_out; + int ret; + + if (!sb->s_qcop->get_nextdqblk) + return -ENOSYS; + qid = make_kqid(current_user_ns(), type, id); + if (!qid_valid(qid)) + return -EINVAL; + ret = sb->s_qcop->get_nextdqblk(sb, &qid, &qdq); + if (ret) + return ret; + id_out = from_kqid(current_user_ns(), qid); + copy_to_xfs_dqblk(&fdq, &qdq, type, id_out); + if (copy_to_user(addr, &fdq, sizeof(fdq))) + return -EFAULT; + return ret; +} + static int quota_rmxquota(struct super_block *sb, void __user *addr) { __u32 flags; @@ -659,7 +715,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, switch (cmd) { case Q_QUOTAON: - return quota_quotaon(sb, type, cmd, id, path); + return quota_quotaon(sb, type, id, path); case Q_QUOTAOFF: return quota_quotaoff(sb, type); case Q_GETFMT: @@ -670,6 +726,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, return quota_setinfo(sb, type, addr); case Q_GETQUOTA: return quota_getquota(sb, type, id, addr); + case Q_GETNEXTQUOTA: + return quota_getnextquota(sb, type, id, addr); case Q_SETQUOTA: return quota_setquota(sb, type, id, addr); case Q_SYNC: @@ -690,6 +748,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, return quota_setxquota(sb, type, id, addr); case Q_XGETQUOTA: return quota_getxquota(sb, type, id, addr); + case Q_XGETNEXTQUOTA: + return quota_getnextxquota(sb, type, id, addr); case Q_XQUOTASYNC: if (sb->s_flags & MS_RDONLY) return -EROFS; @@ -708,10 +768,12 @@ static int quotactl_cmd_write(int cmd) switch (cmd) { case Q_GETFMT: case Q_GETINFO: + case Q_GETNEXTQUOTA: case Q_SYNC: case Q_XGETQSTAT: case Q_XGETQSTATV: case Q_XGETQUOTA: + case Q_XGETNEXTQUOTA: case Q_XQUOTASYNC: return 0; } diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 444626ddbd1b..d9b42425291e 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -118,8 +118,6 @@ xfs_allocbt_free_block( xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); - - xfs_trans_binval(cur->bc_tp, bp); return 0; } diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index 919756e3ba53..90928bbe693c 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -24,22 +24,6 @@ * Small attribute lists are packed as tightly as possible so as * to fit into the literal area of the inode. */ - -/* - * Entries are packed toward the top as tight as possible. 
- */ -typedef struct xfs_attr_shortform { - struct xfs_attr_sf_hdr { /* constant-structure header block */ - __be16 totsize; /* total bytes in shortform list */ - __u8 count; /* count of active entries */ - } hdr; - struct xfs_attr_sf_entry { - __uint8_t namelen; /* actual length of name (no NULL) */ - __uint8_t valuelen; /* actual length of value (no NULL) */ - __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ - __uint8_t nameval[1]; /* name & value bytes concatenated */ - } list[1]; /* variable sized array */ -} xfs_attr_shortform_t; typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t; typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index ef00156f4f96..041b6948aecc 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -477,10 +477,7 @@ xfs_bmap_check_leaf_extents( } block = XFS_BUF_TO_BLOCK(bp); } - if (bp_release) { - bp_release = 0; - xfs_trans_brelse(NULL, bp); - } + return; error0: @@ -912,7 +909,7 @@ xfs_bmap_local_to_extents( * We don't want to deal with the case of keeping inode data inline yet. * So sending the data fork of a regular inode is invalid. */ - ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); + ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK)); ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); @@ -1079,7 +1076,7 @@ xfs_bmap_add_attrfork_local( if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) return 0; - if (S_ISDIR(ip->i_d.di_mode)) { + if (S_ISDIR(VFS_I(ip)->i_mode)) { memset(&dargs, 0, sizeof(dargs)); dargs.geo = ip->i_mount->m_dir_geo; dargs.dp = ip; @@ -1091,7 +1088,7 @@ xfs_bmap_add_attrfork_local( return xfs_dir2_sf_to_block(&dargs); } - if (S_ISLNK(ip->i_d.di_mode)) + if (S_ISLNK(VFS_I(ip)->i_mode)) return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, XFS_DATA_FORK, xfs_symlink_local_to_remote); @@ -4721,6 +4718,66 @@ error0: } /* + * When a delalloc extent is split (e.g., due to a hole punch), the original + * indlen reservation must be shared across the two new extents that are left + * behind. + * + * Given the original reservation and the worst case indlen for the two new + * extents (as calculated by xfs_bmap_worst_indlen()), split the original + * reservation fairly across the two new extents. If necessary, steal available + * blocks from a deleted extent to make up a reservation deficiency (e.g., if + * ores == 1). The number of stolen blocks is returned. The availability and + * subsequent accounting of stolen blocks is the responsibility of the caller. + */ +static xfs_filblks_t +xfs_bmap_split_indlen( + xfs_filblks_t ores, /* original res. */ + xfs_filblks_t *indlen1, /* ext1 worst indlen */ + xfs_filblks_t *indlen2, /* ext2 worst indlen */ + xfs_filblks_t avail) /* stealable blocks */ +{ + xfs_filblks_t len1 = *indlen1; + xfs_filblks_t len2 = *indlen2; + xfs_filblks_t nres = len1 + len2; /* new total res. */ + xfs_filblks_t stolen = 0; + + /* + * Steal as many blocks as we can to try and satisfy the worst case + * indlen for both new extents. + */ + while (nres > ores && avail) { + nres--; + avail--; + stolen++; + } + + /* + * The only blocks available are those reserved for the original + * extent and what we can steal from the extent being removed. + * If this still isn't enough to satisfy the combined + * requirements for the two new extents, skim blocks off of each + * of the new reservations until they match what is available. 
+ */ + while (nres > ores) { + if (len1) { + len1--; + nres--; + } + if (nres == ores) + break; + if (len2) { + len2--; + nres--; + } + } + + *indlen1 = len1; + *indlen2 = len2; + + return stolen; +} + +/* * Called by xfs_bmapi to update file extent records and the btree * after removing space (or undoing a delayed allocation). */ @@ -4984,28 +5041,29 @@ xfs_bmap_del_extent( XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); } else { + xfs_filblks_t stolen; ASSERT(whichfork == XFS_DATA_FORK); - temp = xfs_bmap_worst_indlen(ip, temp); + + /* + * Distribute the original indlen reservation across the + * two new extents. Steal blocks from the deleted extent + * if necessary. Stealing blocks simply fudges the + * fdblocks accounting in xfs_bunmapi(). + */ + temp = xfs_bmap_worst_indlen(ip, got.br_blockcount); + temp2 = xfs_bmap_worst_indlen(ip, new.br_blockcount); + stolen = xfs_bmap_split_indlen(da_old, &temp, &temp2, + del->br_blockcount); + da_new = temp + temp2 - stolen; + del->br_blockcount -= stolen; + + /* + * Set the reservation for each extent. Warn if either + * is zero as this can lead to delalloc problems. + */ + WARN_ON_ONCE(!temp || !temp2); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - temp2 = xfs_bmap_worst_indlen(ip, temp2); new.br_startblock = nullstartblock((int)temp2); - da_new = temp + temp2; - while (da_new > da_old) { - if (temp) { - temp--; - da_new--; - xfs_bmbt_set_startblock(ep, - nullstartblock((int)temp)); - } - if (da_new == da_old) - break; - if (temp2) { - temp2--; - da_new--; - new.br_startblock = - nullstartblock((int)temp2); - } - } } trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_insert(ip, *idx + 1, 1, &new, state); @@ -5210,7 +5268,7 @@ xfs_bunmapi( * This is better than zeroing it. */ ASSERT(del.br_state == XFS_EXT_NORM); - ASSERT(xfs_trans_get_block_res(tp) > 0); + ASSERT(tp->t_blk_res > 0); /* * If this spans a realtime extent boundary, * chop it back to the start of the one we end at. @@ -5241,7 +5299,7 @@ xfs_bunmapi( del.br_startblock += mod; } else if ((del.br_startoff == start && (del.br_state == XFS_EXT_UNWRITTEN || - xfs_trans_get_block_res(tp) == 0)) || + tp->t_blk_res == 0)) || !xfs_sb_version_hasextflgbit(&mp->m_sb)) { /* * Can't make it unwritten. There isn't @@ -5296,9 +5354,37 @@ xfs_bunmapi( goto nodelete; } } + + /* + * If it's the case where the directory code is running + * with no block reservation, and the deleted block is in + * the middle of its extent, and the resulting insert + * of an extent would cause transformation to btree format, + * then reject it. The calling code will then swap + * blocks around instead. + * We have to do this now, rather than waiting for the + * conversion to btree format, since the transaction + * will be dirty. + */ + if (!wasdel && tp->t_blk_res == 0 && + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */ + XFS_IFORK_MAXEXT(ip, whichfork) && + del.br_startoff > got.br_startoff && + del.br_startoff + del.br_blockcount < + got.br_startoff + got.br_blockcount) { + error = -ENOSPC; + goto error0; + } + + /* + * Unreserve quota and update realtime free space, if + * appropriate. If delayed allocation, update the inode delalloc + * counter now and wait to update the sb counters as + * xfs_bmap_del_extent() might need to borrow some blocks. 
+ */ if (wasdel) { ASSERT(startblockval(del.br_startblock) > 0); - /* Update realtime/data freespace, unreserve quota */ if (isrt) { xfs_filblks_t rtexts; @@ -5309,8 +5395,6 @@ xfs_bunmapi( ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_RTBLKS); } else { - xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, - false); (void)xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_REGBLKS); @@ -5321,32 +5405,16 @@ xfs_bunmapi( XFS_BTCUR_BPRV_WASDEL; } else if (cur) cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; - /* - * If it's the case where the directory code is running - * with no block reservation, and the deleted block is in - * the middle of its extent, and the resulting insert - * of an extent would cause transformation to btree format, - * then reject it. The calling code will then swap - * blocks around instead. - * We have to do this now, rather than waiting for the - * conversion to btree format, since the transaction - * will be dirty. - */ - if (!wasdel && xfs_trans_get_block_res(tp) == 0 && - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */ - XFS_IFORK_MAXEXT(ip, whichfork) && - del.br_startoff > got.br_startoff && - del.br_startoff + del.br_blockcount < - got.br_startoff + got.br_blockcount) { - error = -ENOSPC; - goto error0; - } + error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del, &tmp_logflags, whichfork); logflags |= tmp_logflags; if (error) goto error0; + + if (!isrt && wasdel) + xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false); + bno = del.br_startoff - 1; nodelete: /* diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 1637c37bfbaa..6282f6e708af 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -461,7 +461,7 @@ xfs_bmbt_alloc_block( * reservation amount is insufficient then we may fail a * block allocation here and corrupt the filesystem. */ - args.minleft = xfs_trans_get_block_res(args.tp); + args.minleft = args.tp->t_blk_res; } else if (cur->bc_private.b.flist->xbf_low) { args.type = XFS_ALLOCTYPE_START_BNO; } else { @@ -470,7 +470,7 @@ xfs_bmbt_alloc_block( args.minlen = args.maxlen = args.prod = 1; args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; - if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) { + if (!args.wasdel && args.tp->t_blk_res == 0) { error = -ENOSPC; goto error0; } @@ -531,7 +531,6 @@ xfs_bmbt_free_block( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); - xfs_trans_binval(tp, bp); return 0; } diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index a0eb18ce3ad3..1f88e1ce770f 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -294,6 +294,21 @@ xfs_btree_sblock_verify_crc( return true; } +static int +xfs_btree_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + int error; + + error = cur->bc_ops->free_block(cur, bp); + if (!error) { + xfs_trans_binval(cur->bc_tp, bp); + XFS_BTREE_STATS_INC(cur, free); + } + return error; +} + /* * Delete the btree cursor. 
*/ @@ -3209,6 +3224,7 @@ xfs_btree_kill_iroot( int level; int index; int numrecs; + int error; #ifdef DEBUG union xfs_btree_ptr ptr; int i; @@ -3272,8 +3288,6 @@ xfs_btree_kill_iroot( cpp = xfs_btree_ptr_addr(cur, 1, cblock); #ifdef DEBUG for (i = 0; i < numrecs; i++) { - int error; - error = xfs_btree_check_ptr(cur, cpp, i, level - 1); if (error) { XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); @@ -3283,8 +3297,11 @@ xfs_btree_kill_iroot( #endif xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); - cur->bc_ops->free_block(cur, cbp); - XFS_BTREE_STATS_INC(cur, free); + error = xfs_btree_free_block(cur, cbp); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } cur->bc_bufs[level - 1] = NULL; be16_add_cpu(&block->bb_level, -1); @@ -3317,14 +3334,12 @@ xfs_btree_kill_root( */ cur->bc_ops->set_root(cur, newroot, -1); - error = cur->bc_ops->free_block(cur, bp); + error = xfs_btree_free_block(cur, bp); if (error) { XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } - XFS_BTREE_STATS_INC(cur, free); - cur->bc_bufs[level] = NULL; cur->bc_ra[level] = 0; cur->bc_nlevels--; @@ -3830,10 +3845,9 @@ xfs_btree_delrec( } /* Free the deleted block. */ - error = cur->bc_ops->free_block(cur, rbp); + error = xfs_btree_free_block(cur, rbp); if (error) goto error0; - XFS_BTREE_STATS_INC(cur, free); /* * If we joined with the left neighbor, set the buffer in the diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index b14bbd6bb05f..8d4d8bce41bf 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -641,6 +641,22 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp) */ #define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ +/* + * Entries are packed toward the top as tight as possible. + */ +typedef struct xfs_attr_shortform { + struct xfs_attr_sf_hdr { /* constant-structure header block */ + __be16 totsize; /* total bytes in shortform list */ + __u8 count; /* count of active entries */ + } hdr; + struct xfs_attr_sf_entry { + __uint8_t namelen; /* actual length of name (no NULL) */ + __uint8_t valuelen; /* actual length of value (no NULL) */ + __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ + __uint8_t nameval[1]; /* name & value bytes concatenated */ + } list[1]; /* variable sized array */ +} xfs_attr_shortform_t; + typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ __be16 base; /* base of free region */ __be16 size; /* length of free region */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 2fb53a5c0a74..af0f9d171f8a 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -176,7 +176,7 @@ xfs_dir_isempty( { xfs_dir2_sf_hdr_t *sfp; - ASSERT(S_ISDIR(dp->i_d.di_mode)); + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); if (dp->i_d.di_size == 0) /* might happen during shutdown. 
*/ return 1; if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) @@ -231,7 +231,7 @@ xfs_dir_init( struct xfs_da_args *args; int error; - ASSERT(S_ISDIR(dp->i_d.di_mode)); + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino); if (error) return error; @@ -266,7 +266,7 @@ xfs_dir_createname( int rval; int v; /* type-checking value */ - ASSERT(S_ISDIR(dp->i_d.di_mode)); + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); if (inum) { rval = xfs_dir_ino_validate(tp->t_mountp, inum); if (rval) @@ -364,7 +364,7 @@ xfs_dir_lookup( int v; /* type-checking value */ int lock_mode; - ASSERT(S_ISDIR(dp->i_d.di_mode)); + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_lookup); /* @@ -443,7 +443,7 @@ xfs_dir_removename( int rval; int v; /* type-checking value */ - ASSERT(S_ISDIR(dp->i_d.di_mode)); + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); @@ -505,7 +505,7 @@ xfs_dir_replace( int rval; int v; /* type-checking value */ - ASSERT(S_ISDIR(dp->i_d.di_mode)); + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); rval = xfs_dir_ino_validate(tp->t_mountp, inum); if (rval) diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 63ee03db796c..75a557432d0f 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -2235,6 +2235,9 @@ xfs_dir2_node_trim_free( dp = args->dp; tp = args->trans; + + *rvalp = 0; + /* * Read the freespace block. */ @@ -2255,7 +2258,6 @@ xfs_dir2_node_trim_free( */ if (freehdr.nused > 0) { xfs_trans_brelse(tp, bp); - *rvalp = 0; return 0; } /* diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 66d702e6b9ff..22297f9b0fd5 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2403,8 +2403,8 @@ xfs_ialloc_compute_maxlevels( maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG; - minleafrecs = mp->m_alloc_mnr[0]; - minnoderecs = mp->m_alloc_mnr[1]; + minleafrecs = mp->m_inobt_mnr[0]; + minnoderecs = mp->m_inobt_mnr[1]; maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; for (level = 1; maxblocks > 1; level++) maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index c679f3c05b63..89c21d771e35 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -125,16 +125,8 @@ xfs_inobt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - xfs_fsblock_t fsbno; - int error; - - fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)); - error = xfs_free_extent(cur->bc_tp, fsbno, 1); - if (error) - return error; - - xfs_trans_binval(cur->bc_tp, bp); - return error; + return xfs_free_extent(cur->bc_tp, + XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1); } STATIC int diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 1aabfda669b0..9d9559eb2835 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -195,28 +195,50 @@ xfs_imap_to_bp( } void -xfs_dinode_from_disk( - xfs_icdinode_t *to, - xfs_dinode_t *from) +xfs_inode_from_disk( + struct xfs_inode *ip, + struct xfs_dinode *from) { - to->di_magic = be16_to_cpu(from->di_magic); - to->di_mode = be16_to_cpu(from->di_mode); - to->di_version = from ->di_version; + struct xfs_icdinode *to = &ip->i_d; + struct inode *inode = VFS_I(ip); + + + /* + * Convert v1 inodes immediately to v2 inode format as this is the + * minimum inode version format we support in 
the rest of the code. + */ + to->di_version = from->di_version; + if (to->di_version == 1) { + set_nlink(inode, be16_to_cpu(from->di_onlink)); + to->di_projid_lo = 0; + to->di_projid_hi = 0; + to->di_version = 2; + } else { + set_nlink(inode, be32_to_cpu(from->di_nlink)); + to->di_projid_lo = be16_to_cpu(from->di_projid_lo); + to->di_projid_hi = be16_to_cpu(from->di_projid_hi); + } + to->di_format = from->di_format; - to->di_onlink = be16_to_cpu(from->di_onlink); to->di_uid = be32_to_cpu(from->di_uid); to->di_gid = be32_to_cpu(from->di_gid); - to->di_nlink = be32_to_cpu(from->di_nlink); - to->di_projid_lo = be16_to_cpu(from->di_projid_lo); - to->di_projid_hi = be16_to_cpu(from->di_projid_hi); - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); to->di_flushiter = be16_to_cpu(from->di_flushiter); - to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); - to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec); - to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec); - to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec); - to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec); - to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec); + + /* + * Time is signed, so need to convert to signed 32 bit before + * storing in inode timestamp which may be 64 bit. Otherwise + * a time before epoch is converted to a time long after epoch + * on 64 bit systems. + */ + inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec); + inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec); + inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec); + inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec); + inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec); + inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec); + inode->i_generation = be32_to_cpu(from->di_gen); + inode->i_mode = be16_to_cpu(from->di_mode); + to->di_size = be64_to_cpu(from->di_size); to->di_nblocks = be64_to_cpu(from->di_nblocks); to->di_extsize = be32_to_cpu(from->di_extsize); @@ -227,42 +249,96 @@ xfs_dinode_from_disk( to->di_dmevmask = be32_to_cpu(from->di_dmevmask); to->di_dmstate = be16_to_cpu(from->di_dmstate); to->di_flags = be16_to_cpu(from->di_flags); - to->di_gen = be32_to_cpu(from->di_gen); if (to->di_version == 3) { - to->di_changecount = be64_to_cpu(from->di_changecount); + inode->i_version = be64_to_cpu(from->di_changecount); to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); to->di_flags2 = be64_to_cpu(from->di_flags2); - to->di_ino = be64_to_cpu(from->di_ino); - to->di_lsn = be64_to_cpu(from->di_lsn); - memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); - uuid_copy(&to->di_uuid, &from->di_uuid); } } void -xfs_dinode_to_disk( - xfs_dinode_t *to, - xfs_icdinode_t *from) +xfs_inode_to_disk( + struct xfs_inode *ip, + struct xfs_dinode *to, + xfs_lsn_t lsn) +{ + struct xfs_icdinode *from = &ip->i_d; + struct inode *inode = VFS_I(ip); + + to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + to->di_onlink = 0; + + to->di_version = from->di_version; + to->di_format = from->di_format; + to->di_uid = cpu_to_be32(from->di_uid); + to->di_gid = cpu_to_be32(from->di_gid); + to->di_projid_lo = cpu_to_be16(from->di_projid_lo); + to->di_projid_hi = cpu_to_be16(from->di_projid_hi); + + memset(to->di_pad, 0, sizeof(to->di_pad)); + to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec); + to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec); + to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec); + 
to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec); + to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec); + to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec); + to->di_nlink = cpu_to_be32(inode->i_nlink); + to->di_gen = cpu_to_be32(inode->i_generation); + to->di_mode = cpu_to_be16(inode->i_mode); + + to->di_size = cpu_to_be64(from->di_size); + to->di_nblocks = cpu_to_be64(from->di_nblocks); + to->di_extsize = cpu_to_be32(from->di_extsize); + to->di_nextents = cpu_to_be32(from->di_nextents); + to->di_anextents = cpu_to_be16(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = cpu_to_be32(from->di_dmevmask); + to->di_dmstate = cpu_to_be16(from->di_dmstate); + to->di_flags = cpu_to_be16(from->di_flags); + + if (from->di_version == 3) { + to->di_changecount = cpu_to_be64(inode->i_version); + to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); + to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); + to->di_flags2 = cpu_to_be64(from->di_flags2); + + to->di_ino = cpu_to_be64(ip->i_ino); + to->di_lsn = cpu_to_be64(lsn); + memset(to->di_pad2, 0, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); + to->di_flushiter = 0; + } else { + to->di_flushiter = cpu_to_be16(from->di_flushiter); + } +} + +void +xfs_log_dinode_to_disk( + struct xfs_log_dinode *from, + struct xfs_dinode *to) { to->di_magic = cpu_to_be16(from->di_magic); to->di_mode = cpu_to_be16(from->di_mode); - to->di_version = from ->di_version; + to->di_version = from->di_version; to->di_format = from->di_format; - to->di_onlink = cpu_to_be16(from->di_onlink); + to->di_onlink = 0; to->di_uid = cpu_to_be32(from->di_uid); to->di_gid = cpu_to_be32(from->di_gid); to->di_nlink = cpu_to_be32(from->di_nlink); to->di_projid_lo = cpu_to_be16(from->di_projid_lo); to->di_projid_hi = cpu_to_be16(from->di_projid_hi); memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); + to->di_size = cpu_to_be64(from->di_size); to->di_nblocks = cpu_to_be64(from->di_nblocks); to->di_extsize = cpu_to_be32(from->di_extsize); @@ -367,13 +443,10 @@ xfs_iread( !(mp->m_flags & XFS_MOUNT_IKEEP)) { /* initialise the on-disk inode core */ memset(&ip->i_d, 0, sizeof(ip->i_d)); - ip->i_d.di_magic = XFS_DINODE_MAGIC; - ip->i_d.di_gen = prandom_u32(); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + VFS_I(ip)->i_generation = prandom_u32(); + if (xfs_sb_version_hascrc(&mp->m_sb)) ip->i_d.di_version = 3; - ip->i_d.di_ino = ip->i_ino; - uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid); - } else + else ip->i_d.di_version = 2; return 0; } @@ -403,7 +476,7 @@ xfs_iread( * Otherwise, just get the truly permanent information. */ if (dip->di_mode) { - xfs_dinode_from_disk(&ip->i_d, dip); + xfs_inode_from_disk(ip, dip); error = xfs_iformat_fork(ip, dip); if (error) { #ifdef DEBUG @@ -417,16 +490,10 @@ xfs_iread( * Partial initialisation of the in-core inode. Just the bits * that xfs_ialloc won't overwrite or relies on being correct. 
*/ - ip->i_d.di_magic = be16_to_cpu(dip->di_magic); ip->i_d.di_version = dip->di_version; - ip->i_d.di_gen = be32_to_cpu(dip->di_gen); + VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen); ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); - if (dip->di_version == 3) { - ip->i_d.di_ino = be64_to_cpu(dip->di_ino); - uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid); - } - /* * Make sure to pull in the mode here as well in * case the inode is released without being used. @@ -434,25 +501,10 @@ xfs_iread( * the inode is already free and not try to mess * with the uninitialized part of it. */ - ip->i_d.di_mode = 0; - } - - /* - * Automatically convert version 1 inode formats in memory to version 2 - * inode format. If the inode is modified, it will get logged and - * rewritten as a version 2 inode. We can do this because we set the - * superblock feature bit for v2 inodes unconditionally during mount - * and it means the reast of the code can assume the inode version is 2 - * or higher. - */ - if (ip->i_d.di_version == 1) { - ip->i_d.di_version = 2; - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - ip->i_d.di_nlink = ip->i_d.di_onlink; - ip->i_d.di_onlink = 0; - xfs_set_projid(ip, 0); + VFS_I(ip)->i_mode = 0; } + ASSERT(ip->i_d.di_version >= 2); ip->i_delayed_blks = 0; /* diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 9308c47f2a52..7c4dd321b215 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -20,7 +20,36 @@ struct xfs_inode; struct xfs_dinode; -struct xfs_icdinode; + +/* + * In memory representation of the XFS inode. This is held in the in-core struct + * xfs_inode and represents the current on disk values but the structure is not + * in on-disk format. That is, this structure is always translated to on-disk + * format specific structures at the appropriate time. + */ +struct xfs_icdinode { + __int8_t di_version; /* inode version */ + __int8_t di_format; /* format of di_c data */ + __uint16_t di_flushiter; /* incremented on flush */ + __uint32_t di_uid; /* owner's user id */ + __uint32_t di_gid; /* owner's group id */ + __uint16_t di_projid_lo; /* lower part of owner's project id */ + __uint16_t di_projid_hi; /* higher part of owner's project id */ + xfs_fsize_t di_size; /* number of bytes in file */ + xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ + xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ + xfs_extnum_t di_nextents; /* number of extents in data fork */ + xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ + __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ + __int8_t di_aformat; /* format of attr fork's data */ + __uint32_t di_dmevmask; /* DMIG event mask */ + __uint16_t di_dmstate; /* DMIG state info */ + __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ + + __uint64_t di_flags2; /* more random flags */ + + xfs_ictimestamp_t di_crtime; /* time created */ +}; /* * Inode location information. 
Stored in the inode and passed to @@ -38,8 +67,11 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); -void xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from); -void xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from); +void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to, + xfs_lsn_t lsn); +void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); +void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, + struct xfs_dinode *to); #if defined(DEBUG) void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 0defbd02f62d..11faf7df14c8 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -31,6 +31,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_attr_sf.h" +#include "xfs_da_format.h" kmem_zone_t *xfs_ifork_zone; @@ -120,7 +121,7 @@ xfs_iformat_fork( return -EFSCORRUPTED; } - switch (ip->i_d.di_mode & S_IFMT) { + switch (VFS_I(ip)->i_mode & S_IFMT) { case S_IFIFO: case S_IFCHR: case S_IFBLK: diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 265314690415..d54a8018b079 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -290,6 +290,7 @@ typedef struct xfs_inode_log_format_64 { __int32_t ilf_boffset; /* off of inode in buffer */ } xfs_inode_log_format_64_t; + /* * Flags for xfs_trans_log_inode flags field. */ @@ -360,15 +361,15 @@ typedef struct xfs_ictimestamp { } xfs_ictimestamp_t; /* - * NOTE: This structure must be kept identical to struct xfs_dinode - * except for the endianness annotations. + * Define the format of the inode core that is logged. This structure must be + * kept identical to struct xfs_dinode except for the endianness annotations. 
*/ -typedef struct xfs_icdinode { +struct xfs_log_dinode { __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ __uint16_t di_mode; /* mode and type of file */ __int8_t di_version; /* inode version */ __int8_t di_format; /* format of di_c data */ - __uint16_t di_onlink; /* old number of links to file */ + __uint8_t di_pad3[2]; /* unused in v2/3 inodes */ __uint32_t di_uid; /* owner's user id */ __uint32_t di_gid; /* owner's group id */ __uint32_t di_nlink; /* number of links to file */ @@ -407,13 +408,13 @@ typedef struct xfs_icdinode { uuid_t di_uuid; /* UUID of the filesystem */ /* structure must be padded to 64 bit alignment */ -} xfs_icdinode_t; +}; -static inline uint xfs_icdinode_size(int version) +static inline uint xfs_log_dinode_size(int version) { if (version == 3) - return sizeof(struct xfs_icdinode); - return offsetof(struct xfs_icdinode, di_next_unlinked); + return sizeof(struct xfs_log_dinode); + return offsetof(struct xfs_log_dinode, di_next_unlinked); } /* @@ -495,6 +496,8 @@ enum xfs_blft { XFS_BLFT_ATTR_LEAF_BUF, XFS_BLFT_ATTR_RMT_BUF, XFS_BLFT_SB_BUF, + XFS_BLFT_RTBITMAP_BUF, + XFS_BLFT_RTSUMMARY_BUF, XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS), }; diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index f51078f1e92a..8eed51275bb3 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -37,7 +37,7 @@ typedef __uint16_t xfs_qwarncnt_t; #define XFS_DQ_PROJ 0x0002 /* project quota */ #define XFS_DQ_GROUP 0x0004 /* a group quota */ #define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ -#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */ +#define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */ #define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) @@ -116,6 +116,7 @@ typedef __uint16_t xfs_qwarncnt_t; #define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ #define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ +#define XFS_QMOPT_DQNEXT 0x0008000 /* return next dquot >= this ID */ /* * flags to xfs_trans_mod_dquot to indicate which field needs to be diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 9b59ffa1fc19..951c044e24e4 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -42,6 +42,31 @@ */ /* + * Real time buffers need verifiers to avoid runtime warnings during IO. + * We don't have anything to verify, however, so these are just dummy + * operations. + */ +static void +xfs_rtbuf_verify_read( + struct xfs_buf *bp) +{ + return; +} + +static void +xfs_rtbuf_verify_write( + struct xfs_buf *bp) +{ + return; +} + +const struct xfs_buf_ops xfs_rtbuf_ops = { + .name = "rtbuf", + .verify_read = xfs_rtbuf_verify_read, + .verify_write = xfs_rtbuf_verify_write, +}; + +/* * Get a buffer for the bitmap or summary file block specified. * The buffer is returned read and locked. */ @@ -68,9 +93,12 @@ xfs_rtbuf_get( ASSERT(map.br_startblock != NULLFSBLOCK); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), - mp->m_bsize, 0, &bp, NULL); + mp->m_bsize, 0, &bp, &xfs_rtbuf_ops); if (error) return error; + + xfs_trans_buf_set_type(tp, bp, issum ? 
XFS_BLFT_RTSUMMARY_BUF + : XFS_BLFT_RTBITMAP_BUF); *bpp = bp; return 0; } @@ -983,7 +1011,7 @@ xfs_rtfree_extent( mp->m_sb.sb_rextents) { if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; - *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0; + *(__uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0; xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); } return 0; diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index b25bb9a343f3..961e6475a309 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -27,7 +27,6 @@ extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t, extern void xfs_perag_put(struct xfs_perag *pag); extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); -extern void xfs_sb_calc_crc(struct xfs_buf *bp); extern void xfs_log_sb(struct xfs_trans *tp); extern int xfs_sync_sb(struct xfs_mount *mp, bool wait); extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 15c3ceb845b9..81ac870834da 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -53,6 +53,7 @@ extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops; extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; +extern const struct xfs_buf_ops xfs_rtbuf_ops; /* * Transaction types. Used to distinguish types of buffers. These never reach diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 5c57b7b40728..d445a64b979e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -36,6 +36,21 @@ #include <linux/pagevec.h> #include <linux/writeback.h> +/* flags for direct write completions */ +#define XFS_DIO_FLAG_UNWRITTEN (1 << 0) +#define XFS_DIO_FLAG_APPEND (1 << 1) + +/* + * structure owned by writepages passed to individual writepage calls + */ +struct xfs_writepage_ctx { + struct xfs_bmbt_irec imap; + bool imap_valid; + unsigned int io_type; + struct xfs_ioend *ioend; + sector_t last_block; +}; + void xfs_count_page_state( struct page *page, @@ -214,10 +229,12 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); int error = 0; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + /* + * Set an error if the mount has shut down and proceed with end I/O + * processing so it can perform whatever cleanups are necessary. 
+ */ + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) ioend->io_error = -EIO; - goto done; - } /* * For unwritten extents we need to issue transactions to convert a @@ -265,7 +282,7 @@ xfs_alloc_ioend( */ atomic_set(&ioend->io_remaining, 1); ioend->io_error = 0; - ioend->io_list = NULL; + INIT_LIST_HEAD(&ioend->io_list); ioend->io_type = type; ioend->io_inode = inode; ioend->io_buffer_head = NULL; @@ -283,8 +300,7 @@ xfs_map_blocks( struct inode *inode, loff_t offset, struct xfs_bmbt_irec *imap, - int type, - int nonblocking) + int type) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -300,12 +316,7 @@ xfs_map_blocks( if (type == XFS_IO_UNWRITTEN) bmapi_flags |= XFS_BMAPI_IGSTATE; - if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { - if (nonblocking) - return -EAGAIN; - xfs_ilock(ip, XFS_ILOCK_SHARED); - } - + xfs_ilock(ip, XFS_ILOCK_SHARED); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); ASSERT(offset <= mp->m_super->s_maxbytes); @@ -341,7 +352,7 @@ xfs_map_blocks( return 0; } -STATIC int +STATIC bool xfs_imap_valid( struct inode *inode, struct xfs_bmbt_irec *imap, @@ -414,8 +425,7 @@ xfs_start_buffer_writeback( STATIC void xfs_start_page_writeback( struct page *page, - int clear_dirty, - int buffers) + int clear_dirty) { ASSERT(PageLocked(page)); ASSERT(!PageWriteback(page)); @@ -434,10 +444,6 @@ xfs_start_page_writeback( set_page_writeback_keepwrite(page); unlock_page(page); - - /* If no buffers on the page are to be written, finish it here */ - if (!buffers) - end_page_writeback(page); } static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh) @@ -446,153 +452,101 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh) } /* - * Submit all of the bios for all of the ioends we have saved up, covering the - * initial writepage page and also any probed pages. - * - * Because we may have multiple ioends spanning a page, we need to start - * writeback on all the buffers before we submit them for I/O. If we mark the - * buffers as we got, then we can end up with a page that only has buffers - * marked async write and I/O complete on can occur before we mark the other - * buffers async write. - * - * The end result of this is that we trip a bug in end_page_writeback() because - * we call it twice for the one page as the code in end_buffer_async_write() - * assumes that all buffers on the page are started at the same time. - * - * The fix is two passes across the ioend list - one to start writeback on the - * buffer_heads, and then submit them for I/O on the second pass. + * Submit all of the bios for an ioend. We are only passed a single ioend at a + * time; the caller is responsible for chaining prior to submission. * * If @fail is non-zero, it means that we have a situation where some part of * the submission process has failed after we have marked paged for writeback * and unlocked them. In this situation, we need to fail the ioend chain rather * than submit it to IO. This typically only happens on a filesystem shutdown. 
*/ -STATIC void +STATIC int xfs_submit_ioend( struct writeback_control *wbc, xfs_ioend_t *ioend, - int fail) + int status) { - xfs_ioend_t *head = ioend; - xfs_ioend_t *next; struct buffer_head *bh; struct bio *bio; sector_t lastblock = 0; - /* Pass 1 - start writeback */ - do { - next = ioend->io_list; - for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) - xfs_start_buffer_writeback(bh); - } while ((ioend = next) != NULL); + /* Reserve log space if we might write beyond the on-disk inode size. */ + if (!status && + ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) + status = xfs_setfilesize_trans_alloc(ioend); + /* + * If we are failing the IO now, just mark the ioend with an + * error and finish it. This will run IO completion immediately + * as there is only one reference to the ioend at this point in + * time. + */ + if (status) { + ioend->io_error = status; + xfs_finish_ioend(ioend); + return status; + } - /* Pass 2 - submit I/O */ - ioend = head; - do { - next = ioend->io_list; - bio = NULL; + bio = NULL; + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { - /* - * If we are failing the IO now, just mark the ioend with an - * error and finish it. This will run IO completion immediately - * as there is only one reference to the ioend at this point in - * time. - */ - if (fail) { - ioend->io_error = fail; - xfs_finish_ioend(ioend); - continue; + if (!bio) { +retry: + bio = xfs_alloc_ioend_bio(bh); + } else if (bh->b_blocknr != lastblock + 1) { + xfs_submit_ioend_bio(wbc, ioend, bio); + goto retry; } - for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { - - if (!bio) { - retry: - bio = xfs_alloc_ioend_bio(bh); - } else if (bh->b_blocknr != lastblock + 1) { - xfs_submit_ioend_bio(wbc, ioend, bio); - goto retry; - } - - if (xfs_bio_add_buffer(bio, bh) != bh->b_size) { - xfs_submit_ioend_bio(wbc, ioend, bio); - goto retry; - } - - lastblock = bh->b_blocknr; - } - if (bio) + if (xfs_bio_add_buffer(bio, bh) != bh->b_size) { xfs_submit_ioend_bio(wbc, ioend, bio); - xfs_finish_ioend(ioend); - } while ((ioend = next) != NULL); -} - -/* - * Cancel submission of all buffer_heads so far in this endio. - * Toss the endio too. Only ever called for the initial page - * in a writepage request, so only ever one page. - */ -STATIC void -xfs_cancel_ioend( - xfs_ioend_t *ioend) -{ - xfs_ioend_t *next; - struct buffer_head *bh, *next_bh; - - do { - next = ioend->io_list; - bh = ioend->io_buffer_head; - do { - next_bh = bh->b_private; - clear_buffer_async_write(bh); - /* - * The unwritten flag is cleared when added to the - * ioend. We're not submitting for I/O so mark the - * buffer unwritten again for next time around. - */ - if (ioend->io_type == XFS_IO_UNWRITTEN) - set_buffer_unwritten(bh); - unlock_buffer(bh); - } while ((bh = next_bh) != NULL); + goto retry; + } - mempool_free(ioend, xfs_ioend_pool); - } while ((ioend = next) != NULL); + lastblock = bh->b_blocknr; + } + if (bio) + xfs_submit_ioend_bio(wbc, ioend, bio); + xfs_finish_ioend(ioend); + return 0; } /* * Test to see if we've been building up a completion structure for * earlier buffers -- if so, we try to append to this ioend if we * can, otherwise we finish off any current ioend and start another. - * Return true if we've finished the given ioend. + * Return the ioend we finished off so that the caller can submit it + * once it has finished processing the dirty page. 
*/ STATIC void xfs_add_to_ioend( struct inode *inode, struct buffer_head *bh, xfs_off_t offset, - unsigned int type, - xfs_ioend_t **result, - int need_ioend) + struct xfs_writepage_ctx *wpc, + struct list_head *iolist) { - xfs_ioend_t *ioend = *result; - - if (!ioend || need_ioend || type != ioend->io_type) { - xfs_ioend_t *previous = *result; - - ioend = xfs_alloc_ioend(inode, type); - ioend->io_offset = offset; - ioend->io_buffer_head = bh; - ioend->io_buffer_tail = bh; - if (previous) - previous->io_list = ioend; - *result = ioend; + if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type || + bh->b_blocknr != wpc->last_block + 1 || + offset != wpc->ioend->io_offset + wpc->ioend->io_size) { + struct xfs_ioend *new; + + if (wpc->ioend) + list_add(&wpc->ioend->io_list, iolist); + + new = xfs_alloc_ioend(inode, wpc->io_type); + new->io_offset = offset; + new->io_buffer_head = bh; + new->io_buffer_tail = bh; + wpc->ioend = new; } else { - ioend->io_buffer_tail->b_private = bh; - ioend->io_buffer_tail = bh; + wpc->ioend->io_buffer_tail->b_private = bh; + wpc->ioend->io_buffer_tail = bh; } bh->b_private = NULL; - ioend->io_size += bh->b_size; + wpc->ioend->io_size += bh->b_size; + wpc->last_block = bh->b_blocknr; + xfs_start_buffer_writeback(bh); } STATIC void @@ -678,183 +632,6 @@ xfs_check_page_type( return false; } -/* - * Allocate & map buffers for page given the extent map. Write it out. - * except for the original page of a writepage, this is called on - * delalloc/unwritten pages only, for the original page it is possible - * that the page has no mapping at all. - */ -STATIC int -xfs_convert_page( - struct inode *inode, - struct page *page, - loff_t tindex, - struct xfs_bmbt_irec *imap, - xfs_ioend_t **ioendp, - struct writeback_control *wbc) -{ - struct buffer_head *bh, *head; - xfs_off_t end_offset; - unsigned long p_offset; - unsigned int type; - int len, page_dirty; - int count = 0, done = 0, uptodate = 1; - xfs_off_t offset = page_offset(page); - - if (page->index != tindex) - goto fail; - if (!trylock_page(page)) - goto fail; - if (PageWriteback(page)) - goto fail_unlock_page; - if (page->mapping != inode->i_mapping) - goto fail_unlock_page; - if (!xfs_check_page_type(page, (*ioendp)->io_type, false)) - goto fail_unlock_page; - - /* - * page_dirty is initially a count of buffers on the page before - * EOF and is decremented as we move each into a cleanable state. - * - * Derivation: - * - * End offset is the highest offset that this page should represent. - * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) - * will evaluate non-zero and be less than PAGE_CACHE_SIZE and - * hence give us the correct page_dirty count. On any other page, - * it will be zero and in that case we need page_dirty to be the - * count of buffers on the page. - */ - end_offset = min_t(unsigned long long, - (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, - i_size_read(inode)); - - /* - * If the current map does not span the entire page we are about to try - * to write, then give up. The only way we can write a page that spans - * multiple mappings in a single writeback iteration is via the - * xfs_vm_writepage() function. Data integrity writeback requires the - * entire page to be written in a single attempt, otherwise the part of - * the page we don't write here doesn't get written as part of the data - * integrity sync. 
- * - * For normal writeback, we also don't attempt to write partial pages - * here as it simply means that write_cache_pages() will see it under - * writeback and ignore the page until some point in the future, at - * which time this will be the only page in the file that needs - * writeback. Hence for more optimal IO patterns, we should always - * avoid partial page writeback due to multiple mappings on a page here. - */ - if (!xfs_imap_valid(inode, imap, end_offset)) - goto fail_unlock_page; - - len = 1 << inode->i_blkbits; - p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), - PAGE_CACHE_SIZE); - p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; - page_dirty = p_offset / len; - - /* - * The moment we find a buffer that doesn't match our current type - * specification or can't be written, abort the loop and start - * writeback. As per the above xfs_imap_valid() check, only - * xfs_vm_writepage() can handle partial page writeback fully - we are - * limited here to the buffers that are contiguous with the current - * ioend, and hence a buffer we can't write breaks that contiguity and - * we have to defer the rest of the IO to xfs_vm_writepage(). - */ - bh = head = page_buffers(page); - do { - if (offset >= end_offset) - break; - if (!buffer_uptodate(bh)) - uptodate = 0; - if (!(PageUptodate(page) || buffer_uptodate(bh))) { - done = 1; - break; - } - - if (buffer_unwritten(bh) || buffer_delay(bh) || - buffer_mapped(bh)) { - if (buffer_unwritten(bh)) - type = XFS_IO_UNWRITTEN; - else if (buffer_delay(bh)) - type = XFS_IO_DELALLOC; - else - type = XFS_IO_OVERWRITE; - - /* - * imap should always be valid because of the above - * partial page end_offset check on the imap. - */ - ASSERT(xfs_imap_valid(inode, imap, offset)); - - lock_buffer(bh); - if (type != XFS_IO_OVERWRITE) - xfs_map_at_offset(inode, bh, imap, offset); - xfs_add_to_ioend(inode, bh, offset, type, - ioendp, done); - - page_dirty--; - count++; - } else { - done = 1; - break; - } - } while (offset += len, (bh = bh->b_this_page) != head); - - if (uptodate && bh == head) - SetPageUptodate(page); - - if (count) { - if (--wbc->nr_to_write <= 0 && - wbc->sync_mode == WB_SYNC_NONE) - done = 1; - } - xfs_start_page_writeback(page, !page_dirty, count); - - return done; - fail_unlock_page: - unlock_page(page); - fail: - return 1; -} - -/* - * Convert & write out a cluster of pages in the same extent as defined - * by mp and following the start page. - */ -STATIC void -xfs_cluster_write( - struct inode *inode, - pgoff_t tindex, - struct xfs_bmbt_irec *imap, - xfs_ioend_t **ioendp, - struct writeback_control *wbc, - pgoff_t tlast) -{ - struct pagevec pvec; - int done = 0, i; - - pagevec_init(&pvec, 0); - while (!done && tindex <= tlast) { - unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); - - if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) - break; - - for (i = 0; i < pagevec_count(&pvec); i++) { - done = xfs_convert_page(inode, pvec.pages[i], tindex++, - imap, ioendp, wbc); - if (done) - break; - } - - pagevec_release(&pvec); - cond_resched(); - } -} - STATIC void xfs_vm_invalidatepage( struct page *page, @@ -932,6 +709,164 @@ out_invalidate: } /* + * We implement an immediate ioend submission policy here to avoid needing to + * chain multiple ioends and hence nest mempool allocations which can violate + * forward progress guarantees we need to provide. 
The current ioend we are + * adding buffers to is cached on the writepage context, and if the new buffer + * does not append to the cached ioend it will create a new ioend and cache that + * instead. + * + * If a new ioend is created and cached, the old ioend is returned and queued + * locally for submission once the entire page is processed or an error has been + * detected. While ioends are submitted immediately after they are completed, + * batching optimisations are provided by higher level block plugging. + * + * At the end of a writeback pass, there will be a cached ioend remaining on the + * writepage context that the caller will need to submit. + */ +static int +xfs_writepage_map( + struct xfs_writepage_ctx *wpc, + struct writeback_control *wbc, + struct inode *inode, + struct page *page, + loff_t offset, + __uint64_t end_offset) +{ + LIST_HEAD(submit_list); + struct xfs_ioend *ioend, *next; + struct buffer_head *bh, *head; + ssize_t len = 1 << inode->i_blkbits; + int error = 0; + int count = 0; + int uptodate = 1; + + bh = head = page_buffers(page); + offset = page_offset(page); + do { + if (offset >= end_offset) + break; + if (!buffer_uptodate(bh)) + uptodate = 0; + + /* + * set_page_dirty dirties all buffers in a page, independent + * of their state. The dirty state however is entirely + * meaningless for holes (!mapped && uptodate), so skip + * buffers covering holes here. + */ + if (!buffer_mapped(bh) && buffer_uptodate(bh)) { + wpc->imap_valid = false; + continue; + } + + if (buffer_unwritten(bh)) { + if (wpc->io_type != XFS_IO_UNWRITTEN) { + wpc->io_type = XFS_IO_UNWRITTEN; + wpc->imap_valid = false; + } + } else if (buffer_delay(bh)) { + if (wpc->io_type != XFS_IO_DELALLOC) { + wpc->io_type = XFS_IO_DELALLOC; + wpc->imap_valid = false; + } + } else if (buffer_uptodate(bh)) { + if (wpc->io_type != XFS_IO_OVERWRITE) { + wpc->io_type = XFS_IO_OVERWRITE; + wpc->imap_valid = false; + } + } else { + if (PageUptodate(page)) + ASSERT(buffer_mapped(bh)); + /* + * This buffer is not uptodate and will not be + * written to disk. Ensure that we will put any + * subsequent writeable buffers into a new + * ioend. + */ + wpc->imap_valid = false; + continue; + } + + if (wpc->imap_valid) + wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, + offset); + if (!wpc->imap_valid) { + error = xfs_map_blocks(inode, offset, &wpc->imap, + wpc->io_type); + if (error) + goto out; + wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, + offset); + } + if (wpc->imap_valid) { + lock_buffer(bh); + if (wpc->io_type != XFS_IO_OVERWRITE) + xfs_map_at_offset(inode, bh, &wpc->imap, offset); + xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list); + count++; + } + + } while (offset += len, ((bh = bh->b_this_page) != head)); + + if (uptodate && bh == head) + SetPageUptodate(page); + + ASSERT(wpc->ioend || list_empty(&submit_list)); + +out: + /* + * On error, we have to fail the ioend here because we have locked + * buffers in the ioend. If we don't do this, we'll deadlock + * invalidating the page as that tries to lock the buffers on the page. + * Also, because we may have set pages under writeback, we have to make + * sure we run IO completion to mark the error state of the IO + * appropriately, so we can't cancel the ioend directly here. That means + * we have to mark this page as under writeback if we included any + * buffers from it in the ioend chain so that completion treats it + * correctly. 
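The submission policy described above can be modelled in isolation: completed ioends are queued locally while the page is processed, then submitted in one pass, with the first error preserved and fed into later submissions so completion can mark them failed. A runnable sketch, with plain integers standing in for ioends and for xfs_submit_ioend():

#include <errno.h>
#include <stdio.h>

/* Pretend submission; the third ioend fails. prior_error is passed in
 * so "completion" could mark the IO failed, mirroring
 * xfs_submit_ioend(wbc, ioend, error). */
static int submit_one(int idx, int prior_error)
{
	(void)prior_error;
	return idx == 2 ? -EIO : 0;
}

int main(void)
{
	int error = 0;
	int i;

	for (i = 0; i < 4; i++) {		/* drain the local submit list */
		int error2 = submit_one(i, error);

		if (error2 && !error)
			error = error2;		/* keep only the first error */
	}
	printf("first error preserved: %d\n", error);
	return 0;
}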
+ * + * If we didn't include the page in the ioend, the on error we can + * simply discard and unlock it as there are no other users of the page + * or it's buffers right now. The caller will still need to trigger + * submission of outstanding ioends on the writepage context so they are + * treated correctly on error. + */ + if (count) { + xfs_start_page_writeback(page, !error); + + /* + * Preserve the original error if there was one, otherwise catch + * submission errors here and propagate into subsequent ioend + * submissions. + */ + list_for_each_entry_safe(ioend, next, &submit_list, io_list) { + int error2; + + list_del_init(&ioend->io_list); + error2 = xfs_submit_ioend(wbc, ioend, error); + if (error2 && !error) + error = error2; + } + } else if (error) { + xfs_aops_discard_page(page); + ClearPageUptodate(page); + unlock_page(page); + } else { + /* + * We can end up here with no error and nothing to write if we + * race with a partial page truncate on a sub-page block sized + * filesystem. In that case we need to mark the page clean. + */ + xfs_start_page_writeback(page, 1); + end_page_writeback(page); + } + + mapping_set_error(page->mapping, error); + return error; +} + +/* * Write out a dirty page. * * For delalloc space on the page we need to allocate space and flush it. @@ -940,22 +875,16 @@ out_invalidate: * For any other dirty buffer heads on the page we should flush them. */ STATIC int -xfs_vm_writepage( +xfs_do_writepage( struct page *page, - struct writeback_control *wbc) + struct writeback_control *wbc, + void *data) { + struct xfs_writepage_ctx *wpc = data; struct inode *inode = page->mapping->host; - struct buffer_head *bh, *head; - struct xfs_bmbt_irec imap; - xfs_ioend_t *ioend = NULL, *iohead = NULL; loff_t offset; - unsigned int type; __uint64_t end_offset; - pgoff_t end_index, last_index; - ssize_t len; - int err, imap_valid = 0, uptodate = 1; - int count = 0; - int nonblocking = 0; + pgoff_t end_index; trace_xfs_writepage(inode, page, 0, 0); @@ -982,12 +911,9 @@ xfs_vm_writepage( if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) goto redirty; - /* Is this page beyond the end of the file? */ - offset = i_size_read(inode); - end_index = offset >> PAGE_CACHE_SHIFT; - last_index = (offset - 1) >> PAGE_CACHE_SHIFT; - /* + * Is this page beyond the end of the file? + * * The page index is less than the end_index, adjust the end_offset * to the highest offset that this page should represent. * ----------------------------------------------------- @@ -998,6 +924,8 @@ xfs_vm_writepage( * | desired writeback range | see else | * ---------------------------------^------------------| */ + offset = i_size_read(inode); + end_index = offset >> PAGE_CACHE_SHIFT; if (page->index < end_index) end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT; else { @@ -1049,152 +977,7 @@ xfs_vm_writepage( end_offset = offset; } - len = 1 << inode->i_blkbits; - - bh = head = page_buffers(page); - offset = page_offset(page); - type = XFS_IO_OVERWRITE; - - if (wbc->sync_mode == WB_SYNC_NONE) - nonblocking = 1; - - do { - int new_ioend = 0; - - if (offset >= end_offset) - break; - if (!buffer_uptodate(bh)) - uptodate = 0; - - /* - * set_page_dirty dirties all buffers in a page, independent - * of their state. The dirty state however is entirely - * meaningless for holes (!mapped && uptodate), so skip - * buffers covering holes here. 
- */ - if (!buffer_mapped(bh) && buffer_uptodate(bh)) { - imap_valid = 0; - continue; - } - - if (buffer_unwritten(bh)) { - if (type != XFS_IO_UNWRITTEN) { - type = XFS_IO_UNWRITTEN; - imap_valid = 0; - } - } else if (buffer_delay(bh)) { - if (type != XFS_IO_DELALLOC) { - type = XFS_IO_DELALLOC; - imap_valid = 0; - } - } else if (buffer_uptodate(bh)) { - if (type != XFS_IO_OVERWRITE) { - type = XFS_IO_OVERWRITE; - imap_valid = 0; - } - } else { - if (PageUptodate(page)) - ASSERT(buffer_mapped(bh)); - /* - * This buffer is not uptodate and will not be - * written to disk. Ensure that we will put any - * subsequent writeable buffers into a new - * ioend. - */ - imap_valid = 0; - continue; - } - - if (imap_valid) - imap_valid = xfs_imap_valid(inode, &imap, offset); - if (!imap_valid) { - /* - * If we didn't have a valid mapping then we need to - * put the new mapping into a separate ioend structure. - * This ensures non-contiguous extents always have - * separate ioends, which is particularly important - * for unwritten extent conversion at I/O completion - * time. - */ - new_ioend = 1; - err = xfs_map_blocks(inode, offset, &imap, type, - nonblocking); - if (err) - goto error; - imap_valid = xfs_imap_valid(inode, &imap, offset); - } - if (imap_valid) { - lock_buffer(bh); - if (type != XFS_IO_OVERWRITE) - xfs_map_at_offset(inode, bh, &imap, offset); - xfs_add_to_ioend(inode, bh, offset, type, &ioend, - new_ioend); - count++; - } - - if (!iohead) - iohead = ioend; - - } while (offset += len, ((bh = bh->b_this_page) != head)); - - if (uptodate && bh == head) - SetPageUptodate(page); - - xfs_start_page_writeback(page, 1, count); - - /* if there is no IO to be submitted for this page, we are done */ - if (!ioend) - return 0; - - ASSERT(iohead); - - /* - * Any errors from this point onwards need tobe reported through the IO - * completion path as we have marked the initial page as under writeback - * and unlocked it. - */ - if (imap_valid) { - xfs_off_t end_index; - - end_index = imap.br_startoff + imap.br_blockcount; - - /* to bytes */ - end_index <<= inode->i_blkbits; - - /* to pages */ - end_index = (end_index - 1) >> PAGE_CACHE_SHIFT; - - /* check against file size */ - if (end_index > last_index) - end_index = last_index; - - xfs_cluster_write(inode, page->index + 1, &imap, &ioend, - wbc, end_index); - } - - - /* - * Reserve log space if we might write beyond the on-disk inode size. 
- */ - err = 0; - if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) - err = xfs_setfilesize_trans_alloc(ioend); - - xfs_submit_ioend(wbc, iohead, err); - - return 0; - -error: - if (iohead) - xfs_cancel_ioend(iohead); - - if (err == -EAGAIN) - goto redirty; - - xfs_aops_discard_page(page); - ClearPageUptodate(page); - unlock_page(page); - return err; + return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset); redirty: redirty_page_for_writepage(wbc, page); @@ -1203,16 +986,40 @@ redirty: } STATIC int +xfs_vm_writepage( + struct page *page, + struct writeback_control *wbc) +{ + struct xfs_writepage_ctx wpc = { + .io_type = XFS_IO_INVALID, + }; + int ret; + + ret = xfs_do_writepage(page, wbc, &wpc); + if (wpc.ioend) + ret = xfs_submit_ioend(wbc, wpc.ioend, ret); + return ret; +} + +STATIC int xfs_vm_writepages( struct address_space *mapping, struct writeback_control *wbc) { + struct xfs_writepage_ctx wpc = { + .io_type = XFS_IO_INVALID, + }; + int ret; + xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); if (dax_mapping(mapping)) return dax_writeback_mapping_range(mapping, xfs_find_bdev_for_inode(mapping->host), wbc); - return generic_writepages(mapping, wbc); + ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc); + if (wpc.ioend) + ret = xfs_submit_ioend(wbc, wpc.ioend, ret); + return ret; } /* @@ -1242,27 +1049,8 @@ xfs_vm_releasepage( } /* - * When we map a DIO buffer, we may need to attach an ioend that describes the - * type of write IO we are doing. This passes to the completion function the - * operations it needs to perform. If the mapping is for an overwrite wholly - * within the EOF then we don't need an ioend and so we don't allocate one. - * This avoids the unnecessary overhead of allocating and freeing ioends for - * workloads that don't require transactions on IO completion. - * - * If we get multiple mappings in a single IO, we might be mapping different - * types. But because the direct IO can only have a single private pointer, we - * need to ensure that: - * - * a) i) the ioend spans the entire region of unwritten mappings; or - * ii) the ioend spans all the mappings that cross or are beyond EOF; and - * b) if it contains unwritten extents, it is *permanently* marked as such - * - * We could do this by chaining ioends like buffered IO does, but we only - * actually get one IO completion callback from the direct IO, and that spans - * the entire IO regardless of how many mappings and IOs are needed to complete - * the DIO. There is only going to be one reference to the ioend and its life - * cycle is constrained by the DIO completion code. hence we don't need - * reference counting here. + * When we map a DIO buffer, we may need to pass flags to + * xfs_end_io_direct_write to tell it what kind of write IO we are doing. * * Note that for DIO, an IO to the highest supported file block offset (i.e. * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64 @@ -1270,68 +1058,26 @@ xfs_vm_releasepage( * extending the file size. We won't know for sure until IO completion is run * and the actual max write offset is communicated to the IO completion * routine. - * - * For DAX page faults, we are preparing to never see unwritten extents here, - * nor should we ever extend the inode size. Hence we will soon have nothing to - * do here for this case, ensuring we don't have to provide an IO completion - * callback to free an ioend that we don't actually need for a fault into the - * page at offset (2^63 - 1FSB) bytes. 
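The new xfs_vm_writepage() and xfs_vm_writepages() above both follow the same shape: build a writepage context on the stack, thread it through a page iterator as the opaque data argument, then submit whatever ioend is still cached when the iterator returns. A minimal standalone model of that pattern, with trivial stand-ins for write_cache_pages() and for the context:

#include <stdio.h>

struct ctx {
	int pending;		/* stands in for wpc->ioend */
};

/* Per-page callback: caches work in the context instead of submitting */
static int do_page(int page, void *data)
{
	struct ctx *c = data;

	c->pending = page;
	return 0;
}

/* Trivial stand-in for write_cache_pages(mapping, wbc, fn, data) */
static int iterate_pages(int npages, int (*fn)(int, void *), void *data)
{
	int i, ret;

	for (i = 0; i < npages; i++) {
		ret = fn(i, data);
		if (ret)
			return ret;
	}
	return 0;
}

int main(void)
{
	struct ctx c = { .pending = -1 };
	int ret = iterate_pages(4, do_page, &c);

	if (c.pending >= 0)	/* mirrors: if (wpc.ioend) xfs_submit_ioend(...) */
		printf("submit leftover work for page %d (ret=%d)\n",
		       c.pending, ret);
	return 0;
}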
*/ - static void xfs_map_direct( struct inode *inode, struct buffer_head *bh_result, struct xfs_bmbt_irec *imap, - xfs_off_t offset, - bool dax_fault) + xfs_off_t offset) { - struct xfs_ioend *ioend; + uintptr_t *flags = (uintptr_t *)&bh_result->b_private; xfs_off_t size = bh_result->b_size; - int type; - - if (ISUNWRITTEN(imap)) - type = XFS_IO_UNWRITTEN; - else - type = XFS_IO_OVERWRITE; - trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); - - if (dax_fault) { - ASSERT(type == XFS_IO_OVERWRITE); - trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, - imap); - return; - } + trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size, + ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap); - if (bh_result->b_private) { - ioend = bh_result->b_private; - ASSERT(ioend->io_size > 0); - ASSERT(offset >= ioend->io_offset); - if (offset + size > ioend->io_offset + ioend->io_size) - ioend->io_size = offset - ioend->io_offset + size; - - if (type == XFS_IO_UNWRITTEN && type != ioend->io_type) - ioend->io_type = XFS_IO_UNWRITTEN; - - trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset, - ioend->io_size, ioend->io_type, - imap); - } else if (type == XFS_IO_UNWRITTEN || - offset + size > i_size_read(inode) || - offset + size < 0) { - ioend = xfs_alloc_ioend(inode, type); - ioend->io_offset = offset; - ioend->io_size = size; - - bh_result->b_private = ioend; + if (ISUNWRITTEN(imap)) { + *flags |= XFS_DIO_FLAG_UNWRITTEN; + set_buffer_defer_completion(bh_result); + } else if (offset + size > i_size_read(inode) || offset + size < 0) { + *flags |= XFS_DIO_FLAG_APPEND; set_buffer_defer_completion(bh_result); - - trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type, - imap); - } else { - trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, - imap); } } @@ -1502,9 +1248,12 @@ __xfs_get_blocks( if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); /* direct IO needs special help */ - if (create && direct) - xfs_map_direct(inode, bh_result, &imap, offset, - dax_fault); + if (create && direct) { + if (dax_fault) + ASSERT(!ISUNWRITTEN(&imap)); + else + xfs_map_direct(inode, bh_result, &imap, offset); + } } /* @@ -1574,42 +1323,50 @@ xfs_get_blocks_dax_fault( return __xfs_get_blocks(inode, iblock, bh_result, create, true, true); } -static void -__xfs_end_io_direct_write( - struct inode *inode, - struct xfs_ioend *ioend, +/* + * Complete a direct I/O write request. + * + * xfs_map_direct passes us some flags in the private data to tell us what to + * do. If no flags are set, then the write IO is an overwrite wholly within + * the existing allocated file size and so there is nothing for us to do. + * + * Note that in this case the completion can be called in interrupt context, + * whereas if we have flags set we will always be called in task context + * (i.e. from a workqueue). + */ +STATIC int +xfs_end_io_direct_write( + struct kiocb *iocb, loff_t offset, - ssize_t size) + ssize_t size, + void *private) { - struct xfs_mount *mp = XFS_I(inode)->i_mount; + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + uintptr_t flags = (uintptr_t)private; + int error = 0; - if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error) - goto out_end_io; + trace_xfs_end_io_direct_write(ip, offset, size); - /* - * dio completion end_io functions are only called on writes if more - * than 0 bytes was written. 
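xfs_map_direct() above stops allocating an ioend per DIO write and instead packs two flags into the pointer-sized b_private field. The same trick in standalone, compilable form — the flag values here are illustrative, XFS defines XFS_DIO_FLAG_UNWRITTEN and XFS_DIO_FLAG_APPEND in its own headers — assuming, as the kernel code does, that uintptr_t and void * share a representation:

#include <stdint.h>
#include <stdio.h>

/* Illustrative values; the kernel defines its own XFS_DIO_FLAG_* */
#define DIO_FLAG_UNWRITTEN	(1 << 0)
#define DIO_FLAG_APPEND		(1 << 1)

int main(void)
{
	void *private = NULL;			/* bh_result->b_private */
	/* Alias the pointer slot as an integer, exactly as the kernel
	 * code does. */
	uintptr_t *flags = (uintptr_t *)&private;

	*flags |= DIO_FLAG_UNWRITTEN;		/* set at mapping time */

	uintptr_t got = (uintptr_t)private;	/* read back at completion */
	printf("unwritten=%d append=%d\n",
	       !!(got & DIO_FLAG_UNWRITTEN), !!(got & DIO_FLAG_APPEND));
	return 0;
}

Because there is only ever one completion callback per DIO and it carries a single private pointer, two bits of state are all the completion path needs, which is what lets the ioend allocation disappear entirely.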
- */ - ASSERT(size > 0); + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; - /* - * The ioend only maps whole blocks, while the IO may be sector aligned. - * Hence the ioend offset/size may not match the IO offset/size exactly. - * Because we don't map overwrites within EOF into the ioend, the offset - * may not match, but only if the endio spans EOF. Either way, write - * the IO sizes into the ioend so that completion processing does the - * right thing. - */ - ASSERT(offset + size <= ioend->io_offset + ioend->io_size); - ioend->io_size = size; - ioend->io_offset = offset; + if (size <= 0) + return size; /* - * The ioend tells us whether we are doing unwritten extent conversion + * The flags tell us whether we are doing unwritten extent conversions * or an append transaction that updates the on-disk file size. These * cases are the only cases where we should *potentially* be needing * to update the VFS inode size. - * + */ + if (flags == 0) { + ASSERT(offset + size <= i_size_read(inode)); + return 0; + } + + /* * We need to update the in-core inode size here so that we don't end up * with the on-disk inode size being outside the in-core inode size. We * have no other method of updating EOF for AIO, so always do it here @@ -1620,91 +1377,56 @@ __xfs_end_io_direct_write( * here can result in EOF moving backwards and Bad Things Happen when * that occurs. */ - spin_lock(&XFS_I(inode)->i_flags_lock); + spin_lock(&ip->i_flags_lock); if (offset + size > i_size_read(inode)) i_size_write(inode, offset + size); - spin_unlock(&XFS_I(inode)->i_flags_lock); + spin_unlock(&ip->i_flags_lock); - /* - * If we are doing an append IO that needs to update the EOF on disk, - * do the transaction reserve now so we can use common end io - * processing. Stashing the error (if there is one) in the ioend will - * result in the ioend processing passing on the error if it is - * possible as we can't return it from here. - */ - if (ioend->io_type == XFS_IO_OVERWRITE) - ioend->io_error = xfs_setfilesize_trans_alloc(ioend); + if (flags & XFS_DIO_FLAG_UNWRITTEN) { + trace_xfs_end_io_direct_write_unwritten(ip, offset, size); -out_end_io: - xfs_end_io(&ioend->io_work); - return; -} + error = xfs_iomap_write_unwritten(ip, offset, size); + } else if (flags & XFS_DIO_FLAG_APPEND) { + struct xfs_trans *tp; -/* - * Complete a direct I/O write request. - * - * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. - * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite - * wholly within the EOF and so there is nothing for us to do. Note that in this - * case the completion can be called in interrupt context, whereas if we have an - * ioend we will always be called in task context (i.e. from a workqueue). - */ -STATIC void -xfs_end_io_direct_write( - struct kiocb *iocb, - loff_t offset, - ssize_t size, - void *private) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct xfs_ioend *ioend = private; - - trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size, - ioend ? 
ioend->io_type : 0, NULL); + trace_xfs_end_io_direct_write_append(ip, offset, size); - if (!ioend) { - ASSERT(offset + size <= i_size_read(inode)); - return; + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); + if (error) { + xfs_trans_cancel(tp); + return error; + } + error = xfs_setfilesize(ip, tp, offset, size); } - __xfs_end_io_direct_write(inode, ioend, offset, size); + return error; } -static inline ssize_t -xfs_vm_do_dio( - struct inode *inode, +STATIC ssize_t +xfs_vm_direct_IO( struct kiocb *iocb, struct iov_iter *iter, - loff_t offset, - void (*endio)(struct kiocb *iocb, - loff_t offset, - ssize_t size, - void *private), - int flags) + loff_t offset) { + struct inode *inode = iocb->ki_filp->f_mapping->host; + dio_iodone_t *endio = NULL; + int flags = 0; struct block_device *bdev; - if (IS_DAX(inode)) + if (iov_iter_rw(iter) == WRITE) { + endio = xfs_end_io_direct_write; + flags = DIO_ASYNC_EXTEND; + } + + if (IS_DAX(inode)) { return dax_do_io(iocb, inode, iter, offset, xfs_get_blocks_direct, endio, 0); + } bdev = xfs_find_bdev_for_inode(inode); return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, - xfs_get_blocks_direct, endio, NULL, flags); -} - -STATIC ssize_t -xfs_vm_direct_IO( - struct kiocb *iocb, - struct iov_iter *iter, - loff_t offset) -{ - struct inode *inode = iocb->ki_filp->f_mapping->host; - - if (iov_iter_rw(iter) == WRITE) - return xfs_vm_do_dio(inode, iocb, iter, offset, - xfs_end_io_direct_write, DIO_ASYNC_EXTEND); - return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0); + xfs_get_blocks_direct, endio, NULL, flags); } /* @@ -1756,6 +1478,7 @@ xfs_vm_write_failed( loff_t from = pos & (PAGE_CACHE_SIZE - 1); loff_t to = from + len; struct buffer_head *bh, *head; + struct xfs_mount *mp = XFS_I(inode)->i_mount; /* * The request pos offset might be 32 or 64 bit, this is all fine @@ -1787,14 +1510,23 @@ xfs_vm_write_failed( if (block_start >= to) break; - if (!buffer_delay(bh)) + /* + * Process delalloc and unwritten buffers beyond EOF. We can + * encounter unwritten buffers in the event that a file has + * post-EOF unwritten extents and an extending write happens to + * fail (e.g., an unaligned write that also involves a delalloc + * to the same page). + */ + if (!buffer_delay(bh) && !buffer_unwritten(bh)) continue; - if (!buffer_new(bh) && block_offset < i_size_read(inode)) + if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) && + block_offset < i_size_read(inode)) continue; - xfs_vm_kill_delalloc_range(inode, block_offset, - block_offset + bh->b_size); + if (buffer_delay(bh)) + xfs_vm_kill_delalloc_range(inode, block_offset, + block_offset + bh->b_size); /* * This buffer does not contain data anymore. make sure anyone @@ -1805,6 +1537,7 @@ xfs_vm_write_failed( clear_buffer_mapped(bh); clear_buffer_new(bh); clear_buffer_dirty(bh); + clear_buffer_unwritten(bh); } } @@ -1828,6 +1561,7 @@ xfs_vm_write_begin( pgoff_t index = pos >> PAGE_CACHE_SHIFT; struct page *page; int status; + struct xfs_mount *mp = XFS_I(mapping->host)->i_mount; ASSERT(len <= PAGE_CACHE_SIZE); @@ -1836,6 +1570,8 @@ xfs_vm_write_begin( return -ENOMEM; status = __block_write_begin(page, pos, len, xfs_get_blocks); + if (xfs_mp_fail_writes(mp)) + status = -EIO; if (unlikely(status)) { struct inode *inode = mapping->host; size_t isize = i_size_read(inode); @@ -1848,6 +1584,8 @@ xfs_vm_write_begin( * allocated in this write, not blocks that were previously * written successfully. 
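The completion handler xfs_end_io_direct_write() above reduces to a small decision tree on those flags: no flags means an overwrite wholly within EOF and nothing to do; otherwise either convert unwritten extents or log the new on-disk size. A hedged standalone model, with printf standing in for the conversion and the size-update transaction:

#include <stdio.h>

#define DIO_FLAG_UNWRITTEN	(1 << 0)
#define DIO_FLAG_APPEND		(1 << 1)

static int end_io(unsigned int flags, long long offset, long long size)
{
	if (size <= 0)
		return (int)size;	/* nothing was written */
	if (flags == 0) {
		/* overwrite wholly within EOF: nothing to do */
		printf("overwrite within EOF, done\n");
		return 0;
	}
	if (flags & DIO_FLAG_UNWRITTEN)
		printf("convert unwritten extents at %lld, len %lld\n",
		       offset, size);
	else if (flags & DIO_FLAG_APPEND)
		printf("log new on-disk size %lld\n", offset + size);
	return 0;
}

int main(void)
{
	end_io(0, 0, 4096);
	end_io(DIO_FLAG_UNWRITTEN, 4096, 4096);
	end_io(DIO_FLAG_APPEND, 8192, 4096);
	return 0;
}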
*/ + if (xfs_mp_fail_writes(mp)) + isize = 0; if (pos + len > isize) { ssize_t start = max_t(ssize_t, pos, isize); diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index a4343c63fb38..b4421177b68d 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -24,12 +24,14 @@ extern mempool_t *xfs_ioend_pool; * Types of I/O for bmap clustering and I/O completion tracking. */ enum { + XFS_IO_INVALID, /* initial state */ XFS_IO_DELALLOC, /* covers delalloc region */ XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ XFS_IO_OVERWRITE, /* covers already allocated extent */ }; #define XFS_IO_TYPES \ + { XFS_IO_INVALID, "invalid" }, \ { XFS_IO_DELALLOC, "delalloc" }, \ { XFS_IO_UNWRITTEN, "unwritten" }, \ { XFS_IO_OVERWRITE, "overwrite" } @@ -39,7 +41,7 @@ enum { * It can manage several multi-page bio's at once. */ typedef struct xfs_ioend { - struct xfs_ioend *io_list; /* next ioend in chain */ + struct list_head io_list; /* next ioend in chain */ unsigned int io_type; /* delalloc / unwritten */ int io_error; /* I/O error code */ atomic_t io_remaining; /* hold count */ diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 0ef7c2ed3f8a..4fa14820e2e2 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -202,8 +202,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) sbp->namelen, sbp->valuelen, &sbp->name[sbp->namelen]); - if (error) + if (error) { + kmem_free(sbuf); return error; + } if (context->seen_enough) break; cursor->offset++; @@ -454,14 +456,13 @@ xfs_attr3_leaf_list_int( args.rmtblkcnt = xfs_attr3_rmt_blocks( args.dp->i_mount, valuelen); retval = xfs_attr_rmtval_get(&args); - if (retval) - return retval; - retval = context->put_listent(context, - entry->flags, - name_rmt->name, - (int)name_rmt->namelen, - valuelen, - args.value); + if (!retval) + retval = context->put_listent(context, + entry->flags, + name_rmt->name, + (int)name_rmt->namelen, + valuelen, + args.value); kmem_free(args.value); } else { retval = context->put_listent(context, diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 6c876012b2e5..a32c1dcae2ff 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -203,10 +203,12 @@ xfs_bmap_rtalloc( ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; /* - * Lock out other modifications to the RT bitmap inode. 
+ * Lock out modifications to both the RT bitmap and summary inodes */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); /* * If it's an allocation to an empty file at offset 0, @@ -822,7 +824,7 @@ bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) { /* prealloc/delalloc exists only on regular files */ - if (!S_ISREG(ip->i_d.di_mode)) + if (!S_ISREG(VFS_I(ip)->i_mode)) return false; /* @@ -1727,7 +1729,7 @@ xfs_swap_extents( xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); /* Verify that both files have the same format */ - if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { + if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) { error = -EINVAL; goto out_unlock; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 435c7de42e5f..9a2191b91137 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -650,7 +650,7 @@ xfs_buf_read_map( if (bp) { trace_xfs_buf_read(bp, flags, _RET_IP_); - if (!XFS_BUF_ISDONE(bp)) { + if (!(bp->b_flags & XBF_DONE)) { XFS_STATS_INC(target->bt_mount, xb_get_read); bp->b_ops = ops; _xfs_buf_read(bp, flags); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c75721acd867..4eb89bd4ee73 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -302,6 +302,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, /* Buffer Utility Routines */ extern void *xfs_buf_offset(struct xfs_buf *, size_t); +extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); @@ -312,31 +313,6 @@ extern int xfs_buf_delwri_submit_nowait(struct list_head *); extern int xfs_buf_init(void); extern void xfs_buf_terminate(void); -#define XFS_BUF_ZEROFLAGS(bp) \ - ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \ - XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \ - XBF_WRITE_FAIL)) - -void xfs_buf_stale(struct xfs_buf *bp); -#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) -#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) - -#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) -#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) -#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) - -#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) -#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) -#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) - -#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) -#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) -#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) - -#define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE) -#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) -#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) - /* * These macros use the IO block map rather than b_bn. b_bn is now really * just for the buffer cache index for cached buffers. 
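The xfs_aops.h hunk earlier in this patch turns xfs_ioend's io_list from a hand-rolled next pointer into an embedded struct list_head, which is what makes the list_add()/list_for_each_entry_safe() usage in the new writeback code possible. A self-contained model of that intrusive-list idiom — a minimal reimplementation for illustration, not the kernel's <linux/list.h>:

#include <stddef.h>
#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name)	{ &(name), &(name) }
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void list_add_tail(struct list_head *entry, struct list_head *head)
{
	entry->prev = head->prev;
	entry->next = head;
	head->prev->next = entry;
	head->prev = entry;
}

struct ioend_model {
	int			id;
	struct list_head	io_list; /* was: struct xfs_ioend *io_list */
};

int main(void)
{
	struct list_head submit_list = LIST_HEAD_INIT(submit_list);
	struct ioend_model a = { .id = 1 }, b = { .id = 2 };
	struct list_head *pos;

	list_add_tail(&a.io_list, &submit_list);
	list_add_tail(&b.io_list, &submit_list);

	for (pos = submit_list.next; pos != &submit_list; pos = pos->next)
		printf("submit ioend %d\n",
		       container_of(pos, struct ioend_model, io_list)->id);
	return 0;
}

Embedding the node in the ioend means a local list head on the stack (the submit_list above) can collect ioends with no allocation at all, which matters on the writeback path.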
As IO does not use b_bn diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 7e986da34f6c..99e91a0e554e 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -431,7 +431,7 @@ xfs_buf_item_unpin( if (freed && stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); ASSERT(xfs_buf_islocked(bp)); - ASSERT(XFS_BUF_ISSTALE(bp)); + ASSERT(bp->b_flags & XBF_STALE); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); trace_xfs_buf_item_unpin_stale(bip); @@ -493,7 +493,7 @@ xfs_buf_item_unpin( xfs_buf_hold(bp); bp->b_flags |= XBF_ASYNC; xfs_buf_ioerror(bp, -EIO); - XFS_BUF_UNDONE(bp); + bp->b_flags &= ~XBF_DONE; xfs_buf_stale(bp); xfs_buf_ioend(bp); } @@ -1067,7 +1067,7 @@ xfs_buf_iodone_callbacks( */ if (XFS_FORCED_SHUTDOWN(mp)) { xfs_buf_stale(bp); - XFS_BUF_DONE(bp); + bp->b_flags |= XBF_DONE; trace_xfs_buf_item_iodone(bp, _RET_IP_); goto do_callbacks; } @@ -1090,7 +1090,7 @@ xfs_buf_iodone_callbacks( * errors tend to affect the whole device and a failing log write * will make us give up. But we really ought to do better here. */ - if (XFS_BUF_ISASYNC(bp)) { + if (bp->b_flags & XBF_ASYNC) { ASSERT(bp->b_iodone != NULL); trace_xfs_buf_item_iodone_async(bp, _RET_IP_); @@ -1113,7 +1113,7 @@ xfs_buf_iodone_callbacks( * sure to return the error to the caller of xfs_bwrite(). */ xfs_buf_stale(bp); - XFS_BUF_DONE(bp); + bp->b_flags |= XBF_DONE; trace_xfs_buf_error_relse(bp, _RET_IP_); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 642d55d10075..93b3ab0c5435 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -665,7 +665,7 @@ xfs_readdir( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - ASSERT(S_ISDIR(dp->i_d.di_mode)); + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_getdents); args.dp = dp; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index e85a9519a5ae..272c3f8b6f7d 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -227,7 +227,7 @@ xfs_discard_extents( GFP_NOFS, 0); if (error && error != -EOPNOTSUPP) { xfs_info(mp, - "discard failed for extent [0x%llu,%u], error %d", + "discard failed for extent [0x%llx,%u], error %d", (unsigned long long)busyp->bno, busyp->length, error); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 9c44d38dcd1f..316b2a1bdba5 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -92,26 +92,28 @@ xfs_qm_adjust_dqlimits( { struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_disk_dquot *d = &dq->q_core; + struct xfs_def_quota *defq; int prealloc = 0; ASSERT(d->d_id); + defq = xfs_get_defquota(dq, q); - if (q->qi_bsoftlimit && !d->d_blk_softlimit) { - d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit); + if (defq->bsoftlimit && !d->d_blk_softlimit) { + d->d_blk_softlimit = cpu_to_be64(defq->bsoftlimit); prealloc = 1; } - if (q->qi_bhardlimit && !d->d_blk_hardlimit) { - d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit); + if (defq->bhardlimit && !d->d_blk_hardlimit) { + d->d_blk_hardlimit = cpu_to_be64(defq->bhardlimit); prealloc = 1; } - if (q->qi_isoftlimit && !d->d_ino_softlimit) - d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit); - if (q->qi_ihardlimit && !d->d_ino_hardlimit) - d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit); - if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit) - d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit); - if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit) - d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit); + if (defq->isoftlimit && !d->d_ino_softlimit) + d->d_ino_softlimit = cpu_to_be64(defq->isoftlimit); + if 
(defq->ihardlimit && !d->d_ino_hardlimit) + d->d_ino_hardlimit = cpu_to_be64(defq->ihardlimit); + if (defq->rtbsoftlimit && !d->d_rtb_softlimit) + d->d_rtb_softlimit = cpu_to_be64(defq->rtbsoftlimit); + if (defq->rtbhardlimit && !d->d_rtb_hardlimit) + d->d_rtb_hardlimit = cpu_to_be64(defq->rtbhardlimit); if (prealloc) xfs_dquot_set_prealloc_limits(dq); @@ -232,7 +234,8 @@ xfs_qm_init_dquot_blk( { struct xfs_quotainfo *q = mp->m_quotainfo; xfs_dqblk_t *d; - int curid, i; + xfs_dqid_t curid; + int i; ASSERT(tp); ASSERT(xfs_buf_islocked(bp)); @@ -243,7 +246,6 @@ xfs_qm_init_dquot_blk( * ID of the first dquot in the block - id's are zero based. */ curid = id - (id % q->qi_dqperchunk); - ASSERT(curid >= 0); memset(d, 0, BBTOB(q->qi_dqchunklen)); for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) { d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); @@ -464,12 +466,13 @@ xfs_qm_dqtobp( struct xfs_bmbt_irec map; int nmaps = 1, error; struct xfs_buf *bp; - struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp); + struct xfs_inode *quotip; struct xfs_mount *mp = dqp->q_mount; xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); struct xfs_trans *tp = (tpp ? *tpp : NULL); uint lock_mode; + quotip = xfs_quota_inode(dqp->q_mount, dqp->dq_flags); dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; lock_mode = xfs_ilock_data_map_shared(quotip); @@ -685,6 +688,56 @@ error0: } /* + * Advance to the next id in the current chunk, or if at the + * end of the chunk, skip ahead to first id in next allocated chunk + * using the SEEK_DATA interface. + */ +int +xfs_dq_get_next_id( + xfs_mount_t *mp, + uint type, + xfs_dqid_t *id, + loff_t eof) +{ + struct xfs_inode *quotip; + xfs_fsblock_t start; + loff_t offset; + uint lock; + xfs_dqid_t next_id; + int error = 0; + + /* Simple advance */ + next_id = *id + 1; + + /* If new ID is within the current chunk, advancing it sufficed */ + if (next_id % mp->m_quotainfo->qi_dqperchunk) { + *id = next_id; + return 0; + } + + /* Nope, next_id is now past the current chunk, so find the next one */ + start = (xfs_fsblock_t)next_id / mp->m_quotainfo->qi_dqperchunk; + + quotip = xfs_quota_inode(mp, type); + lock = xfs_ilock_data_map_shared(quotip); + + offset = __xfs_seek_hole_data(VFS_I(quotip), XFS_FSB_TO_B(mp, start), + eof, SEEK_DATA); + if (offset < 0) + error = offset; + + xfs_iunlock(quotip, lock); + + /* -ENXIO is essentially "no more data" */ + if (error) + return (error == -ENXIO ? -ENOENT: error); + + /* Convert next data offset back to a quota id */ + *id = XFS_B_TO_FSB(mp, offset) * mp->m_quotainfo->qi_dqperchunk; + return 0; +} + +/* * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a * a locked dquot, doing an allocation (if requested) as needed. * When both an inode and an id are given, the inode's id takes precedence. 
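xfs_dq_get_next_id() above leans on the fact that quota ids map linearly onto chunks of qi_dqperchunk dquots per filesystem block: within a chunk the id is simply incremented, and at a chunk boundary SEEK_DATA finds the next allocated block, whose number converts straight back to the first id it holds. The arithmetic in isolation — the per-chunk count here is an example value, not the real derivation from block and dquot sizes:

#include <stdio.h>

#define DQPERCHUNK	30U	/* dquots per allocated chunk; example value */

int main(void)
{
	unsigned int id = 29;
	unsigned int next = id + 1;

	if (next % DQPERCHUNK) {
		/* still inside the current chunk */
		printf("advance in place to id %u\n", next);
	} else {
		/* chunk boundary: SEEK_DATA from this file block, then
		 * map the found block number straight back to an id */
		unsigned long long fsb = next / DQPERCHUNK;
		printf("seek data from block %llu; id there = block * %u\n",
		       fsb, DQPERCHUNK);
	}
	return 0;
}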
@@ -704,6 +757,7 @@ xfs_qm_dqget( struct xfs_quotainfo *qi = mp->m_quotainfo; struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; + loff_t eof = 0; int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); @@ -731,6 +785,21 @@ xfs_qm_dqget( } #endif + /* Get the end of the quota file if we need it */ + if (flags & XFS_QMOPT_DQNEXT) { + struct xfs_inode *quotip; + xfs_fileoff_t last; + uint lock_mode; + + quotip = xfs_quota_inode(mp, type); + lock_mode = xfs_ilock_data_map_shared(quotip); + error = xfs_bmap_last_offset(quotip, &last, XFS_DATA_FORK); + xfs_iunlock(quotip, lock_mode); + if (error) + return error; + eof = XFS_FSB_TO_B(mp, last); + } + restart: mutex_lock(&qi->qi_tree_lock); dqp = radix_tree_lookup(tree, id); @@ -744,6 +813,18 @@ restart: goto restart; } + /* uninit / unused quota found in radix tree, keep looking */ + if (flags & XFS_QMOPT_DQNEXT) { + if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { + xfs_dqunlock(dqp); + mutex_unlock(&qi->qi_tree_lock); + error = xfs_dq_get_next_id(mp, type, &id, eof); + if (error) + return error; + goto restart; + } + } + dqp->q_nrefs++; mutex_unlock(&qi->qi_tree_lock); @@ -770,6 +851,13 @@ restart: if (ip) xfs_ilock(ip, XFS_ILOCK_EXCL); + /* If we are asked to find next active id, keep looking */ + if (error == -ENOENT && (flags & XFS_QMOPT_DQNEXT)) { + error = xfs_dq_get_next_id(mp, type, &id, eof); + if (!error) + goto restart; + } + if (error) return error; @@ -820,6 +908,17 @@ restart: qi->qi_dquots++; mutex_unlock(&qi->qi_tree_lock); + /* If we are asked to find next active id, keep looking */ + if (flags & XFS_QMOPT_DQNEXT) { + if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { + xfs_qm_dqput(dqp); + error = xfs_dq_get_next_id(mp, type, &id, eof); + if (error) + return error; + goto restart; + } + } + dqret: ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); trace_xfs_dqget_miss(dqp); diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 652cd3c5b58c..2816d42507bc 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -152,7 +152,7 @@ xfs_nfs_get_inode( return ERR_PTR(error); } - if (ip->i_d.di_gen != generation) { + if (VFS_I(ip)->i_generation != generation) { IRELE(ip); return ERR_PTR(-ESTALE); } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 52883ac3cf84..ac0fd32de31e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -156,9 +156,9 @@ xfs_update_prealloc_flags( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); if (!(flags & XFS_PREALLOC_INVISIBLE)) { - ip->i_d.di_mode &= ~S_ISUID; - if (ip->i_d.di_mode & S_IXGRP) - ip->i_d.di_mode &= ~S_ISGID; + VFS_I(ip)->i_mode &= ~S_ISUID; + if (VFS_I(ip)->i_mode & S_IXGRP) + VFS_I(ip)->i_mode &= ~S_ISGID; xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); } @@ -1337,31 +1337,31 @@ out: return found; } -STATIC loff_t -xfs_seek_hole_data( - struct file *file, +/* + * caller must lock inode with xfs_ilock_data_map_shared, + * can we craft an appropriate ASSERT? + * + * end is because the VFS-level lseek interface is defined such that any + * offset past i_size shall return -ENXIO, but we use this for quota code + * which does not maintain i_size, and we want to SEEK_DATA past i_size. 
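The XFS_QMOPT_DQNEXT handling above is a restart loop: wherever the lookup lands on an uninitialized dquot, the id is advanced via xfs_dq_get_next_id() and the whole lookup restarts, returning -ENOENT only when the id space is exhausted. A toy standalone version of that loop, with an array standing in for the radix tree:

#include <errno.h>
#include <stdio.h>

/* Which ids hold initialized quotas; stands in for the radix tree */
static const int used[] = { 0, 0, 1, 0, 1 };
#define NIDS	(sizeof(used) / sizeof(used[0]))

static int dqget_next(unsigned int *id)
{
restart:
	if (*id >= NIDS)
		return -ENOENT;		/* id space exhausted */
	if (!used[*id]) {		/* uninitialized: keep looking */
		(*id)++;
		goto restart;
	}
	return 0;			/* found an active dquot */
}

int main(void)
{
	unsigned int id = 0;

	while (!dqget_next(&id))
		printf("active quota id %u\n", id++);
	return 0;
}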
+ */ +loff_t +__xfs_seek_hole_data( + struct inode *inode, loff_t start, + loff_t end, int whence) { - struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; loff_t uninitialized_var(offset); - xfs_fsize_t isize; xfs_fileoff_t fsbno; - xfs_filblks_t end; - uint lock; + xfs_filblks_t lastbno; int error; - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - lock = xfs_ilock_data_map_shared(ip); - - isize = i_size_read(inode); - if (start >= isize) { + if (start >= end) { error = -ENXIO; - goto out_unlock; + goto out_error; } /* @@ -1369,22 +1369,22 @@ xfs_seek_hole_data( * by fsbno to the end block of the file. */ fsbno = XFS_B_TO_FSBT(mp, start); - end = XFS_B_TO_FSB(mp, isize); + lastbno = XFS_B_TO_FSB(mp, end); for (;;) { struct xfs_bmbt_irec map[2]; int nmap = 2; unsigned int i; - error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, + error = xfs_bmapi_read(ip, fsbno, lastbno - fsbno, map, &nmap, XFS_BMAPI_ENTIRE); if (error) - goto out_unlock; + goto out_error; /* No extents at given offset, must be beyond EOF */ if (nmap == 0) { error = -ENXIO; - goto out_unlock; + goto out_error; } for (i = 0; i < nmap; i++) { @@ -1426,7 +1426,7 @@ xfs_seek_hole_data( * hole at the end of any file). */ if (whence == SEEK_HOLE) { - offset = isize; + offset = end; break; } /* @@ -1434,7 +1434,7 @@ xfs_seek_hole_data( */ ASSERT(whence == SEEK_DATA); error = -ENXIO; - goto out_unlock; + goto out_error; } ASSERT(i > 1); @@ -1445,14 +1445,14 @@ xfs_seek_hole_data( */ fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; start = XFS_FSB_TO_B(mp, fsbno); - if (start >= isize) { + if (start >= end) { if (whence == SEEK_HOLE) { - offset = isize; + offset = end; break; } ASSERT(whence == SEEK_DATA); error = -ENXIO; - goto out_unlock; + goto out_error; } } @@ -1464,7 +1464,39 @@ out: * situation in particular. */ if (whence == SEEK_HOLE) - offset = min_t(loff_t, offset, isize); + offset = min_t(loff_t, offset, end); + + return offset; + +out_error: + return error; +} + +STATIC loff_t +xfs_seek_hole_data( + struct file *file, + loff_t start, + int whence) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + uint lock; + loff_t offset, end; + int error = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + lock = xfs_ilock_data_map_shared(ip); + + end = i_size_read(inode); + offset = __xfs_seek_hole_data(inode, start, end, whence); + if (offset < 0) { + error = offset; + goto out_unlock; + } + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out_unlock: diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index c4c130f9bfb6..a51353a1f87f 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -151,7 +151,7 @@ xfs_filestream_pick_ag( xfs_agnumber_t ag, max_ag = NULLAGNUMBER; int err, trylock, nscan; - ASSERT(S_ISDIR(ip->i_d.di_mode)); + ASSERT(S_ISDIR(VFS_I(ip)->i_mode)); /* 2% of an AG's blocks must be free for it to be chosen. 
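The refactored __xfs_seek_hole_data() above also adopts the common kernel convention of folding success and failure into one signed return: a non-negative loff_t is an offset, a negative value is an errno, which is exactly how the new xfs_seek_hole_data() wrapper and the quota caller test it. A tiny standalone illustration of that convention:

#include <errno.h>
#include <stdio.h>

/* Non-negative return: an offset. Negative return: an errno. */
static long long seek_data(long long start, long long end)
{
	if (start >= end)
		return -ENXIO;	/* nothing left inside the search limit */
	return start;		/* pretend data begins right at start */
}

int main(void)
{
	long long off = seek_data(4096, 1024);

	if (off < 0)
		printf("error %lld\n", off);
	else
		printf("data at %lld\n", off);
	return 0;
}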
*/ minfree = mp->m_sb.sb_agblocks / 50; @@ -319,7 +319,7 @@ xfs_filestream_lookup_ag( xfs_agnumber_t startag, ag = NULLAGNUMBER; struct xfs_mru_cache_elem *mru; - ASSERT(S_ISREG(ip->i_d.di_mode)); + ASSERT(S_ISREG(VFS_I(ip)->i_mode)); pip = xfs_filestream_get_parent(ip); if (!pip) diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 1b6a98b66886..f32713f14f9a 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -25,6 +25,5 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); -extern int xfs_fs_log_dummy(struct xfs_mount *mp); #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index d7a490f24ead..bf2d60749278 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -63,6 +63,9 @@ xfs_inode_alloc( return NULL; } + /* VFS doesn't initialise i_mode! */ + VFS_I(ip)->i_mode = 0; + XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); @@ -79,7 +82,7 @@ xfs_inode_alloc( memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); ip->i_flags = 0; ip->i_delayed_blks = 0; - memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); + memset(&ip->i_d, 0, sizeof(ip->i_d)); return ip; } @@ -98,7 +101,7 @@ void xfs_inode_free( struct xfs_inode *ip) { - switch (ip->i_d.di_mode & S_IFMT) { + switch (VFS_I(ip)->i_mode & S_IFMT) { case S_IFREG: case S_IFDIR: case S_IFLNK: @@ -135,6 +138,34 @@ xfs_inode_free( } /* + * When we recycle a reclaimable inode, we need to re-initialise the VFS inode + * part of the structure. This is made more complex by the fact we store + * information about the on-disk values in the VFS inode and so we can't just + * overwrite the values unconditionally. Hence we save the parameters we + * need to retain across reinitialisation, and rewrite them into the VFS inode + * after reinitialisation even if it fails. + */ +static int +xfs_reinit_inode( + struct xfs_mount *mp, + struct inode *inode) +{ + int error; + uint32_t nlink = inode->i_nlink; + uint32_t generation = inode->i_generation; + uint64_t version = inode->i_version; + umode_t mode = inode->i_mode; + + error = inode_init_always(mp->m_super, inode); + + set_nlink(inode, nlink); + inode->i_generation = generation; + inode->i_version = version; + inode->i_mode = mode; + return error; +} + +/* * Check the validity of the inode we just found it the cache */ static int @@ -185,7 +216,7 @@ xfs_iget_cache_hit( /* * If lookup is racing with unlink return an error immediately. */ - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { + if (VFS_I(ip)->i_mode == 0 && !(flags & XFS_IGET_CREATE)) { error = -ENOENT; goto out_error; } @@ -208,7 +239,7 @@ xfs_iget_cache_hit( spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); - error = inode_init_always(mp->m_super, inode); + error = xfs_reinit_inode(mp, inode); if (error) { /* * Re-initializing the inode failed, and we are in deep @@ -295,7 +326,7 @@ xfs_iget_cache_miss( trace_xfs_iget_miss(ip); - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { + if ((VFS_I(ip)->i_mode == 0) && !(flags & XFS_IGET_CREATE)) { error = -ENOENT; goto out_destroy; } @@ -444,7 +475,7 @@ again: * If we have a real type for an on-disk inode, we can setup the inode * now. If it's a new inode being created, xfs_ialloc will handle it. 
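xfs_reinit_inode() above is a straightforward save-and-restore wrapper: the four fields that must survive inode_init_always() are captured first and written back unconditionally, even on failure. The same pattern in standalone form, with a simplified inode and memset() playing the role of reinitialisation:

#include <stdio.h>
#include <string.h>

/* Simplified stand-in for the VFS inode */
struct inode_model {
	unsigned int		nlink, generation, mode;
	unsigned long long	version;
	int			scratch;	/* state reinit should wipe */
};

/* Stands in for inode_init_always(): wipes the whole structure */
static int init_always(struct inode_model *inode)
{
	memset(inode, 0, sizeof(*inode));
	return 0;
}

static int reinit_inode(struct inode_model *inode)
{
	unsigned int nlink = inode->nlink;
	unsigned int generation = inode->generation;
	unsigned long long version = inode->version;
	unsigned int mode = inode->mode;
	int error = init_always(inode);

	/* written back even if init_always() failed, as in the kernel */
	inode->nlink = nlink;
	inode->generation = generation;
	inode->version = version;
	inode->mode = mode;
	return error;
}

int main(void)
{
	struct inode_model i = { .nlink = 2, .generation = 7,
				 .mode = 0100644, .version = 9, .scratch = 1 };

	reinit_inode(&i);
	printf("nlink=%u gen=%u mode=%o ver=%llu scratch=%d\n",
	       i.nlink, i.generation, i.mode, i.version, i.scratch);
	return 0;
}

The restore has to be unconditional because these fields now hold on-disk state that the VFS-level reinitialisation knows nothing about, which is the complication the comment above calls out.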
*/ - if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) + if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0) xfs_setup_existing_inode(ip); return 0; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ceba1a83cacc..96f606deee31 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -57,9 +57,9 @@ kmem_zone_t *xfs_inode_zone; */ #define XFS_ITRUNC_MAX_EXTENTS 2 -STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); - -STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *); +STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *); +STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); +STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *); /* * helper function to extract extent size hint from inode @@ -766,6 +766,7 @@ xfs_ialloc( uint flags; int error; struct timespec tv; + struct inode *inode; /* * Call the space management code to pick @@ -791,6 +792,7 @@ xfs_ialloc( if (error) return error; ASSERT(ip != NULL); + inode = VFS_I(ip); /* * We always convert v1 inodes to v2 now - we only support filesystems @@ -800,20 +802,16 @@ xfs_ialloc( if (ip->i_d.di_version == 1) ip->i_d.di_version = 2; - ip->i_d.di_mode = mode; - ip->i_d.di_onlink = 0; - ip->i_d.di_nlink = nlink; - ASSERT(ip->i_d.di_nlink == nlink); + inode->i_mode = mode; + set_nlink(inode, nlink); ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); xfs_set_projid(ip, prid); - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); if (pip && XFS_INHERIT_GID(pip)) { ip->i_d.di_gid = pip->i_d.di_gid; - if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) { - ip->i_d.di_mode |= S_ISGID; - } + if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode)) + inode->i_mode |= S_ISGID; } /* @@ -822,38 +820,29 @@ xfs_ialloc( * (and only if the irix_sgid_inherit compatibility variable is set). */ if ((irix_sgid_inherit) && - (ip->i_d.di_mode & S_ISGID) && - (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) { - ip->i_d.di_mode &= ~S_ISGID; - } + (inode->i_mode & S_ISGID) && + (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) + inode->i_mode &= ~S_ISGID; ip->i_d.di_size = 0; ip->i_d.di_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); tv = current_fs_time(mp->m_super); - ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; - ip->i_d.di_atime = ip->i_d.di_mtime; - ip->i_d.di_ctime = ip->i_d.di_mtime; + inode->i_mtime = tv; + inode->i_atime = tv; + inode->i_ctime = tv; - /* - * di_gen will have been taken care of in xfs_iread. - */ ip->i_d.di_extsize = 0; ip->i_d.di_dmevmask = 0; ip->i_d.di_dmstate = 0; ip->i_d.di_flags = 0; if (ip->i_d.di_version == 3) { - ASSERT(ip->i_d.di_ino == ino); - ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid)); - ip->i_d.di_crc = 0; - ip->i_d.di_changecount = 1; - ip->i_d.di_lsn = 0; + inode->i_version = 1; ip->i_d.di_flags2 = 0; - memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2)); - ip->i_d.di_crtime = ip->i_d.di_mtime; + ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec; + ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec; } @@ -1092,35 +1081,24 @@ xfs_dir_ialloc( } /* - * Decrement the link count on an inode & log the change. - * If this causes the link count to go to zero, initiate the - * logging activity required to truncate a file. + * Decrement the link count on an inode & log the change. 
If this causes the + * link count to go to zero, move the inode to AGI unlinked list so that it can + * be freed when the last active reference goes away via xfs_inactive(). */ int /* error */ xfs_droplink( xfs_trans_t *tp, xfs_inode_t *ip) { - int error; - xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - ASSERT (ip->i_d.di_nlink > 0); - ip->i_d.di_nlink--; drop_nlink(VFS_I(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = 0; - if (ip->i_d.di_nlink == 0) { - /* - * We're dropping the last link to this file. - * Move the on-disk inode to the AGI unlinked list. - * From xfs_inactive() we will pull the inode from - * the list and free it. - */ - error = xfs_iunlink(tp, ip); - } - return error; + if (VFS_I(ip)->i_nlink) + return 0; + + return xfs_iunlink(tp, ip); } /* @@ -1134,8 +1112,6 @@ xfs_bumplink( xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); ASSERT(ip->i_d.di_version > 1); - ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE)); - ip->i_d.di_nlink++; inc_nlink(VFS_I(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); return 0; @@ -1393,7 +1369,6 @@ xfs_create_tmpfile( */ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); - ip->i_d.di_nlink--; error = xfs_iunlink(tp, ip); if (error) goto out_trans_cancel; @@ -1444,7 +1419,7 @@ xfs_link( trace_xfs_link(tdp, target_name); - ASSERT(!S_ISDIR(sip->i_d.di_mode)); + ASSERT(!S_ISDIR(VFS_I(sip)->i_mode)); if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1492,7 +1467,10 @@ xfs_link( xfs_bmap_init(&free_list, &first_block); - if (sip->i_d.di_nlink == 0) { + /* + * Handle initial link state of O_TMPFILE inode + */ + if (VFS_I(sip)->i_nlink == 0) { error = xfs_iunlink_remove(tp, sip); if (error) goto error_return; @@ -1648,7 +1626,7 @@ xfs_release( xfs_mount_t *mp = ip->i_mount; int error; - if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0)) + if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0)) return 0; /* If this is a read-only mount, don't do this (would generate I/O) */ @@ -1679,7 +1657,7 @@ xfs_release( } } - if (ip->i_d.di_nlink == 0) + if (VFS_I(ip)->i_nlink == 0) return 0; if (xfs_can_free_eofblocks(ip, false)) { @@ -1883,7 +1861,7 @@ xfs_inactive( * If the inode is already free, then there can be nothing * to clean up here. */ - if (ip->i_d.di_mode == 0) { + if (VFS_I(ip)->i_mode == 0) { ASSERT(ip->i_df.if_real_bytes == 0); ASSERT(ip->i_df.if_broot_bytes == 0); return; @@ -1895,7 +1873,7 @@ xfs_inactive( if (mp->m_flags & XFS_MOUNT_RDONLY) return; - if (ip->i_d.di_nlink != 0) { + if (VFS_I(ip)->i_nlink != 0) { /* * force is true because we are evicting an inode from the * cache. Post-eof blocks must be freed, lest we end up with @@ -1907,7 +1885,7 @@ xfs_inactive( return; } - if (S_ISREG(ip->i_d.di_mode) && + if (S_ISREG(VFS_I(ip)->i_mode) && (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) truncate = 1; @@ -1916,7 +1894,7 @@ xfs_inactive( if (error) return; - if (S_ISLNK(ip->i_d.di_mode)) + if (S_ISLNK(VFS_I(ip)->i_mode)) error = xfs_inactive_symlink(ip); else if (truncate) error = xfs_inactive_truncate(ip); @@ -1952,16 +1930,21 @@ xfs_inactive( } /* - * This is called when the inode's link count goes to 0. - * We place the on-disk inode on a list in the AGI. It - * will be pulled from this list when the inode is freed. + * This is called when the inode's link count goes to 0 or we are creating a + * tmpfile via O_TMPFILE. 
In the case of a tmpfile, @ignore_linkcount will be + * set to true as the link count is dropped to zero by the VFS after we've + * created the file successfully, so we have to add it to the unlinked list + * while the link count is non-zero. + * + * We place the on-disk inode on a list in the AGI. It will be pulled from this + * list when the inode is freed. */ -int +STATIC int xfs_iunlink( - xfs_trans_t *tp, - xfs_inode_t *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - xfs_mount_t *mp; + xfs_mount_t *mp = tp->t_mountp; xfs_agi_t *agi; xfs_dinode_t *dip; xfs_buf_t *agibp; @@ -1971,10 +1954,7 @@ xfs_iunlink( int offset; int error; - ASSERT(ip->i_d.di_nlink == 0); - ASSERT(ip->i_d.di_mode != 0); - - mp = tp->t_mountp; + ASSERT(VFS_I(ip)->i_mode != 0); /* * Get the agi buffer first. It ensures lock ordering @@ -2412,10 +2392,10 @@ xfs_ifree( struct xfs_icluster xic = { 0 }; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(ip->i_d.di_nlink == 0); + ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(ip->i_d.di_nextents == 0); ASSERT(ip->i_d.di_anextents == 0); - ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode)); + ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); ASSERT(ip->i_d.di_nblocks == 0); /* @@ -2429,7 +2409,7 @@ xfs_ifree( if (error) return error; - ip->i_d.di_mode = 0; /* mark incore inode as free */ + VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ @@ -2439,7 +2419,7 @@ xfs_ifree( * Bump the generation count so no one will be confused * by reincarnations of this inode. */ - ip->i_d.di_gen++; + VFS_I(ip)->i_generation++; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (xic.deleted) @@ -2526,7 +2506,7 @@ xfs_remove( { xfs_mount_t *mp = dp->i_mount; xfs_trans_t *tp = NULL; - int is_dir = S_ISDIR(ip->i_d.di_mode); + int is_dir = S_ISDIR(VFS_I(ip)->i_mode); int error = 0; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; @@ -2580,8 +2560,8 @@ xfs_remove( * If we're removing a directory perform some additional validation. */ if (is_dir) { - ASSERT(ip->i_d.di_nlink >= 2); - if (ip->i_d.di_nlink != 2) { + ASSERT(VFS_I(ip)->i_nlink >= 2); + if (VFS_I(ip)->i_nlink != 2) { error = -ENOTEMPTY; goto out_trans_cancel; } @@ -2771,7 +2751,7 @@ xfs_cross_rename( if (dp1 != dp2) { dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; - if (S_ISDIR(ip2->i_d.di_mode)) { + if (S_ISDIR(VFS_I(ip2)->i_mode)) { error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, dp1->i_ino, first_block, free_list, spaceres); @@ -2779,7 +2759,7 @@ xfs_cross_rename( goto out_trans_abort; /* transfer ip2 ".." reference to dp1 */ - if (!S_ISDIR(ip1->i_d.di_mode)) { + if (!S_ISDIR(VFS_I(ip1)->i_mode)) { error = xfs_droplink(tp, dp2); if (error) goto out_trans_abort; @@ -2798,7 +2778,7 @@ xfs_cross_rename( ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; } - if (S_ISDIR(ip1->i_d.di_mode)) { + if (S_ISDIR(VFS_I(ip1)->i_mode)) { error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, dp2->i_ino, first_block, free_list, spaceres); @@ -2806,7 +2786,7 @@ xfs_cross_rename( goto out_trans_abort; /* transfer ip1 ".." 
reference to dp2 */ - if (!S_ISDIR(ip2->i_d.di_mode)) { + if (!S_ISDIR(VFS_I(ip2)->i_mode)) { error = xfs_droplink(tp, dp1); if (error) goto out_trans_abort; @@ -2903,7 +2883,7 @@ xfs_rename( struct xfs_inode *inodes[__XFS_SORT_INODES]; int num_inodes = __XFS_SORT_INODES; bool new_parent = (src_dp != target_dp); - bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode); + bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); int spaceres; int error; @@ -3032,12 +3012,12 @@ xfs_rename( * target and source are directories and that target can be * destroyed, or that neither is a directory. */ - if (S_ISDIR(target_ip->i_d.di_mode)) { + if (S_ISDIR(VFS_I(target_ip)->i_mode)) { /* * Make sure target dir is empty. */ if (!(xfs_dir_isempty(target_ip)) || - (target_ip->i_d.di_nlink > 2)) { + (VFS_I(target_ip)->i_nlink > 2)) { error = -EEXIST; goto out_trans_cancel; } @@ -3144,7 +3124,7 @@ xfs_rename( * intermediate state on disk. */ if (wip) { - ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0); + ASSERT(VFS_I(wip)->i_nlink == 0); error = xfs_bumplink(tp, wip); if (error) goto out_bmap_cancel; @@ -3313,7 +3293,7 @@ cluster_corrupt_out: * mark it as stale and brelse. */ if (bp->b_iodone) { - XFS_BUF_UNDONE(bp); + bp->b_flags &= ~XBF_DONE; xfs_buf_stale(bp); xfs_buf_ioerror(bp, -EIO); xfs_buf_ioend(bp); @@ -3462,14 +3442,7 @@ xfs_iflush_int( __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); goto corrupt_out; } - if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, - mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { - xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x", - __func__, ip->i_ino, ip, ip->i_d.di_magic); - goto corrupt_out; - } - if (S_ISREG(ip->i_d.di_mode)) { + if (S_ISREG(VFS_I(ip)->i_mode)) { if (XFS_TEST_ERROR( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), @@ -3479,7 +3452,7 @@ xfs_iflush_int( __func__, ip->i_ino, ip); goto corrupt_out; } - } else if (S_ISDIR(ip->i_d.di_mode)) { + } else if (S_ISDIR(VFS_I(ip)->i_mode)) { if (XFS_TEST_ERROR( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && @@ -3523,12 +3496,11 @@ xfs_iflush_int( ip->i_d.di_flushiter++; /* - * Copy the dirty parts of the inode into the on-disk - * inode. We always copy out the core of the inode, - * because if the inode is dirty at all the core must - * be. + * Copy the dirty parts of the inode into the on-disk inode. We always + * copy out the core of the inode, because if the inode is dirty at all + * the core must be. */ - xfs_dinode_to_disk(dip, &ip->i_d); + xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn); /* Wrap, we never let the log put out DI_MAX_FLUSH */ if (ip->i_d.di_flushiter == DI_MAX_FLUSH) @@ -3580,10 +3552,6 @@ xfs_iflush_int( */ xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); - /* update the lsn in the on disk inode if required */ - if (ip->i_d.di_version == 3) - dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn); - /* generate the checksum. 
*/ xfs_dinode_calc_crc(mp, dip); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index ca9e11989cbd..43e1d51b15eb 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -63,7 +63,7 @@ typedef struct xfs_inode { unsigned long i_flags; /* see defined flags below */ unsigned int i_delayed_blks; /* count of delay alloc blks */ - xfs_icdinode_t i_d; /* most of ondisk inode */ + struct xfs_icdinode i_d; /* most of ondisk inode */ /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ @@ -88,7 +88,7 @@ static inline struct inode *VFS_I(struct xfs_inode *ip) */ static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip) { - if (S_ISREG(ip->i_d.di_mode)) + if (S_ISREG(VFS_I(ip)->i_mode)) return i_size_read(VFS_I(ip)); return ip->i_d.di_size; } @@ -369,7 +369,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) */ #define XFS_INHERIT_GID(pip) \ (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ - ((pip)->i_d.di_mode & S_ISGID)) + (VFS_I(pip)->i_mode & S_ISGID)) int xfs_release(struct xfs_inode *ip); void xfs_inactive(struct xfs_inode *ip); @@ -405,8 +405,6 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_bmap_free *); int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t); -int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); - void xfs_iext_realloc(xfs_inode_t *, int, int); void xfs_iunpin_wait(xfs_inode_t *); @@ -437,6 +435,8 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip, int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, xfs_fsize_t isize, bool *did_zeroing); int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); +loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start, + loff_t eof, int whence); /* from xfs_iops.c */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index d14b12b8cfef..c48b5b18d771 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -135,7 +135,7 @@ xfs_inode_item_size( *nvecs += 2; *nbytes += sizeof(struct xfs_inode_log_format) + - xfs_icdinode_size(ip->i_d.di_version); + xfs_log_dinode_size(ip->i_d.di_version); xfs_inode_item_data_fork_size(iip, nvecs, nbytes); if (XFS_IFORK_Q(ip)) @@ -322,6 +322,81 @@ xfs_inode_item_format_attr_fork( } } +static void +xfs_inode_to_log_dinode( + struct xfs_inode *ip, + struct xfs_log_dinode *to, + xfs_lsn_t lsn) +{ + struct xfs_icdinode *from = &ip->i_d; + struct inode *inode = VFS_I(ip); + + to->di_magic = XFS_DINODE_MAGIC; + + to->di_version = from->di_version; + to->di_format = from->di_format; + to->di_uid = from->di_uid; + to->di_gid = from->di_gid; + to->di_projid_lo = from->di_projid_lo; + to->di_projid_hi = from->di_projid_hi; + + memset(to->di_pad, 0, sizeof(to->di_pad)); + memset(to->di_pad3, 0, sizeof(to->di_pad3)); + to->di_atime.t_sec = inode->i_atime.tv_sec; + to->di_atime.t_nsec = inode->i_atime.tv_nsec; + to->di_mtime.t_sec = inode->i_mtime.tv_sec; + to->di_mtime.t_nsec = inode->i_mtime.tv_nsec; + to->di_ctime.t_sec = inode->i_ctime.tv_sec; + to->di_ctime.t_nsec = inode->i_ctime.tv_nsec; + to->di_nlink = inode->i_nlink; + to->di_gen = inode->i_generation; + to->di_mode = inode->i_mode; + + to->di_size = from->di_size; + to->di_nblocks = from->di_nblocks; + to->di_extsize = from->di_extsize; + to->di_nextents = from->di_nextents; + to->di_anextents = from->di_anextents; + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = from->di_dmevmask; + to->di_dmstate = from->di_dmstate; + to->di_flags = from->di_flags; + + if (from->di_version == 3) { + to->di_changecount = 
inode->i_version; + to->di_crtime.t_sec = from->di_crtime.t_sec; + to->di_crtime.t_nsec = from->di_crtime.t_nsec; + to->di_flags2 = from->di_flags2; + + to->di_ino = ip->i_ino; + to->di_lsn = lsn; + memset(to->di_pad2, 0, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); + to->di_flushiter = 0; + } else { + to->di_flushiter = from->di_flushiter; + } +} + +/* + * Format the inode core. Current timestamp data is only in the VFS inode + * fields, so we need to grab them from there. Hence rather than just copying + * the XFS inode core structure, format the fields directly into the iovec. + */ +static void +xfs_inode_item_format_core( + struct xfs_inode *ip, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp) +{ + struct xfs_log_dinode *dic; + + dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE); + xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn); + xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_d.di_version)); +} + /* * This is called to fill in the vector of log iovecs for the given inode * log item. It fills the first item with an inode log format structure, @@ -351,10 +426,7 @@ xfs_inode_item_format( ilf->ilf_size = 2; /* format + core */ xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format)); - xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE, - &ip->i_d, - xfs_icdinode_size(ip->i_d.di_version)); - + xfs_inode_item_format_core(ip, lv, &vecp); xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp); if (XFS_IFORK_Q(ip)) { xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 478d04e07f95..bcb6c19ce3ea 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -114,7 +114,7 @@ xfs_find_handle( handle.ha_fid.fid_len = sizeof(xfs_fid_t) - sizeof(handle.ha_fid.fid_len); handle.ha_fid.fid_pad = 0; - handle.ha_fid.fid_gen = ip->i_d.di_gen; + handle.ha_fid.fid_gen = inode->i_generation; handle.ha_fid.fid_ino = ip->i_ino; hsize = XFS_HSIZE(handle); @@ -963,7 +963,7 @@ xfs_set_diflags( di_flags |= XFS_DIFLAG_NODEFRAG; if (xflags & FS_XFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; - if (S_ISDIR(ip->i_d.di_mode)) { + if (S_ISDIR(VFS_I(ip)->i_mode)) { if (xflags & FS_XFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; if (xflags & FS_XFLAG_NOSYMLINKS) @@ -972,7 +972,7 @@ xfs_set_diflags( di_flags |= XFS_DIFLAG_EXTSZINHERIT; if (xflags & FS_XFLAG_PROJINHERIT) di_flags |= XFS_DIFLAG_PROJINHERIT; - } else if (S_ISREG(ip->i_d.di_mode)) { + } else if (S_ISREG(VFS_I(ip)->i_mode)) { if (xflags & FS_XFLAG_REALTIME) di_flags |= XFS_DIFLAG_REALTIME; if (xflags & FS_XFLAG_EXTSIZE) @@ -1060,23 +1060,86 @@ xfs_ioctl_setattr_xflags( } /* + * If we are changing DAX flags, we have to ensure the file is clean and any + * cached objects in the address space are invalidated and removed. This + * requires us to lock out other IO and page faults similar to a truncate + * operation. The locks need to be held until the transaction has been committed + * so that the cache invalidation is atomic with respect to the DAX flag + * manipulation. + */ +static int +xfs_ioctl_setattr_dax_invalidate( + struct xfs_inode *ip, + struct fsxattr *fa, + int *join_flags) +{ + struct inode *inode = VFS_I(ip); + int error; + + *join_flags = 0; + + /* + * It is only valid to set the DAX flag on regular files and + * directories on filesystems where the block size is equal to the page + * size. On directories it serves as an inherit hint. 
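The per-inode DAX flag this validation guards is driven from userspace through the FSSETXATTR interface. A minimal sketch of toggling the flag on a file follows (error handling abbreviated; the kernel side rejects the request unless the target is a regular file or directory and the block size matches the page size):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

#ifndef FS_XFLAG_DAX		/* added to linux/fs.h in v4.5 */
#define FS_XFLAG_DAX	0x00008000
#endif

int main(int argc, char **argv)
{
	struct fsxattr fsx;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, FS_IOC_FSGETXATTR, &fsx) < 0) {
		perror(argv[1]);
		return 1;
	}
	fsx.fsx_xflags ^= FS_XFLAG_DAX;		/* toggle the per-inode hint */
	if (ioctl(fd, FS_IOC_FSSETXATTR, &fsx) < 0)
		perror("FS_IOC_FSSETXATTR");
	close(fd);
	return 0;
}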
+ */ + if (fa->fsx_xflags & FS_XFLAG_DAX) { + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) + return -EINVAL; + if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE) + return -EINVAL; + } + + /* If the DAX state is not changing, we have nothing to do here. */ + if ((fa->fsx_xflags & FS_XFLAG_DAX) && IS_DAX(inode)) + return 0; + if (!(fa->fsx_xflags & FS_XFLAG_DAX) && !IS_DAX(inode)) + return 0; + + /* lock, flush and invalidate mapping in preparation for flag change */ + xfs_ilock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL); + error = filemap_write_and_wait(inode->i_mapping); + if (error) + goto out_unlock; + error = invalidate_inode_pages2(inode->i_mapping); + if (error) + goto out_unlock; + + *join_flags = XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL; + return 0; + +out_unlock: + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL); + return error; + +} + +/* * Set up the transaction structure for the setattr operation, checking that we * have permission to do so. On success, return a clean transaction and the * inode locked exclusively ready for further operation specific checks. On * failure, return an error without modifying or locking the inode. + * + * The inode might already be IO locked on call. If this is the case, it is + * indicated in @join_flags and we take full responsibility for ensuring they + * are unlocked from now on. Hence if we have an error here, we still have to + * unlock them. Otherwise, once they are joined to the transaction, they will + * be unlocked on commit/cancel. */ static struct xfs_trans * xfs_ioctl_setattr_get_trans( - struct xfs_inode *ip) + struct xfs_inode *ip, + int join_flags) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; - int error; + int error = -EROFS; if (mp->m_flags & XFS_MOUNT_RDONLY) - return ERR_PTR(-EROFS); + goto out_unlock; + error = -EIO; if (XFS_FORCED_SHUTDOWN(mp)) - return ERR_PTR(-EIO); + goto out_unlock; tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); @@ -1084,7 +1147,8 @@ xfs_ioctl_setattr_get_trans( goto out_cancel; xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags); + join_flags = 0; /* * CAP_FOWNER overrides the following restrictions: @@ -1104,6 +1168,9 @@ xfs_ioctl_setattr_get_trans( out_cancel: xfs_trans_cancel(tp); +out_unlock: + if (join_flags) + xfs_iunlock(ip, join_flags); return ERR_PTR(error); } @@ -1128,14 +1195,14 @@ xfs_ioctl_setattr_check_extsize( { struct xfs_mount *mp = ip->i_mount; - if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode)) + if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(VFS_I(ip)->i_mode)) return -EINVAL; if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) && - !S_ISDIR(ip->i_d.di_mode)) + !S_ISDIR(VFS_I(ip)->i_mode)) return -EINVAL; - if (S_ISREG(ip->i_d.di_mode) && ip->i_d.di_nextents && + if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_d.di_nextents && ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize)) return -EINVAL; @@ -1202,6 +1269,7 @@ xfs_ioctl_setattr( struct xfs_dquot *pdqp = NULL; struct xfs_dquot *olddquot = NULL; int code; + int join_flags = 0; trace_xfs_ioctl_setattr(ip); @@ -1225,7 +1293,18 @@ xfs_ioctl_setattr( return code; } - tp = xfs_ioctl_setattr_get_trans(ip); + /* + * Changing DAX config may require inode locking for mapping + * invalidation. 
These need to be held all the way to transaction commit + * or cancel time, so need to be passed through to + * xfs_ioctl_setattr_get_trans() so it can apply them to the join call + * appropriately. + */ + code = xfs_ioctl_setattr_dax_invalidate(ip, fa, &join_flags); + if (code) + goto error_free_dquots; + + tp = xfs_ioctl_setattr_get_trans(ip, join_flags); if (IS_ERR(tp)) { code = PTR_ERR(tp); goto error_free_dquots; @@ -1256,9 +1335,9 @@ xfs_ioctl_setattr( * successful return from chown() */ - if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && + if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) && !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID)) - ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); + VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID); /* Change the ownerships and register project quota modifications */ if (xfs_get_projid(ip) != fa->fsx_projid) { @@ -1341,6 +1420,7 @@ xfs_ioc_setxflags( struct xfs_trans *tp; struct fsxattr fa; unsigned int flags; + int join_flags = 0; int error; if (copy_from_user(&flags, arg, sizeof(flags))) @@ -1357,7 +1437,18 @@ xfs_ioc_setxflags( if (error) return error; - tp = xfs_ioctl_setattr_get_trans(ip); + /* + * Changing DAX config may require inode locking for mapping + * invalidation. These need to be held all the way to transaction commit + * or cancel time, so need to be passed through to + * xfs_ioctl_setattr_get_trans() so it can apply them to the join call + * appropriately. + */ + error = xfs_ioctl_setattr_dax_invalidate(ip, &fa, &join_flags); + if (error) + goto out_drop_write; + + tp = xfs_ioctl_setattr_get_trans(ip, join_flags); if (IS_ERR(tp)) { error = PTR_ERR(tp); goto out_drop_write; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 76b71a1c6c32..fb7dc61f4a29 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -459,8 +459,8 @@ xfs_vn_getattr( stat->size = XFS_ISIZE(ip); stat->dev = inode->i_sb->s_dev; - stat->mode = ip->i_d.di_mode; - stat->nlink = ip->i_d.di_nlink; + stat->mode = inode->i_mode; + stat->nlink = inode->i_nlink; stat->uid = inode->i_uid; stat->gid = inode->i_gid; stat->ino = ip->i_ino; @@ -506,9 +506,6 @@ xfs_setattr_mode( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ip->i_d.di_mode &= S_IFMT; - ip->i_d.di_mode |= mode & ~S_IFMT; - inode->i_mode &= S_IFMT; inode->i_mode |= mode & ~S_IFMT; } @@ -522,21 +519,12 @@ xfs_setattr_time( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (iattr->ia_valid & ATTR_ATIME) { + if (iattr->ia_valid & ATTR_ATIME) inode->i_atime = iattr->ia_atime; - ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; - ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; - } - if (iattr->ia_valid & ATTR_CTIME) { + if (iattr->ia_valid & ATTR_CTIME) inode->i_ctime = iattr->ia_ctime; - ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; - } - if (iattr->ia_valid & ATTR_MTIME) { + if (iattr->ia_valid & ATTR_MTIME) inode->i_mtime = iattr->ia_mtime; - ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - } } int @@ -661,9 +649,9 @@ xfs_setattr_nonsize( * The set-user-ID and set-group-ID bits of a file will be * cleared upon successful return from chown() */ - if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && + if ((inode->i_mode & (S_ISUID|S_ISGID)) && !capable(CAP_FSETID)) - ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); + inode->i_mode &= ~(S_ISUID|S_ISGID); /* * Change the ownerships and register quota modifications @@ -773,7 +761,7 @@ xfs_setattr_size( ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); - 
ASSERT(S_ISREG(ip->i_d.di_mode)); + ASSERT(S_ISREG(inode->i_mode)); ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); @@ -991,21 +979,13 @@ xfs_vn_update_time( } xfs_ilock(ip, XFS_ILOCK_EXCL); - if (flags & S_CTIME) { + if (flags & S_CTIME) inode->i_ctime = *now; - ip->i_d.di_ctime.t_sec = (__int32_t)now->tv_sec; - ip->i_d.di_ctime.t_nsec = (__int32_t)now->tv_nsec; - } - if (flags & S_MTIME) { + if (flags & S_MTIME) inode->i_mtime = *now; - ip->i_d.di_mtime.t_sec = (__int32_t)now->tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)now->tv_nsec; - } - if (flags & S_ATIME) { + if (flags & S_ATIME) inode->i_atime = *now; - ip->i_d.di_atime.t_sec = (__int32_t)now->tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)now->tv_nsec; - } + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); return xfs_trans_commit(tp); @@ -1205,8 +1185,10 @@ xfs_diflags_to_iflags( inode->i_flags |= S_SYNC; if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; - if (ip->i_mount->m_flags & XFS_MOUNT_DAX || - ip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + if (S_ISREG(inode->i_mode) && + ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE && + (ip->i_mount->m_flags & XFS_MOUNT_DAX || + ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) inode->i_flags |= S_DAX; } @@ -1232,8 +1214,6 @@ xfs_setup_inode( /* make the inode look hashed for the writeback code */ hlist_add_fake(&inode->i_hash); - inode->i_mode = ip->i_d.di_mode; - set_nlink(inode, ip->i_d.di_nlink); inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid); inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid); @@ -1249,14 +1229,7 @@ xfs_setup_inode( break; } - inode->i_generation = ip->i_d.di_gen; i_size_write(inode, ip->i_d.di_size); - inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; - inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; - inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; - inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; - inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; - inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; xfs_diflags_to_iflags(inode, ip); ip->d_ops = ip->i_mount->m_nondir_inode_ops; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 930ebd86beba..ce73eb34620d 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -57,6 +57,7 @@ xfs_bulkstat_one_int( { struct xfs_icdinode *dic; /* dinode core info pointer */ struct xfs_inode *ip; /* incore inode pointer */ + struct inode *inode; struct xfs_bstat *buf; /* return buffer */ int error = 0; /* error value */ @@ -77,30 +78,33 @@ xfs_bulkstat_one_int( ASSERT(ip != NULL); ASSERT(ip->i_imap.im_blkno != 0); + inode = VFS_I(ip); dic = &ip->i_d; /* xfs_iget returns the following without needing * further change. 
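Dropping di_mode, di_nlink, di_gen and the timestamps from the XFS-side structures works because the VFS inode is embedded in struct xfs_inode, so either view is reachable from the other and only one copy of the data needs to exist. A self-contained sketch of that embedding pattern (names shortened and fields invented for illustration):

#include <stddef.h>
#include <stdio.h>

/* userspace stand-ins for the kernel structures */
struct inode {
	unsigned int	i_mode;
	unsigned int	i_nlink;
};

struct xfs_inode_demo {
	unsigned long	i_flags;
	struct inode	i_vnode;	/* embedded VFS inode */
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static struct inode *VFS_I(struct xfs_inode_demo *ip)
{
	return &ip->i_vnode;
}

static struct xfs_inode_demo *XFS_I(struct inode *inode)
{
	return container_of(inode, struct xfs_inode_demo, i_vnode);
}

int main(void)
{
	struct xfs_inode_demo xip = {
		.i_vnode = { .i_mode = 0100644, .i_nlink = 1 },
	};
	struct inode *vp = VFS_I(&xip);

	/* one copy of mode/nlink; no di_mode/di_nlink shadow to keep in sync */
	printf("mode %o, round trip ok: %d\n", vp->i_mode, XFS_I(vp) == &xip);
	return 0;
}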
*/ - buf->bs_nlink = dic->di_nlink; buf->bs_projid_lo = dic->di_projid_lo; buf->bs_projid_hi = dic->di_projid_hi; buf->bs_ino = ino; - buf->bs_mode = dic->di_mode; buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; - buf->bs_atime.tv_sec = dic->di_atime.t_sec; - buf->bs_atime.tv_nsec = dic->di_atime.t_nsec; - buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; - buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; - buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; - buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec; + + buf->bs_nlink = inode->i_nlink; + buf->bs_atime.tv_sec = inode->i_atime.tv_sec; + buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec; + buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec; + buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec; + buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec; + buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec; + buf->bs_gen = inode->i_generation; + buf->bs_mode = inode->i_mode; + buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; buf->bs_extents = dic->di_nextents; - buf->bs_gen = dic->di_gen; memset(buf->bs_pad, 0, sizeof(buf->bs_pad)); buf->bs_dmevmask = dic->di_dmevmask; buf->bs_dmstate = dic->di_dmstate; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 9c9a1c9bcc7f..b49ccf5c1d75 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1212,7 +1212,7 @@ xlog_iodone(xfs_buf_t *bp) } /* log I/O is always issued ASYNC */ - ASSERT(XFS_BUF_ISASYNC(bp)); + ASSERT(bp->b_flags & XBF_ASYNC); xlog_state_done_syncing(iclog, aborted); /* @@ -1864,9 +1864,8 @@ xlog_sync( bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; - XFS_BUF_ZEROFLAGS(bp); - XFS_BUF_ASYNC(bp); - bp->b_flags |= XBF_SYNCIO; + bp->b_flags &= ~(XBF_FUA | XBF_FLUSH); + bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE); if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { bp->b_flags |= XBF_FUA; @@ -1893,12 +1892,11 @@ xlog_sync( /* account for log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); + /* * Don't call xfs_bwrite here. We do log-syncs even when the filesystem * is shutting down. 
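The buffer flag cleanup visible here replaces one-off wrapper macros (XFS_BUF_ASYNC, XFS_BUF_ZEROFLAGS, and friends) with direct b_flags manipulation. A toy, runnable version of the open-coded style used by the xlog_sync() hunks; the bit values are invented, the real ones live in fs/xfs/xfs_buf.h:

#include <stdio.h>

/* invented stand-ins for the XBF_* bits */
#define XBF_WRITE	(1u << 1)
#define XBF_ASYNC	(1u << 4)
#define XBF_DONE	(1u << 5)
#define XBF_SYNCIO	(1u << 10)
#define XBF_FUA		(1u << 11)
#define XBF_FLUSH	(1u << 12)

struct demo_buf { unsigned int b_flags; };

/* roughly what xlog_sync() now does in place of the old macros */
static void demo_prep_log_write(struct demo_buf *bp, int barriers)
{
	bp->b_flags &= ~(XBF_FUA | XBF_FLUSH);	/* drop stale write hints */
	bp->b_flags |= XBF_ASYNC | XBF_SYNCIO | XBF_WRITE;
	if (barriers)
		bp->b_flags |= XBF_FUA;		/* force unit access on log I/O */
}

int main(void)
{
	struct demo_buf bp = { .b_flags = XBF_DONE | XBF_FUA };

	demo_prep_log_write(&bp, 1);
	printf("b_flags = %#x\n", bp.b_flags);
	return 0;
}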
*/ - XFS_BUF_WRITE(bp); - error = xlog_bdstrat(bp); if (error) { xfs_buf_ioerror_alert(bp, "xlog_sync"); @@ -1910,9 +1908,8 @@ xlog_sync( xfs_buf_associate_memory(bp, (char *)&iclog->ic_header + count, split); bp->b_fspriv = iclog; - XFS_BUF_ZEROFLAGS(bp); - XFS_BUF_ASYNC(bp); - bp->b_flags |= XBF_SYNCIO; + bp->b_flags &= ~(XBF_FUA | XBF_FLUSH); + bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE); if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) bp->b_flags |= XBF_FUA; @@ -1921,7 +1918,6 @@ xlog_sync( /* account for internal log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); - XFS_BUF_WRITE(bp); error = xlog_bdstrat(bp); if (error) { xfs_buf_ioerror_alert(bp, "xlog_sync (split)"); @@ -2012,77 +2008,81 @@ xlog_print_tic_res( uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); /* match with XLOG_REG_TYPE_* in xfs_log.h */ - static char *res_type_str[XLOG_REG_TYPE_MAX] = { - "bformat", - "bchunk", - "efi_format", - "efd_format", - "iformat", - "icore", - "iext", - "ibroot", - "ilocal", - "iattr_ext", - "iattr_broot", - "iattr_local", - "qformat", - "dquot", - "quotaoff", - "LR header", - "unmount", - "commit", - "trans header" +#define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str + static char *res_type_str[XLOG_REG_TYPE_MAX + 1] = { + REG_TYPE_STR(BFORMAT, "bformat"), + REG_TYPE_STR(BCHUNK, "bchunk"), + REG_TYPE_STR(EFI_FORMAT, "efi_format"), + REG_TYPE_STR(EFD_FORMAT, "efd_format"), + REG_TYPE_STR(IFORMAT, "iformat"), + REG_TYPE_STR(ICORE, "icore"), + REG_TYPE_STR(IEXT, "iext"), + REG_TYPE_STR(IBROOT, "ibroot"), + REG_TYPE_STR(ILOCAL, "ilocal"), + REG_TYPE_STR(IATTR_EXT, "iattr_ext"), + REG_TYPE_STR(IATTR_BROOT, "iattr_broot"), + REG_TYPE_STR(IATTR_LOCAL, "iattr_local"), + REG_TYPE_STR(QFORMAT, "qformat"), + REG_TYPE_STR(DQUOT, "dquot"), + REG_TYPE_STR(QUOTAOFF, "quotaoff"), + REG_TYPE_STR(LRHEADER, "LR header"), + REG_TYPE_STR(UNMOUNT, "unmount"), + REG_TYPE_STR(COMMIT, "commit"), + REG_TYPE_STR(TRANSHDR, "trans header"), + REG_TYPE_STR(ICREATE, "inode create") }; +#undef REG_TYPE_STR +#define TRANS_TYPE_STR(type) [XFS_TRANS_##type] = #type static char *trans_type_str[XFS_TRANS_TYPE_MAX] = { - "SETATTR_NOT_SIZE", - "SETATTR_SIZE", - "INACTIVE", - "CREATE", - "CREATE_TRUNC", - "TRUNCATE_FILE", - "REMOVE", - "LINK", - "RENAME", - "MKDIR", - "RMDIR", - "SYMLINK", - "SET_DMATTRS", - "GROWFS", - "STRAT_WRITE", - "DIOSTRAT", - "WRITE_SYNC", - "WRITEID", - "ADDAFORK", - "ATTRINVAL", - "ATRUNCATE", - "ATTR_SET", - "ATTR_RM", - "ATTR_FLAG", - "CLEAR_AGI_BUCKET", - "QM_SBCHANGE", - "DUMMY1", - "DUMMY2", - "QM_QUOTAOFF", - "QM_DQALLOC", - "QM_SETQLIM", - "QM_DQCLUSTER", - "QM_QINOCREATE", - "QM_QUOTAOFF_END", - "FSYNC_TS", - "GROWFSRT_ALLOC", - "GROWFSRT_ZERO", - "GROWFSRT_FREE", - "SWAPEXT", - "CHECKPOINT", - "ICREATE", - "CREATE_TMPFILE" + TRANS_TYPE_STR(SETATTR_NOT_SIZE), + TRANS_TYPE_STR(SETATTR_SIZE), + TRANS_TYPE_STR(INACTIVE), + TRANS_TYPE_STR(CREATE), + TRANS_TYPE_STR(CREATE_TRUNC), + TRANS_TYPE_STR(TRUNCATE_FILE), + TRANS_TYPE_STR(REMOVE), + TRANS_TYPE_STR(LINK), + TRANS_TYPE_STR(RENAME), + TRANS_TYPE_STR(MKDIR), + TRANS_TYPE_STR(RMDIR), + TRANS_TYPE_STR(SYMLINK), + TRANS_TYPE_STR(SET_DMATTRS), + TRANS_TYPE_STR(GROWFS), + TRANS_TYPE_STR(STRAT_WRITE), + TRANS_TYPE_STR(DIOSTRAT), + TRANS_TYPE_STR(WRITEID), + TRANS_TYPE_STR(ADDAFORK), + TRANS_TYPE_STR(ATTRINVAL), + TRANS_TYPE_STR(ATRUNCATE), + TRANS_TYPE_STR(ATTR_SET), + TRANS_TYPE_STR(ATTR_RM), + TRANS_TYPE_STR(ATTR_FLAG), + TRANS_TYPE_STR(CLEAR_AGI_BUCKET), + 
TRANS_TYPE_STR(SB_CHANGE), + TRANS_TYPE_STR(DUMMY1), + TRANS_TYPE_STR(DUMMY2), + TRANS_TYPE_STR(QM_QUOTAOFF), + TRANS_TYPE_STR(QM_DQALLOC), + TRANS_TYPE_STR(QM_SETQLIM), + TRANS_TYPE_STR(QM_DQCLUSTER), + TRANS_TYPE_STR(QM_QINOCREATE), + TRANS_TYPE_STR(QM_QUOTAOFF_END), + TRANS_TYPE_STR(FSYNC_TS), + TRANS_TYPE_STR(GROWFSRT_ALLOC), + TRANS_TYPE_STR(GROWFSRT_ZERO), + TRANS_TYPE_STR(GROWFSRT_FREE), + TRANS_TYPE_STR(SWAPEXT), + TRANS_TYPE_STR(CHECKPOINT), + TRANS_TYPE_STR(ICREATE), + TRANS_TYPE_STR(CREATE_TMPFILE) }; +#undef TRANS_TYPE_STR xfs_warn(mp, "xlog_write: reservation summary:"); xfs_warn(mp, " trans type = %s (%u)", ((ticket->t_trans_type <= 0 || ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? - "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), + "bad-trans-type" : trans_type_str[ticket->t_trans_type]), ticket->t_trans_type); xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res); @@ -2101,7 +2101,7 @@ xlog_print_tic_res( uint r_type = ticket->t_res_arr[i].r_type; xfs_warn(mp, "region[%u]: %s - %u bytes", i, ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? - "bad-rtype" : res_type_str[r_type-1]), + "bad-rtype" : res_type_str[r_type]), ticket->t_res_arr[i].r_len); } @@ -3979,7 +3979,7 @@ xfs_log_force_umount( log->l_flags & XLOG_ACTIVE_RECOVERY) { mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; if (mp->m_sb_bp) - XFS_BUF_DONE(mp->m_sb_bp); + mp->m_sb_bp->b_flags |= XBF_DONE; return 0; } @@ -4009,7 +4009,7 @@ xfs_log_force_umount( spin_lock(&log->l_icloglock); mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; if (mp->m_sb_bp) - XFS_BUF_DONE(mp->m_sb_bp); + mp->m_sb_bp->b_flags |= XBF_DONE; /* * Mark the log and the iclogs with IO error flags to prevent any diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index be5568839442..396565f43247 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -190,7 +190,7 @@ xlog_bread_noalign( ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); - XFS_BUF_READ(bp); + bp->b_flags |= XBF_READ; bp->b_io_length = nbblks; bp->b_error = 0; @@ -275,7 +275,6 @@ xlog_bwrite( ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); - XFS_BUF_ZEROFLAGS(bp); xfs_buf_hold(bp); xfs_buf_lock(bp); bp->b_io_length = nbblks; @@ -2538,6 +2537,13 @@ xlog_recover_validate_buf_type( } bp->b_ops = &xfs_sb_buf_ops; break; +#ifdef CONFIG_XFS_RT + case XFS_BLFT_RTBITMAP_BUF: + case XFS_BLFT_RTSUMMARY_BUF: + /* no magic numbers for verification of RT buffers */ + bp->b_ops = &xfs_rtbuf_ops; + break; +#endif /* CONFIG_XFS_RT */ default: xfs_warn(mp, "Unknown buffer type %d!", xfs_blft_from_flags(buf_f)); @@ -2858,7 +2864,7 @@ xfs_recover_inode_owner_change( return -ENOMEM; /* instantiate the inode */ - xfs_dinode_from_disk(&ip->i_d, dip); + xfs_inode_from_disk(ip, dip); ASSERT(ip->i_d.di_version >= 3); error = xfs_iformat_fork(ip, dip); @@ -2904,7 +2910,7 @@ xlog_recover_inode_pass2( int error; int attr_index; uint fields; - xfs_icdinode_t *dicp; + struct xfs_log_dinode *ldip; uint isize; int need_free = 0; @@ -2957,8 +2963,8 @@ xlog_recover_inode_pass2( error = -EFSCORRUPTED; goto out_release; } - dicp = item->ri_buf[1].i_addr; - if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { + ldip = item->ri_buf[1].i_addr; + if (unlikely(ldip->di_magic != XFS_DINODE_MAGIC)) { xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", __func__, item, in_f->ilf_ino); @@ -2994,13 +3000,13 @@ xlog_recover_inode_pass2( * to skip replay when the on disk inode is newer than the log one */ if 
(!xfs_sb_version_hascrc(&mp->m_sb) && - dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { /* * Deal with the wrap case, DI_MAX_FLUSH is less * than smaller numbers */ if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && - dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { + ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { /* do nothing */ } else { trace_xfs_log_recover_inode_skip(log, in_f); @@ -3010,13 +3016,13 @@ xlog_recover_inode_pass2( } /* Take the opportunity to reset the flush iteration count */ - dicp->di_flushiter = 0; + ldip->di_flushiter = 0; - if (unlikely(S_ISREG(dicp->di_mode))) { - if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && - (dicp->di_format != XFS_DINODE_FMT_BTREE)) { + if (unlikely(S_ISREG(ldip->di_mode))) { + if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && + (ldip->di_format != XFS_DINODE_FMT_BTREE)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", - XFS_ERRLEVEL_LOW, mp, dicp); + XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, "%s: Bad regular inode log record, rec ptr 0x%p, " "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", @@ -3024,12 +3030,12 @@ xlog_recover_inode_pass2( error = -EFSCORRUPTED; goto out_release; } - } else if (unlikely(S_ISDIR(dicp->di_mode))) { - if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && - (dicp->di_format != XFS_DINODE_FMT_BTREE) && - (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { + } else if (unlikely(S_ISDIR(ldip->di_mode))) { + if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && + (ldip->di_format != XFS_DINODE_FMT_BTREE) && + (ldip->di_format != XFS_DINODE_FMT_LOCAL)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", - XFS_ERRLEVEL_LOW, mp, dicp); + XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, "%s: Bad dir inode log record, rec ptr 0x%p, " "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", @@ -3038,32 +3044,32 @@ xlog_recover_inode_pass2( goto out_release; } } - if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ + if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", - XFS_ERRLEVEL_LOW, mp, dicp); + XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", __func__, item, dip, bp, in_f->ilf_ino, - dicp->di_nextents + dicp->di_anextents, - dicp->di_nblocks); + ldip->di_nextents + ldip->di_anextents, + ldip->di_nblocks); error = -EFSCORRUPTED; goto out_release; } - if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { + if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", - XFS_ERRLEVEL_LOW, mp, dicp); + XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, - item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); + item, dip, bp, in_f->ilf_ino, ldip->di_forkoff); error = -EFSCORRUPTED; goto out_release; } - isize = xfs_icdinode_size(dicp->di_version); + isize = xfs_log_dinode_size(ldip->di_version); if (unlikely(item->ri_buf[1].i_len > isize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", - XFS_ERRLEVEL_LOW, mp, dicp); + XFS_ERRLEVEL_LOW, mp, ldip); xfs_alert(mp, "%s: Bad inode log record length %d, rec ptr 0x%p", __func__, item->ri_buf[1].i_len, item); @@ -3071,8 +3077,8 @@ xlog_recover_inode_pass2( goto out_release; } - /* The core is in in-core format */ - xfs_dinode_to_disk(dip, dicp); + /* recover the log dinode inode into the on disk 
inode */ + xfs_log_dinode_to_disk(ldip, dip); /* the rest is in on-disk format */ if (item->ri_buf[1].i_len > isize) { @@ -4402,8 +4408,8 @@ xlog_recover_process_one_iunlink( if (error) goto fail_iput; - ASSERT(ip->i_d.di_nlink == 0); - ASSERT(ip->i_d.di_mode != 0); + ASSERT(VFS_I(ip)->i_nlink == 0); + ASSERT(VFS_I(ip)->i_mode != 0); /* setup for the next pass */ agino = be32_to_cpu(dip->di_next_unlinked); @@ -4957,6 +4963,7 @@ xlog_do_recover( xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { + struct xfs_mount *mp = log->l_mp; int error; xfs_buf_t *bp; xfs_sb_t *sbp; @@ -4971,7 +4978,7 @@ xlog_do_recover( /* * If IO errors happened during recovery, bail out. */ - if (XFS_FORCED_SHUTDOWN(log->l_mp)) { + if (XFS_FORCED_SHUTDOWN(mp)) { return -EIO; } @@ -4984,22 +4991,21 @@ xlog_do_recover( * or iunlinks they will have some entries in the AIL; so we look at * the AIL to determine how to set the tail_lsn. */ - xlog_assign_tail_lsn(log->l_mp); + xlog_assign_tail_lsn(mp); /* * Now that we've finished replaying all buffer and inode * updates, re-read in the superblock and reverify it. */ - bp = xfs_getsb(log->l_mp, 0); - XFS_BUF_UNDONE(bp); - ASSERT(!(XFS_BUF_ISWRITE(bp))); - XFS_BUF_READ(bp); - XFS_BUF_UNASYNC(bp); + bp = xfs_getsb(mp, 0); + bp->b_flags &= ~(XBF_DONE | XBF_ASYNC); + ASSERT(!(bp->b_flags & XBF_WRITE)); + bp->b_flags |= XBF_READ; bp->b_ops = &xfs_sb_buf_ops; error = xfs_buf_submit_wait(bp); if (error) { - if (!XFS_FORCED_SHUTDOWN(log->l_mp)) { + if (!XFS_FORCED_SHUTDOWN(mp)) { xfs_buf_ioerror_alert(bp, __func__); ASSERT(0); } @@ -5008,14 +5014,17 @@ xlog_do_recover( } /* Convert superblock from on-disk format */ - sbp = &log->l_mp->m_sb; + sbp = &mp->m_sb; xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); - ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); - ASSERT(xfs_sb_good_version(sbp)); - xfs_reinit_percpu_counters(log->l_mp); - xfs_buf_relse(bp); + /* re-initialise in-core superblock and geometry structures */ + xfs_reinit_percpu_counters(mp); + error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); + if (error) { + xfs_warn(mp, "Failed post-recovery per-ag init: %d", error); + return error; + } xlog_recover_check_summary(log); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bb753b359bee..536a0ee9cd5a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -185,9 +185,6 @@ xfs_initialize_perag( xfs_agnumber_t index; xfs_agnumber_t first_initialised = 0; xfs_perag_t *pag; - xfs_agino_t agino; - xfs_ino_t ino; - xfs_sb_t *sbp = &mp->m_sb; int error = -ENOMEM; /* @@ -230,22 +227,7 @@ xfs_initialize_perag( radix_tree_preload_end(); } - /* - * If we mount with the inode64 option, or no inode overflows - * the legacy 32-bit address space clear the inode32 option. 
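The block removed from xfs_initialize_perag() computed the highest inode number the filesystem geometry can produce and cleared inode32 when everything fits in 32 bits; that policy now lives in xfs_set_inode_alloc() so mount and remount share it. A toy model of the overflow test follows; the shift widths are invented for illustration, whereas real XFS derives them from the superblock:

#include <stdint.h>
#include <stdio.h>

/*
 * An XFS inode number packs (AG, block-in-AG, inode-in-block) into one
 * integer, so geometry fixes the largest possible inode number.
 */
#define DEMO_INOPBLOG		4	/* log2(inodes per block) */
#define DEMO_AGBLKLOG		20	/* log2(blocks per AG) */
#define DEMO_MAXINUMBER_32	0xffffffffULL

static uint64_t demo_max_ino(uint32_t agcount)
{
	uint64_t agino = (((1ULL << DEMO_AGBLKLOG) - 1) << DEMO_INOPBLOG) |
			 ((1u << DEMO_INOPBLOG) - 1);

	return ((uint64_t)(agcount - 1) << (DEMO_AGBLKLOG + DEMO_INOPBLOG)) |
	       agino;
}

int main(void)
{
	uint32_t agcount = 4096;
	int small_inums = 1;	/* user passed the inode32 mount option */
	int inode32 = small_inums &&
		      demo_max_ino(agcount) > DEMO_MAXINUMBER_32;

	printf("max ino %#llx -> %s allocator\n",
	       (unsigned long long)demo_max_ino(agcount),
	       inode32 ? "inode32" : "inode64");
	return 0;
}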
- */ - agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); - ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); - - if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32) - mp->m_flags |= XFS_MOUNT_32BITINODES; - else - mp->m_flags &= ~XFS_MOUNT_32BITINODES; - - if (mp->m_flags & XFS_MOUNT_32BITINODES) - index = xfs_set_inode32(mp, agcount); - else - index = xfs_set_inode64(mp, agcount); + index = xfs_set_inode_alloc(mp, agcount); if (maxagi) *maxagi = index; @@ -865,7 +847,7 @@ xfs_mountfs( ASSERT(rip != NULL); - if (unlikely(!S_ISDIR(rip->i_d.di_mode))) { + if (unlikely(!S_ISDIR(VFS_I(rip)->i_mode))) { xfs_warn(mp, "corrupted root inode %llu: not a directory", (unsigned long long)rip->i_ino); xfs_iunlock(rip, XFS_ILOCK_EXCL); @@ -1284,7 +1266,7 @@ xfs_getsb( } xfs_buf_hold(bp); - ASSERT(XFS_BUF_ISDONE(bp)); + ASSERT(bp->b_flags & XBF_DONE); return bp; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b57098481c10..bac6b3435591 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -147,6 +147,17 @@ typedef struct xfs_mount { * to various other kinds of pain inflicted on the pNFS server. */ __uint32_t m_generation; + +#ifdef DEBUG + /* + * DEBUG mode instrumentation to test and/or trigger delayed allocation + * block killing in the event of failed writes. When enabled, all + * buffered writes are forced to fail. All delalloc blocks in the range + * of the write (including pre-existing delalloc blocks!) are tossed as + * part of the write failure error handling sequence. + */ + bool m_fail_writes; +#endif } xfs_mount_t; /* @@ -166,9 +177,8 @@ typedef struct xfs_mount { #define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ #define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ #define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ -#define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above - * 32 bits in size */ -#define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* users wants 32bit inodes */ +#define XFS_MOUNT_SMALL_INUMS (1ULL << 14) /* user wants 32bit inodes */ +#define XFS_MOUNT_32BITINODES (1ULL << 15) /* inode32 allocator active */ #define XFS_MOUNT_NOUUID (1ULL << 16) /* ignore uuid during mount */ #define XFS_MOUNT_BARRIER (1ULL << 17) #define XFS_MOUNT_IKEEP (1ULL << 18) /* keep empty inode clusters*/ @@ -264,6 +274,20 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); } +#ifdef DEBUG +static inline bool +xfs_mp_fail_writes(struct xfs_mount *mp) +{ + return mp->m_fail_writes; +} +#else +static inline bool +xfs_mp_fail_writes(struct xfs_mount *mp) +{ + return 0; +} +#endif + /* * Per-ag incore structure, copies of information in agf and agi, to improve the * performance of allocation group selection. @@ -327,7 +351,6 @@ extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved); extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); -extern int xfs_mount_log_sb(xfs_mount_t *); extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h new file mode 100644 index 000000000000..184c44effdd5 --- /dev/null +++ b/fs/xfs/xfs_ondisk.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2016 Oracle. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ONDISK_H +#define __XFS_ONDISK_H + +#define XFS_CHECK_STRUCT_SIZE(structname, size) \ + BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \ + #structname ") is wrong, expected " #size) + +static inline void __init +xfs_check_ondisk_structs(void) +{ + /* ag/file structures */ + XFS_CHECK_STRUCT_SIZE(struct xfs_acl, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12); + XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224); + XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36); + XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 336); + XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72); + XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176); + XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104); + XFS_CHECK_STRUCT_SIZE(struct xfs_dqblk, 136); + XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 264); + XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56); + XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8); + XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8); + XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8); + XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4); + + /* dir/attr trees */ + XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 88); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_rmt_hdr, 56); + XFS_CHECK_STRUCT_SIZE(struct xfs_da3_blkinfo, 56); + XFS_CHECK_STRUCT_SIZE(struct xfs_da3_intnode, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_da3_node_hdr, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_blk_hdr, 48); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_data_hdr, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free_hdr, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf_hdr, 64); + XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_entry_t, 8); + XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_hdr_t, 32); + XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4); + + /* + * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to + * 4 bytes anyway so it's not obviously a problem. Hence for the moment + * we don't check this structure. This can be re-instated when the attr + * definitions are updated to use c99 VLA definitions. 
+ * + XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12); + */ + + XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40); + XFS_CHECK_STRUCT_SIZE(xfs_attr_shortform_t, 8); + XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); + XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); + XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); + XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t, 6); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino8_t, 8); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_inou_t, 8); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); + XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t, 2); + + /* log structures */ + XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); + XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28); + XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 28); + XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_extent_32, 12); + XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176); + XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28); + XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52); + XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_64, 56); + XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); + XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); +} + +#endif /* __XFS_ONDISK_H */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 532ab79d38fe..be125e1758c1 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -560,6 +560,37 @@ xfs_qm_shrink_count( return list_lru_shrink_count(&qi->qi_lru, sc); } +STATIC void +xfs_qm_set_defquota( + xfs_mount_t *mp, + uint type, + xfs_quotainfo_t *qinf) +{ + xfs_dquot_t *dqp; + struct xfs_def_quota *defq; + int error; + + error = xfs_qm_dqread(mp, 0, type, XFS_QMOPT_DOWARN, &dqp); + + if (!error) { + xfs_disk_dquot_t *ddqp = &dqp->q_core; + + defq = xfs_get_defquota(dqp, qinf); + + /* + * Timers and warnings have been already set, let's just set the + * default limits for this quota type + */ + defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); + defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); + defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); + defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); + defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); + defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); + xfs_qm_dqdestroy(dqp); + } +} + /* * This initializes all the quota information that's kept in the * mount structure @@ -606,19 +637,19 @@ xfs_qm_init_quotainfo( * We try to get the limits from the superuser's limits fields. * This is quite hacky, but it is standard quota practice. * - * We look at the USR dquot with id == 0 first, but if user quotas - * are not enabled we goto the GRP dquot with id == 0. - * We don't really care to keep separate default limits for user - * and group quotas, at least not at this point. 
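The new fs/xfs/xfs_ondisk.h above turns on-disk format regressions into build failures via BUILD_BUG_ON_MSG, so a compiler or platform change that alters a structure size is caught at compile time rather than by corrupted filesystems. A minimal kernel-style sketch of the same trick on an invented structure:

#include <linux/bug.h>
#include <linux/init.h>
#include <linux/types.h>

/* invented on-disk structure, fixed at 16 bytes by its format */
struct demo_ondisk {
	__be32	magic;
	__be32	flags;
	__be64	owner;
};

static inline void __init demo_check_ondisk_structs(void)
{
	/* a size change now breaks the build instead of breaking disks */
	BUILD_BUG_ON_MSG(sizeof(struct demo_ondisk) != 16,
		"demo_ondisk size is wrong, expected 16");
}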
- * * Since we may not have done a quotacheck by this point, just read * the dquot without attaching it to any hashtables or lists. + * + * Timers and warnings are globally set by the first timer found in + * user/group/proj quota types, otherwise a default value is used. + * This should be split into different fields per quota type. */ error = xfs_qm_dqread(mp, 0, XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : XFS_DQ_PROJ), XFS_QMOPT_DOWARN, &dqp); + if (!error) { xfs_disk_dquot_t *ddqp = &dqp->q_core; @@ -639,13 +670,6 @@ xfs_qm_init_quotainfo( be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT; qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ? be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT; - qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); - qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); - qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); - qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); - qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); - qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); - xfs_qm_dqdestroy(dqp); } else { qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; @@ -656,6 +680,13 @@ xfs_qm_init_quotainfo( qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; } + if (XFS_IS_UQUOTA_RUNNING(mp)) + xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf); + if (XFS_IS_GQUOTA_RUNNING(mp)) + xfs_qm_set_defquota(mp, XFS_DQ_GROUP, qinf); + if (XFS_IS_PQUOTA_RUNNING(mp)) + xfs_qm_set_defquota(mp, XFS_DQ_PROJ, qinf); + qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; qinf->qi_shrinker.seeks = DEFAULT_SEEKS; diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 996a04064894..2975a822e9f0 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -53,6 +53,15 @@ extern struct kmem_zone *xfs_qm_dqtrxzone; */ #define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 +struct xfs_def_quota { + xfs_qcnt_t bhardlimit; /* default data blk hard limit */ + xfs_qcnt_t bsoftlimit; /* default data blk soft limit */ + xfs_qcnt_t ihardlimit; /* default inode count hard limit */ + xfs_qcnt_t isoftlimit; /* default inode count soft limit */ + xfs_qcnt_t rtbhardlimit; /* default realtime blk hard limit */ + xfs_qcnt_t rtbsoftlimit; /* default realtime blk soft limit */ +}; + /* * Various quota information for individual filesystems. * The mount structure keeps a pointer to this. 
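xfs_qm_set_defquota() above populates one of three per-type default-limit structures, replacing the single set of qi_*limit scalars, and callers select among them through xfs_get_defquota() (see the xfs_qm.h hunk below). A standalone sketch of that lookup shape, with demo types and only two limit fields:

#include <stdint.h>
#include <stdio.h>

/* trimmed-down model: one default-limit struct per quota type */
struct demo_def_quota {
	uint64_t	bhardlimit;
	uint64_t	bsoftlimit;
};

struct demo_quotainfo {
	struct demo_def_quota	usr_default;
	struct demo_def_quota	grp_default;
	struct demo_def_quota	prj_default;
};

enum demo_dqtype { DEMO_DQ_USER, DEMO_DQ_GROUP, DEMO_DQ_PROJ };

/* shape of xfs_get_defquota(): pick the defaults for a dquot's type */
static struct demo_def_quota *
demo_get_defquota(struct demo_quotainfo *qi, enum demo_dqtype type)
{
	switch (type) {
	case DEMO_DQ_USER:
		return &qi->usr_default;
	case DEMO_DQ_GROUP:
		return &qi->grp_default;
	default:
		return &qi->prj_default;
	}
}

int main(void)
{
	struct demo_quotainfo qi = { .prj_default = { .bhardlimit = 1000 } };

	printf("proj bhard = %llu\n", (unsigned long long)
	       demo_get_defquota(&qi, DEMO_DQ_PROJ)->bhardlimit);
	return 0;
}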
@@ -76,12 +85,9 @@ typedef struct xfs_quotainfo { struct mutex qi_quotaofflock;/* to serialize quotaoff */ xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ uint qi_dqperchunk; /* # ondisk dqs in above chunk */ - xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */ - xfs_qcnt_t qi_bsoftlimit; /* default data blk soft limit */ - xfs_qcnt_t qi_ihardlimit; /* default inode count hard limit */ - xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */ - xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */ - xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */ + struct xfs_def_quota qi_usr_default; + struct xfs_def_quota qi_grp_default; + struct xfs_def_quota qi_prj_default; struct shrinker qi_shrinker; } xfs_quotainfo_t; @@ -104,15 +110,15 @@ xfs_dquot_tree( } static inline struct xfs_inode * -xfs_dq_to_quota_inode(struct xfs_dquot *dqp) +xfs_quota_inode(xfs_mount_t *mp, uint dq_flags) { - switch (dqp->dq_flags & XFS_DQ_ALLTYPES) { + switch (dq_flags & XFS_DQ_ALLTYPES) { case XFS_DQ_USER: - return dqp->q_mount->m_quotainfo->qi_uquotaip; + return mp->m_quotainfo->qi_uquotaip; case XFS_DQ_GROUP: - return dqp->q_mount->m_quotainfo->qi_gquotaip; + return mp->m_quotainfo->qi_gquotaip; case XFS_DQ_PROJ: - return dqp->q_mount->m_quotainfo->qi_pquotaip; + return mp->m_quotainfo->qi_pquotaip; default: ASSERT(0); } @@ -164,11 +170,27 @@ extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint); /* quota ops */ extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); -extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, - uint, struct qc_dqblk *); +extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t *, + uint, struct qc_dqblk *, uint); extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, struct qc_dqblk *); extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint); +static inline struct xfs_def_quota * +xfs_get_defquota(struct xfs_dquot *dqp, struct xfs_quotainfo *qi) +{ + struct xfs_def_quota *defq; + + if (XFS_QM_ISUDQ(dqp)) + defq = &qi->qi_usr_default; + else if (XFS_QM_ISGDQ(dqp)) + defq = &qi->qi_grp_default; + else { + ASSERT(XFS_QM_ISPDQ(dqp)); + defq = &qi->qi_prj_default; + } + return defq; +} + #endif /* __XFS_QM_H__ */ diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 3640c6e896af..f4d0e0a8f517 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -404,6 +404,7 @@ xfs_qm_scall_setqlim( struct xfs_disk_dquot *ddq; struct xfs_dquot *dqp; struct xfs_trans *tp; + struct xfs_def_quota *defq; int error; xfs_qcnt_t hard, soft; @@ -431,6 +432,8 @@ xfs_qm_scall_setqlim( ASSERT(error != -ENOENT); goto out_unlock; } + + defq = xfs_get_defquota(dqp, q); xfs_dqunlock(dqp); tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); @@ -458,8 +461,8 @@ xfs_qm_scall_setqlim( ddq->d_blk_softlimit = cpu_to_be64(soft); xfs_dquot_set_prealloc_limits(dqp); if (id == 0) { - q->qi_bhardlimit = hard; - q->qi_bsoftlimit = soft; + defq->bhardlimit = hard; + defq->bsoftlimit = soft; } } else { xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft); @@ -474,8 +477,8 @@ xfs_qm_scall_setqlim( ddq->d_rtb_hardlimit = cpu_to_be64(hard); ddq->d_rtb_softlimit = cpu_to_be64(soft); if (id == 0) { - q->qi_rtbhardlimit = hard; - q->qi_rtbsoftlimit = soft; + defq->rtbhardlimit = hard; + defq->rtbsoftlimit = soft; } } else { xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft); @@ -491,8 +494,8 @@ xfs_qm_scall_setqlim( ddq->d_ino_hardlimit = 
cpu_to_be64(hard); ddq->d_ino_softlimit = cpu_to_be64(soft); if (id == 0) { - q->qi_ihardlimit = hard; - q->qi_isoftlimit = soft; + defq->ihardlimit = hard; + defq->isoftlimit = soft; } } else { xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft); @@ -635,9 +638,10 @@ out: int xfs_qm_scall_getquota( struct xfs_mount *mp, - xfs_dqid_t id, + xfs_dqid_t *id, uint type, - struct qc_dqblk *dst) + struct qc_dqblk *dst, + uint dqget_flags) { struct xfs_dquot *dqp; int error; @@ -647,7 +651,7 @@ xfs_qm_scall_getquota( * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't * exist, we'll get ENOENT back. */ - error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp); + error = xfs_qm_dqget(mp, NULL, *id, type, dqget_flags, &dqp); if (error) return error; @@ -660,6 +664,9 @@ xfs_qm_scall_getquota( goto out_put; } + /* Fill in the ID we actually read from disk */ + *id = be32_to_cpu(dqp->q_core.d_id); + memset(dst, 0, sizeof(*dst)); dst->d_spc_hardlimit = XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); @@ -701,7 +708,7 @@ xfs_qm_scall_getquota( if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) || (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) || (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) && - id != 0) { + *id != 0) { if ((dst->d_space > dst->d_spc_softlimit) && (dst->d_spc_softlimit > 0)) { ASSERT(dst->d_spc_timer != 0); diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 7795e0d01382..f82d79a8c694 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -231,14 +231,45 @@ xfs_fs_get_dqblk( struct qc_dqblk *qdq) { struct xfs_mount *mp = XFS_M(sb); + xfs_dqid_t id; if (!XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) return -ESRCH; - return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid), - xfs_quota_type(qid.type), qdq); + id = from_kqid(&init_user_ns, qid); + return xfs_qm_scall_getquota(mp, &id, + xfs_quota_type(qid.type), qdq, 0); +} + +/* Return quota info for active quota >= this qid */ +STATIC int +xfs_fs_get_nextdqblk( + struct super_block *sb, + struct kqid *qid, + struct qc_dqblk *qdq) +{ + int ret; + struct xfs_mount *mp = XFS_M(sb); + xfs_dqid_t id; + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -ESRCH; + + id = from_kqid(&init_user_ns, *qid); + ret = xfs_qm_scall_getquota(mp, &id, + xfs_quota_type(qid->type), qdq, + XFS_QMOPT_DQNEXT); + if (ret) + return ret; + + /* ID may be different, so convert back what we got */ + *qid = make_kqid(current_user_ns(), qid->type, id); + return 0; + } STATIC int @@ -267,5 +298,6 @@ const struct quotactl_ops xfs_quotactl_operations = { .quota_disable = xfs_quota_disable, .rm_xquota = xfs_fs_rm_xquota, .get_dqblk = xfs_fs_get_dqblk, + .get_nextdqblk = xfs_fs_get_nextdqblk, .set_dqblk = xfs_fs_set_dqblk, }; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index be02a68b2fe2..abf44435d04a 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1272,7 +1272,7 @@ xfs_rtpick_extent( ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); - seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime; + seqp = (__uint64_t *)&VFS_I(mp->m_rbmip)->i_atime; if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) { mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; *seqp = 0; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 59c9b7bd958d..d760934109b5 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -45,6 +45,7 @@ #include "xfs_filestream.h" #include "xfs_quota.h" #include "xfs_sysfs.h" +#include "xfs_ondisk.h" #include 
<linux/namei.h> #include <linux/init.h> @@ -65,83 +66,85 @@ static struct kset *xfs_kset; /* top-level xfs sysfs dir */ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #endif -#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ -#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ -#define MNTOPT_LOGDEV "logdev" /* log device */ -#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */ -#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */ -#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */ -#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */ -#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */ -#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */ -#define MNTOPT_SWIDTH "swidth" /* data volume stripe width */ -#define MNTOPT_NOUUID "nouuid" /* ignore filesystem UUID */ -#define MNTOPT_MTPT "mtpt" /* filesystem mount point */ -#define MNTOPT_GRPID "grpid" /* group-ID from parent directory */ -#define MNTOPT_NOGRPID "nogrpid" /* group-ID from current process */ -#define MNTOPT_BSDGROUPS "bsdgroups" /* group-ID from parent directory */ -#define MNTOPT_SYSVGROUPS "sysvgroups" /* group-ID from current process */ -#define MNTOPT_ALLOCSIZE "allocsize" /* preferred allocation size */ -#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */ -#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and - * unwritten extent conversion */ -#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */ -#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ -#define MNTOPT_32BITINODE "inode32" /* inode allocation limited to - * XFS_MAXINUMBER_32 */ -#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ -#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ -#define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */ -#define MNTOPT_NOLARGEIO "nolargeio" /* do not report large I/O sizes - * in stat(). */ -#define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */ -#define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */ -#define MNTOPT_FILESTREAM "filestreams" /* use filestreams allocator */ -#define MNTOPT_QUOTA "quota" /* disk quotas (user) */ -#define MNTOPT_NOQUOTA "noquota" /* no quotas */ -#define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */ -#define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */ -#define MNTOPT_PRJQUOTA "prjquota" /* project quota enabled */ -#define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */ -#define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */ -#define MNTOPT_PQUOTA "pquota" /* project quota (IRIX variant) */ -#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */ -#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ -#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ -#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ -#define MNTOPT_DISCARD "discard" /* Discard unused blocks */ -#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ - -#define MNTOPT_DAX "dax" /* Enable direct access to bdev pages */ - /* * Table driven mount option parser. - * - * Currently only used for remount, but it will be used for mount - * in the future, too. 
*/ enum { - Opt_barrier, - Opt_nobarrier, - Opt_inode64, - Opt_inode32, - Opt_err + Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_biosize, + Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid, + Opt_mtpt, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, + Opt_allocsize, Opt_norecovery, Opt_barrier, Opt_nobarrier, + Opt_inode64, Opt_inode32, Opt_ikeep, Opt_noikeep, + Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2, Opt_filestreams, + Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, + Opt_uquota, Opt_gquota, Opt_pquota, + Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, + Opt_discard, Opt_nodiscard, Opt_dax, Opt_err, }; static const match_table_t tokens = { - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_inode64, "inode64"}, - {Opt_inode32, "inode32"}, - {Opt_err, NULL} + {Opt_logbufs, "logbufs=%u"}, /* number of XFS log buffers */ + {Opt_logbsize, "logbsize=%s"}, /* size of XFS log buffers */ + {Opt_logdev, "logdev=%s"}, /* log device */ + {Opt_rtdev, "rtdev=%s"}, /* realtime I/O device */ + {Opt_biosize, "biosize=%u"}, /* log2 of preferred buffered io size */ + {Opt_wsync, "wsync"}, /* safe-mode nfs compatible mount */ + {Opt_noalign, "noalign"}, /* turn off stripe alignment */ + {Opt_swalloc, "swalloc"}, /* turn on stripe width allocation */ + {Opt_sunit, "sunit=%u"}, /* data volume stripe unit */ + {Opt_swidth, "swidth=%u"}, /* data volume stripe width */ + {Opt_nouuid, "nouuid"}, /* ignore filesystem UUID */ + {Opt_mtpt, "mtpt"}, /* filesystem mount point */ + {Opt_grpid, "grpid"}, /* group-ID from parent directory */ + {Opt_nogrpid, "nogrpid"}, /* group-ID from current process */ + {Opt_bsdgroups, "bsdgroups"}, /* group-ID from parent directory */ + {Opt_sysvgroups,"sysvgroups"}, /* group-ID from current process */ + {Opt_allocsize, "allocsize=%s"},/* preferred allocation size */ + {Opt_norecovery,"norecovery"}, /* don't run XFS recovery */ + {Opt_barrier, "barrier"}, /* use writer barriers for log write and + * unwritten extent conversion */ + {Opt_nobarrier, "nobarrier"}, /* .. disable */ + {Opt_inode64, "inode64"}, /* inodes can be allocated anywhere */ + {Opt_inode32, "inode32"}, /* inode allocation limited to + * XFS_MAXINUMBER_32 */ + {Opt_ikeep, "ikeep"}, /* do not free empty inode clusters */ + {Opt_noikeep, "noikeep"}, /* free empty inode clusters */ + {Opt_largeio, "largeio"}, /* report large I/O sizes in stat() */ + {Opt_nolargeio, "nolargeio"}, /* do not report large I/O sizes + * in stat(). 
+					 */
+	{Opt_attr2,	"attr2"},	/* do use attr2 attribute format */
+	{Opt_noattr2,	"noattr2"},	/* do not use attr2 attribute format */
+	{Opt_filestreams,"filestreams"},/* use filestreams allocator */
+	{Opt_quota,	"quota"},	/* disk quotas (user) */
+	{Opt_noquota,	"noquota"},	/* no quotas */
+	{Opt_usrquota,	"usrquota"},	/* user quota enabled */
+	{Opt_grpquota,	"grpquota"},	/* group quota enabled */
+	{Opt_prjquota,	"prjquota"},	/* project quota enabled */
+	{Opt_uquota,	"uquota"},	/* user quota (IRIX variant) */
+	{Opt_gquota,	"gquota"},	/* group quota (IRIX variant) */
+	{Opt_pquota,	"pquota"},	/* project quota (IRIX variant) */
+	{Opt_uqnoenforce,"uqnoenforce"},/* user quota limit enforcement */
+	{Opt_gqnoenforce,"gqnoenforce"},/* group quota limit enforcement */
+	{Opt_pqnoenforce,"pqnoenforce"},/* project quota limit enforcement */
+	{Opt_qnoenforce, "qnoenforce"},	/* same as uqnoenforce */
+	{Opt_discard,	"discard"},	/* Discard unused blocks */
+	{Opt_nodiscard,	"nodiscard"},	/* Do not discard unused blocks */
+
+	{Opt_dax,	"dax"},		/* Enable direct access to bdev pages */
+	{Opt_err,	NULL},
 };
 
 STATIC int
-suffix_kstrtoint(char *s, unsigned int base, int *res)
+suffix_kstrtoint(const substring_t *s, unsigned int base, int *res)
 {
 	int	last, shift_left_factor = 0, _res;
-	char	*value = s;
+	char	*value;
+	int	ret = 0;
+
+	value = match_strdup(s);
+	if (!value)
+		return -ENOMEM;
 
 	last = strlen(value) - 1;
 	if (value[last] == 'K' || value[last] == 'k') {
@@ -157,10 +160,11 @@ suffix_kstrtoint(char *s, unsigned int base, int *res)
 		value[last] = '\0';
 	}
 
-	if (kstrtoint(s, base, &_res))
-		return -EINVAL;
+	if (kstrtoint(value, base, &_res))
+		ret = -EINVAL;
+	kfree(value);
 
 	*res = _res << shift_left_factor;
-	return 0;
+	return ret;
 }
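As background for the suffix_kstrtoint() change above: the helper rewrites a trailing K/M/G to NUL and left-shifts the parsed value by the unit size. A minimal userspace sketch of the same logic, using plain strdup()/strtol() in place of the kernel's match_strdup()/kstrtoint() (the function name here is hypothetical, not XFS API):

#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical userspace analog of suffix_kstrtoint(): parse "32k",
 * "8m" or "1g" into an int by shifting for the unit suffix. */
static int parse_size_suffix(const char *s, int *res)
{
        char *value = strdup(s);        /* mirrors match_strdup() */
        int shift = 0;
        size_t last;
        char *end;
        long v;

        if (!value)
                return -ENOMEM;
        if (*value == '\0') {
                free(value);
                return -EINVAL;
        }

        last = strlen(value) - 1;
        switch (tolower((unsigned char)value[last])) {
        case 'g': shift = 30; value[last] = '\0'; break;
        case 'm': shift = 20; value[last] = '\0'; break;
        case 'k': shift = 10; value[last] = '\0'; break;
        }

        errno = 0;
        v = strtol(value, &end, 10);
        if (errno || end == value || *end != '\0') {
                free(value);
                return -EINVAL;
        }
        free(value);

        *res = (int)(v << shift);
        return 0;
}

int main(void)
{
        int v;

        if (parse_size_suffix("32k", &v) == 0)
                printf("%d\n", v);      /* prints 32768 */
        return 0;
}

The duplicate-then-free dance in the patch exists because the substring_t handed out by match_token() is not NUL-terminated, so it cannot be scribbled on in place.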
 
 /*
@@ -169,14 +173,19 @@ suffix_kstrtoint(char *s, unsigned int base, int *res)
  *
  * Note that this function leaks the various device name allocations on
  * failure.  The caller takes care of them.
+ *
+ * *sb is const because this is also used to test options on the remount
+ * path, and we don't want this to have any side effects at remount time.
+ * Today this function does not change *sb, but just to future-proof...
  */
 STATIC int
 xfs_parseargs(
 	struct xfs_mount	*mp,
 	char			*options)
 {
-	struct super_block	*sb = mp->m_super;
-	char			*this_char, *value;
+	const struct super_block *sb = mp->m_super;
+	char			*p;
+	substring_t		args[MAX_OPT_ARGS];
 	int			dsunit = 0;
 	int			dswidth = 0;
 	int			iosize = 0;
@@ -217,152 +226,152 @@ xfs_parseargs(
 	if (!options)
 		goto done;
 
-	while ((this_char = strsep(&options, ",")) != NULL) {
-		if (!*this_char)
+	while ((p = strsep(&options, ",")) != NULL) {
+		int		token;
+
+		if (!*p)
 			continue;
-		if ((value = strchr(this_char, '=')) != NULL)
-			*value++ = 0;
-		if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
-			if (!value || !*value) {
-				xfs_warn(mp, "%s option requires an argument",
-					this_char);
-				return -EINVAL;
-			}
-			if (kstrtoint(value, 10, &mp->m_logbufs))
-				return -EINVAL;
-		} else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
-			if (!value || !*value) {
-				xfs_warn(mp, "%s option requires an argument",
-					this_char);
-				return -EINVAL;
-			}
-			if (suffix_kstrtoint(value, 10, &mp->m_logbsize))
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_logbufs:
+			if (match_int(args, &mp->m_logbufs))
 				return -EINVAL;
-		} else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
-			if (!value || !*value) {
-				xfs_warn(mp, "%s option requires an argument",
-					this_char);
+			break;
+		case Opt_logbsize:
+			if (suffix_kstrtoint(args, 10, &mp->m_logbsize))
 				return -EINVAL;
-			}
-			mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+			break;
+		case Opt_logdev:
+			mp->m_logname = match_strdup(args);
 			if (!mp->m_logname)
 				return -ENOMEM;
-		} else if (!strcmp(this_char, MNTOPT_MTPT)) {
-			xfs_warn(mp, "%s option not allowed on this system",
-				this_char);
+			break;
+		case Opt_mtpt:
+			xfs_warn(mp, "%s option not allowed on this system", p);
 			return -EINVAL;
-		} else if (!strcmp(this_char, MNTOPT_RTDEV)) {
-			if (!value || !*value) {
-				xfs_warn(mp, "%s option requires an argument",
-					this_char);
-				return -EINVAL;
-			}
-			mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+		case Opt_rtdev:
+			mp->m_rtname = match_strdup(args);
 			if (!mp->m_rtname)
 				return -ENOMEM;
-		} else if (!strcmp(this_char, MNTOPT_ALLOCSIZE) ||
-			   !strcmp(this_char, MNTOPT_BIOSIZE)) {
-			if (!value || !*value) {
-				xfs_warn(mp, "%s option requires an argument",
-					this_char);
-				return -EINVAL;
-			}
-			if (suffix_kstrtoint(value, 10, &iosize))
+			break;
+		case Opt_allocsize:
+		case Opt_biosize:
+			if (suffix_kstrtoint(args, 10, &iosize))
 				return -EINVAL;
 			iosizelog = ffs(iosize) - 1;
-		} else if (!strcmp(this_char, MNTOPT_GRPID) ||
-			   !strcmp(this_char, MNTOPT_BSDGROUPS)) {
+			break;
+		case Opt_grpid:
+		case Opt_bsdgroups:
 			mp->m_flags |= XFS_MOUNT_GRPID;
-		} else if (!strcmp(this_char, MNTOPT_NOGRPID) ||
-			   !strcmp(this_char, MNTOPT_SYSVGROUPS)) {
+			break;
+		case Opt_nogrpid:
+		case Opt_sysvgroups:
 			mp->m_flags &= ~XFS_MOUNT_GRPID;
-		} else if (!strcmp(this_char, MNTOPT_WSYNC)) {
+			break;
+		case Opt_wsync:
 			mp->m_flags |= XFS_MOUNT_WSYNC;
-		} else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
+			break;
+		case Opt_norecovery:
 			mp->m_flags |= XFS_MOUNT_NORECOVERY;
-		} else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
+			break;
+		case Opt_noalign:
 			mp->m_flags |= XFS_MOUNT_NOALIGN;
-		} else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
+			break;
+		case Opt_swalloc:
 			mp->m_flags |= XFS_MOUNT_SWALLOC;
-		} else if (!strcmp(this_char, MNTOPT_SUNIT)) {
-			if (!value || !*value) {
-				xfs_warn(mp, "%s option requires an argument",
-					this_char);
-				return -EINVAL;
-			}
-			if (kstrtoint(value, 10, &dsunit))
-				return -EINVAL;
-		} else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
-			if (!value || !*value) {
xfs_warn(mp, "%s option requires an argument", - this_char); + break; + case Opt_sunit: + if (match_int(args, &dsunit)) return -EINVAL; - } - if (kstrtoint(value, 10, &dswidth)) + break; + case Opt_swidth: + if (match_int(args, &dswidth)) return -EINVAL; - } else if (!strcmp(this_char, MNTOPT_32BITINODE)) { + break; + case Opt_inode32: mp->m_flags |= XFS_MOUNT_SMALL_INUMS; - } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { + break; + case Opt_inode64: mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; - } else if (!strcmp(this_char, MNTOPT_NOUUID)) { + break; + case Opt_nouuid: mp->m_flags |= XFS_MOUNT_NOUUID; - } else if (!strcmp(this_char, MNTOPT_BARRIER)) { + break; + case Opt_barrier: mp->m_flags |= XFS_MOUNT_BARRIER; - } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) { + break; + case Opt_nobarrier: mp->m_flags &= ~XFS_MOUNT_BARRIER; - } else if (!strcmp(this_char, MNTOPT_IKEEP)) { + break; + case Opt_ikeep: mp->m_flags |= XFS_MOUNT_IKEEP; - } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { + break; + case Opt_noikeep: mp->m_flags &= ~XFS_MOUNT_IKEEP; - } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { + break; + case Opt_largeio: mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE; - } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) { + break; + case Opt_nolargeio: mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; - } else if (!strcmp(this_char, MNTOPT_ATTR2)) { + break; + case Opt_attr2: mp->m_flags |= XFS_MOUNT_ATTR2; - } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { + break; + case Opt_noattr2: mp->m_flags &= ~XFS_MOUNT_ATTR2; mp->m_flags |= XFS_MOUNT_NOATTR2; - } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { + break; + case Opt_filestreams: mp->m_flags |= XFS_MOUNT_FILESTREAMS; - } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { + break; + case Opt_noquota: mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE; - } else if (!strcmp(this_char, MNTOPT_QUOTA) || - !strcmp(this_char, MNTOPT_UQUOTA) || - !strcmp(this_char, MNTOPT_USRQUOTA)) { + break; + case Opt_quota: + case Opt_uquota: + case Opt_usrquota: mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | XFS_UQUOTA_ENFD); - } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) || - !strcmp(this_char, MNTOPT_UQUOTANOENF)) { + break; + case Opt_qnoenforce: + case Opt_uqnoenforce: mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); mp->m_qflags &= ~XFS_UQUOTA_ENFD; - } else if (!strcmp(this_char, MNTOPT_PQUOTA) || - !strcmp(this_char, MNTOPT_PRJQUOTA)) { + break; + case Opt_pquota: + case Opt_prjquota: mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | XFS_PQUOTA_ENFD); - } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { + break; + case Opt_pqnoenforce: mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); mp->m_qflags &= ~XFS_PQUOTA_ENFD; - } else if (!strcmp(this_char, MNTOPT_GQUOTA) || - !strcmp(this_char, MNTOPT_GRPQUOTA)) { + case Opt_gquota: + case Opt_grpquota: mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | XFS_GQUOTA_ENFD); - } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { + break; + case Opt_gqnoenforce: mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); mp->m_qflags &= ~XFS_GQUOTA_ENFD; - } else if (!strcmp(this_char, MNTOPT_DISCARD)) { + break; + case Opt_discard: mp->m_flags |= XFS_MOUNT_DISCARD; - } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { + break; + case Opt_nodiscard: mp->m_flags &= ~XFS_MOUNT_DISCARD; + break; #ifdef CONFIG_FS_DAX - } else if (!strcmp(this_char, MNTOPT_DAX)) { + case Opt_dax: mp->m_flags |= XFS_MOUNT_DAX; + break; #endif - } else { - 
xfs_warn(mp, "unknown mount option [%s].", this_char); + default: + xfs_warn(mp, "unknown mount option [%s].", p); return -EINVAL; } } @@ -461,25 +470,25 @@ xfs_showargs( { static struct proc_xfs_info xfs_info_set[] = { /* the few simple ones we can get from the mount struct */ - { XFS_MOUNT_IKEEP, "," MNTOPT_IKEEP }, - { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC }, - { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN }, - { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, - { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, - { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, - { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 }, - { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, - { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, - { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, - { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE }, - { XFS_MOUNT_DAX, "," MNTOPT_DAX }, + { XFS_MOUNT_IKEEP, ",ikeep" }, + { XFS_MOUNT_WSYNC, ",wsync" }, + { XFS_MOUNT_NOALIGN, ",noalign" }, + { XFS_MOUNT_SWALLOC, ",swalloc" }, + { XFS_MOUNT_NOUUID, ",nouuid" }, + { XFS_MOUNT_NORECOVERY, ",norecovery" }, + { XFS_MOUNT_ATTR2, ",attr2" }, + { XFS_MOUNT_FILESTREAMS, ",filestreams" }, + { XFS_MOUNT_GRPID, ",grpid" }, + { XFS_MOUNT_DISCARD, ",discard" }, + { XFS_MOUNT_SMALL_INUMS, ",inode32" }, + { XFS_MOUNT_DAX, ",dax" }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { /* the few simple ones we can get from the mount struct */ - { XFS_MOUNT_COMPAT_IOSIZE, "," MNTOPT_LARGEIO }, - { XFS_MOUNT_BARRIER, "," MNTOPT_NOBARRIER }, - { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_64BITINODE }, + { XFS_MOUNT_COMPAT_IOSIZE, ",largeio" }, + { XFS_MOUNT_BARRIER, ",nobarrier" }, + { XFS_MOUNT_SMALL_INUMS, ",inode64" }, { 0, NULL } }; struct proc_xfs_info *xfs_infop; @@ -494,46 +503,46 @@ xfs_showargs( } if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) - seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk", + seq_printf(m, ",allocsize=%dk", (int)(1 << mp->m_writeio_log) >> 10); if (mp->m_logbufs > 0) - seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs); + seq_printf(m, ",logbufs=%d", mp->m_logbufs); if (mp->m_logbsize > 0) - seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); + seq_printf(m, ",logbsize=%dk", mp->m_logbsize >> 10); if (mp->m_logname) - seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname); + seq_show_option(m, "logdev", mp->m_logname); if (mp->m_rtname) - seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname); + seq_show_option(m, "rtdev", mp->m_rtname); if (mp->m_dalign > 0) - seq_printf(m, "," MNTOPT_SUNIT "=%d", + seq_printf(m, ",sunit=%d", (int)XFS_FSB_TO_BB(mp, mp->m_dalign)); if (mp->m_swidth > 0) - seq_printf(m, "," MNTOPT_SWIDTH "=%d", + seq_printf(m, ",swidth=%d", (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD)) - seq_puts(m, "," MNTOPT_USRQUOTA); + seq_puts(m, ",usrquota"); else if (mp->m_qflags & XFS_UQUOTA_ACCT) - seq_puts(m, "," MNTOPT_UQUOTANOENF); + seq_puts(m, ",uqnoenforce"); if (mp->m_qflags & XFS_PQUOTA_ACCT) { if (mp->m_qflags & XFS_PQUOTA_ENFD) - seq_puts(m, "," MNTOPT_PRJQUOTA); + seq_puts(m, ",prjquota"); else - seq_puts(m, "," MNTOPT_PQUOTANOENF); + seq_puts(m, ",pqnoenforce"); } if (mp->m_qflags & XFS_GQUOTA_ACCT) { if (mp->m_qflags & XFS_GQUOTA_ENFD) - seq_puts(m, "," MNTOPT_GRPQUOTA); + seq_puts(m, ",grpquota"); else - seq_puts(m, "," MNTOPT_GQUOTANOENF); + seq_puts(m, ",gqnoenforce"); } if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) - seq_puts(m, "," MNTOPT_NOQUOTA); + seq_puts(m, ",noquota"); return 0; } @@ -572,23 +581,35 @@ xfs_max_file_offset( } /* - * xfs_set_inode32() and xfs_set_inode64() are passed an agcount - * 
@@ -572,23 +581,35 @@ xfs_max_file_offset(
 }
 
 /*
- * xfs_set_inode32() and xfs_set_inode64() are passed an agcount
- * because in the growfs case, mp->m_sb.sb_agcount is not updated
- * yet to the potentially higher ag count.
+ * Set parameters for inode allocation heuristics, taking into account
+ * filesystem size and inode32/inode64 mount options; i.e. specifically
+ * whether or not XFS_MOUNT_SMALL_INUMS is set.
+ *
+ * Inode allocation patterns are altered only if inode32 is requested
+ * (XFS_MOUNT_SMALL_INUMS), and the filesystem is sufficiently large.
+ * If altered, XFS_MOUNT_32BITINODES is set as well.
+ *
+ * An agcount independent of that in the mount structure is provided
+ * because in the growfs case, mp->m_sb.sb_agcount is not yet updated
+ * to the potentially higher ag count.
+ *
+ * Returns the maximum AG index which may contain inodes.
  */
 xfs_agnumber_t
-xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount)
+xfs_set_inode_alloc(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agcount)
 {
-	xfs_agnumber_t	index = 0;
+	xfs_agnumber_t	index;
 	xfs_agnumber_t	maxagi = 0;
 	xfs_sb_t	*sbp = &mp->m_sb;
 	xfs_agnumber_t	max_metadata;
 	xfs_agino_t	agino;
 	xfs_ino_t	ino;
-	xfs_perag_t	*pag;
 
-	/* Calculate how much should be reserved for inodes to meet
-	 * the max inode percentage.
+	/*
+	 * Calculate how much should be reserved for inodes to meet
+	 * the max inode percentage.  Used only for inode32.
 	 */
 	if (mp->m_maxicount) {
 		__uint64_t	icount;
@@ -602,54 +623,48 @@ xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount)
 		max_metadata = agcount;
 	}
 
+	/* Get the last possible inode in the filesystem */
 	agino =	XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
+	ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
+
+	/*
+	 * If user asked for no more than 32-bit inodes, and the fs is
+	 * sufficiently large, set XFS_MOUNT_32BITINODES if we must alter
+	 * the allocator to accommodate the request.
+	 */
+	if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
+		mp->m_flags |= XFS_MOUNT_32BITINODES;
+	else
+		mp->m_flags &= ~XFS_MOUNT_32BITINODES;
 
 	for (index = 0; index < agcount; index++) {
-		ino = XFS_AGINO_TO_INO(mp, index, agino);
+		struct xfs_perag	*pag;
 
-		if (ino > XFS_MAXINUMBER_32) {
-			pag = xfs_perag_get(mp, index);
-			pag->pagi_inodeok = 0;
-			pag->pagf_metadata = 0;
-			xfs_perag_put(pag);
-			continue;
-		}
+		ino = XFS_AGINO_TO_INO(mp, index, agino);
 
 		pag = xfs_perag_get(mp, index);
-		pag->pagi_inodeok = 1;
-		maxagi++;
-		if (index < max_metadata)
-			pag->pagf_metadata = 1;
-		xfs_perag_put(pag);
-	}
-	mp->m_flags |= (XFS_MOUNT_32BITINODES |
-			XFS_MOUNT_SMALL_INUMS);
-	return maxagi;
-}
-
-xfs_agnumber_t
-xfs_set_inode64(struct xfs_mount *mp, xfs_agnumber_t agcount)
-{
-	xfs_agnumber_t	index = 0;
-
-	for (index = 0; index < agcount; index++) {
-		struct xfs_perag	*pag;
+		if (mp->m_flags & XFS_MOUNT_32BITINODES) {
+			if (ino > XFS_MAXINUMBER_32) {
+				pag->pagi_inodeok = 0;
+				pag->pagf_metadata = 0;
+			} else {
+				pag->pagi_inodeok = 1;
+				maxagi++;
+				if (index < max_metadata)
+					pag->pagf_metadata = 1;
+				else
+					pag->pagf_metadata = 0;
+			}
+		} else {
+			pag->pagi_inodeok = 1;
+			pag->pagf_metadata = 0;
+		}
 
-		pag = xfs_perag_get(mp, index);
-		pag->pagi_inodeok = 1;
-		pag->pagf_metadata = 0;
 		xfs_perag_put(pag);
 	}
 
-	/* There is no need for lock protection on m_flags,
-	 * the rw_semaphore of the VFS superblock is locked
-	 * during mount/umount/remount operations, so this is
-	 * enough to avoid concurency on the m_flags field
-	 */
-	mp->m_flags &= ~(XFS_MOUNT_32BITINODES |
-			 XFS_MOUNT_SMALL_INUMS);
-	return index;
+	return (mp->m_flags & XFS_MOUNT_32BITINODES) ? maxagi : agcount;
 }
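To see why xfs_set_inode_alloc() compares the last possible inode against XFS_MAXINUMBER_32: XFS inode numbers pack the AG index and the AG-relative inode number into one integer, so the highest inode number grows with both AG count and AG size. A toy model of that decision with hypothetical field widths (the real shifts are derived from the superblock geometry, not hardcoded):

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: highest inode number for a given geometry, where
 * inodes_per_ag_bits stands in for the sb-derived agino width. */
static uint64_t max_ino(uint32_t agcount, unsigned inodes_per_ag_bits)
{
        return ((uint64_t)(agcount - 1) << inodes_per_ag_bits) |
               ((1ULL << inodes_per_ag_bits) - 1);
}

int main(void)
{
        /* e.g. 32 AGs with 2^30 inodes each: the top inode needs 35 bits */
        uint64_t ino = max_ino(32, 30);
        int small_inums = 1;    /* "inode32" was requested */

        if (small_inums && ino > UINT32_MAX)
                puts("32-bit allocator mode needed (XFS_MOUNT_32BITINODES)");
        else
                puts("all inodes fit in 32 bits; no restriction needed");
        return 0;
}

This also explains why the rewritten function is a no-op restriction-wise on small filesystems: if even the last inode fits in 32 bits, inode32 changes nothing.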
 
 STATIC int
@@ -1166,6 +1181,27 @@ xfs_quiesce_attr(
 }
 
 STATIC int
+xfs_test_remount_options(
+	struct super_block	*sb,
+	struct xfs_mount	*mp,
+	char			*options)
+{
+	int error = 0;
+	struct xfs_mount *tmp_mp;
+
+	tmp_mp = kmem_zalloc(sizeof(*tmp_mp), KM_MAYFAIL);
+	if (!tmp_mp)
+		return -ENOMEM;
+
+	tmp_mp->m_super = sb;
+	error = xfs_parseargs(tmp_mp, options);
+	xfs_free_fsname(tmp_mp);
+	kfree(tmp_mp);
+
+	return error;
+}
+
+STATIC int
 xfs_fs_remount(
 	struct super_block *sb,
 	int *flags,
@@ -1177,6 +1213,11 @@ xfs_fs_remount(
 	char			*p;
 	int			error;
 
+	/* First, check for complete junk; i.e. invalid options */
+	error = xfs_test_remount_options(sb, mp, options);
+	if (error)
+		return error;
+
 	sync_filesystem(sb);
 	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
@@ -1193,10 +1234,12 @@ xfs_fs_remount(
 			mp->m_flags &= ~XFS_MOUNT_BARRIER;
 			break;
 		case Opt_inode64:
-			mp->m_maxagi = xfs_set_inode64(mp, sbp->sb_agcount);
+			mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
+			mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
 			break;
 		case Opt_inode32:
-			mp->m_maxagi = xfs_set_inode32(mp, sbp->sb_agcount);
+			mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
+			mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
 			break;
 		default:
 			/*
@@ -1344,9 +1387,8 @@ xfs_finish_flags(
 	 */
 	if (xfs_sb_version_hascrc(&mp->m_sb) &&
 	    (mp->m_flags & XFS_MOUNT_NOATTR2)) {
-		xfs_warn(mp,
-"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.",
-			MNTOPT_NOATTR2, MNTOPT_ATTR2);
+		xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. "
+			     "attr2 is always enabled for V5 filesystems.");
 		return -EINVAL;
 	}
 
@@ -1817,6 +1859,8 @@ init_xfs_fs(void)
 {
 	int			error;
 
+	xfs_check_ondisk_structs();
+
 	printk(KERN_INFO XFS_VERSION_STRING " with "
 			 XFS_BUILD_OPTIONS " enabled\n");
 
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 499058fea303..2dfb1ce4585f 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -65,8 +65,8 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
 
 extern void xfs_flush_inodes(struct xfs_mount *mp);
 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
-extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *, xfs_agnumber_t agcount);
-extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *, xfs_agnumber_t agcount);
+extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
+					  xfs_agnumber_t agcount);
 
 extern const struct export_operations xfs_export_operations;
 extern const struct xattr_handler *xfs_xattr_handlers[];
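xfs_test_remount_options() above validates the new option string against a throwaway xfs_mount before the real remount parse runs, so invalid input cannot leave the live mount half-modified. The same validate-then-commit shape in a self-contained sketch (all names illustrative, not XFS API):

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Illustrative config with a single boolean option. */
struct config { int discard; };

static int parse_into(struct config *cfg, const char *opts)
{
        if (!strcmp(opts, "discard")) {
                cfg->discard = 1;
                return 0;
        }
        if (!strcmp(opts, "nodiscard")) {
                cfg->discard = 0;
                return 0;
        }
        return -EINVAL;
}

static int remount(struct config *live, const char *opts)
{
        struct config scratch = *live;  /* throwaway, like tmp_mp */

        if (parse_into(&scratch, opts))
                return -EINVAL;         /* reject junk before touching *live */
        return parse_into(live, opts);
}

int main(void)
{
        struct config live = { 0 };

        printf("%d\n", remount(&live, "bogus"));        /* -22, live untouched */
        printf("%d\n", remount(&live, "discard"));      /* 0 */
        return 0;
}

The const-qualified *sb in xfs_parseargs() serves the same goal: the shared parser is structurally prevented from mutating the live superblock during the dry run.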
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 641d625eb334..6ced4f143494 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -18,10 +18,13 @@
 
 #include "xfs.h"
 #include "xfs_sysfs.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_log.h"
 #include "xfs_log_priv.h"
 #include "xfs_stats.h"
+#include "xfs_mount.h"
 
 struct xfs_sysfs_attr {
 	struct attribute	attr;
@@ -45,16 +48,6 @@ to_attr(struct attribute *attr)
 
 #define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr
 
-/*
- * xfs_mount kobject. This currently has no attributes and thus no need for show
- * and store helpers. The mp kobject serves as the per-mount parent object that
- * is identified by the fsname under sysfs.
- */
-
-struct kobj_type xfs_mp_ktype = {
-	.release = xfs_sysfs_release,
-};
-
 STATIC ssize_t
 xfs_sysfs_object_show(
 	struct kobject		*kobject,
@@ -83,6 +76,71 @@ static const struct sysfs_ops xfs_sysfs_ops = {
 	.store = xfs_sysfs_object_store,
 };
 
+/*
+ * xfs_mount kobject. The mp kobject also serves as the per-mount parent object
+ * that is identified by the fsname under sysfs.
+ */
+
+static inline struct xfs_mount *
+to_mp(struct kobject *kobject)
+{
+	struct xfs_kobj *kobj = to_kobj(kobject);
+
+	return container_of(kobj, struct xfs_mount, m_kobj);
+}
+
+#ifdef DEBUG
+
+STATIC ssize_t
+fail_writes_store(
+	struct kobject		*kobject,
+	const char		*buf,
+	size_t			count)
+{
+	struct xfs_mount	*mp = to_mp(kobject);
+	int			ret;
+	int			val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	if (val == 1)
+		mp->m_fail_writes = true;
+	else if (val == 0)
+		mp->m_fail_writes = false;
+	else
+		return -EINVAL;
+
+	return count;
+}
+
+STATIC ssize_t
+fail_writes_show(
+	struct kobject		*kobject,
+	char			*buf)
+{
+	struct xfs_mount	*mp = to_mp(kobject);
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_writes ? 1 : 0);
+}
+XFS_SYSFS_ATTR_RW(fail_writes);
+
+#endif /* DEBUG */
+
+static struct attribute *xfs_mp_attrs[] = {
+#ifdef DEBUG
+	ATTR_LIST(fail_writes),
+#endif
+	NULL,
+};
+
+struct kobj_type xfs_mp_ktype = {
+	.release = xfs_sysfs_release,
+	.sysfs_ops = &xfs_sysfs_ops,
+	.default_attrs = xfs_mp_attrs,
+};
+
 #ifdef DEBUG
 /* debug */
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 391d797cb53f..c8d58426008e 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1296,11 +1296,7 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1340,6 +1336,9 @@ DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
 DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
 DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
 DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
 
 DECLARE_EVENT_CLASS(xfs_itrunc_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 748b16aff45a..20c53666cb4b 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1028,6 +1028,8 @@ __xfs_trans_roll(
 	struct xfs_trans_res	tres;
 	int			error;
 
+	*committed = 0;
+
 	/*
 	 * Ensure that the inode is always logged.
 	 */
@@ -1082,6 +1084,6 @@ xfs_trans_roll(
 	struct xfs_trans	**tpp,
 	struct xfs_inode	*dp)
 {
-	int			committed = 0;
+	int			committed;
 
 	return __xfs_trans_roll(tpp, dp, &committed);
 }
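The *committed = 0 hunk above fixes an out-parameter that was not assigned on every path; xfs_trans_roll() now deliberately passes an uninitialized local and relies on the callee to set it. A minimal sketch of why initializing at the top of the callee matters (names illustrative):

#include <stdio.h>

/* Illustrative analog of the __xfs_trans_roll() fix: an out-parameter
 * must be assigned on every path, or a caller that passes an
 * uninitialized local reads indeterminate data on early exit. */
static int roll(int fail_early, int *committed)
{
        *committed = 0;         /* the fix: initialize up front */
        if (fail_early)
                return -1;      /* early return no longer leaves junk */
        *committed = 1;
        return 0;
}

int main(void)
{
        int committed;          /* uninitialized, as in the new caller */

        if (roll(1, &committed))
                printf("failed, committed=%d\n", committed);    /* 0, defined */
        return 0;
}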
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 4643070d7cae..e7c49cf43fbc 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -133,7 +133,6 @@ typedef struct xfs_trans {
  * XFS transaction mechanism exported interfaces that are
  * actually macros.
  */
-#define	xfs_trans_get_block_res(tp)	((tp)->t_blk_res)
 #define	xfs_trans_set_sync(tp)		((tp)->t_flags |= XFS_TRANS_SYNC)
 
 #if defined(DEBUG) || defined(XFS_WARN)
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 4f18fd92ca13..d6c9c3e9e02b 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -497,6 +497,7 @@ xfsaild(
 	long		tout = 0;	/* milliseconds */
 
 	current->flags |= PF_MEMALLOC;
+	set_freezable();
 
 	while (!kthread_should_stop()) {
 		if (tout && tout <= 20)
@@ -519,14 +520,14 @@ xfsaild(
 		if (!xfs_ail_min(ailp) &&
 		    ailp->xa_target == ailp->xa_target_prev) {
 			spin_unlock(&ailp->xa_lock);
-			schedule();
+			freezable_schedule();
 			tout = 0;
 			continue;
 		}
 		spin_unlock(&ailp->xa_lock);
 
 		if (tout)
-			schedule_timeout(msecs_to_jiffies(tout));
+			freezable_schedule_timeout(msecs_to_jiffies(tout));
 
 		__set_current_state(TASK_RUNNING);
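The xfsaild() change above only swaps schedule() for its freezable variants so the suspend path can freeze the thread; the loop structure, idle when the AIL is empty and wake on demand, is unchanged. A userspace analogy of that idle/wakeup structure using pthreads (freezing itself has no userspace equivalent; this only mirrors the loop shape, and all names are invented):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool have_work, should_stop;

static void *worker(void *arg)
{
        pthread_mutex_lock(&lock);
        while (!should_stop) {                          /* ~kthread_should_stop() */
                if (!have_work) {
                        pthread_cond_wait(&cond, &lock); /* ~freezable_schedule() */
                        continue;
                }
                have_work = false;
                pthread_mutex_unlock(&lock);
                puts("pushing work");                   /* ~AIL push */
                pthread_mutex_lock(&lock);
        }
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, worker, NULL);

        pthread_mutex_lock(&lock);
        have_work = true;
        pthread_cond_signal(&cond);                     /* ~wake_up_process() */
        pthread_mutex_unlock(&lock);

        sleep(1);
        pthread_mutex_lock(&lock);
        should_stop = true;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
        pthread_join(t, NULL);
        return 0;
}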
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 75798412859a..8ee29ca132dc 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -155,7 +155,7 @@ xfs_trans_get_buf_map(
 		ASSERT(xfs_buf_islocked(bp));
 		if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) {
 			xfs_buf_stale(bp);
-			XFS_BUF_DONE(bp);
+			bp->b_flags |= XBF_DONE;
 		}
 
 		ASSERT(bp->b_transp == tp);
@@ -518,7 +518,7 @@ xfs_trans_log_buf(xfs_trans_t	*tp,
 	 * inside the b_bdstrat callback so that this won't get written to
 	 * disk.
 	 */
-	XFS_BUF_DONE(bp);
+	bp->b_flags |= XBF_DONE;
 
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	bp->b_iodone = xfs_buf_iodone_callbacks;
@@ -534,8 +534,8 @@ xfs_trans_log_buf(xfs_trans_t	*tp,
 	 */
 	if (bip->bli_flags & XFS_BLI_STALE) {
 		bip->bli_flags &= ~XFS_BLI_STALE;
-		ASSERT(XFS_BUF_ISSTALE(bp));
-		XFS_BUF_UNSTALE(bp);
+		ASSERT(bp->b_flags & XBF_STALE);
+		bp->b_flags &= ~XBF_STALE;
 		bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL;
 	}
 
@@ -600,7 +600,7 @@ xfs_trans_binval(
 		 * If the buffer is already invalidated, then
 		 * just return.
 		 */
-		ASSERT(XFS_BUF_ISSTALE(bp));
+		ASSERT(bp->b_flags & XBF_STALE);
 		ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
 		ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF));
 		ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK));
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 995170194df0..c3d547211d16 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -609,17 +609,20 @@ xfs_trans_dqresv(
 	xfs_qcnt_t	total_count;
 	xfs_qcnt_t	*resbcountp;
 	xfs_quotainfo_t	*q = mp->m_quotainfo;
+	struct xfs_def_quota	*defq;
 
 
 	xfs_dqlock(dqp);
 
+	defq = xfs_get_defquota(dqp, q);
+
 	if (flags & XFS_TRANS_DQ_RES_BLKS) {
 		hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
 		if (!hardlimit)
-			hardlimit = q->qi_bhardlimit;
+			hardlimit = defq->bhardlimit;
 		softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit);
 		if (!softlimit)
-			softlimit = q->qi_bsoftlimit;
+			softlimit = defq->bsoftlimit;
 		timer = be32_to_cpu(dqp->q_core.d_btimer);
 		warns = be16_to_cpu(dqp->q_core.d_bwarns);
 		warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
@@ -628,10 +631,10 @@ xfs_trans_dqresv(
 		ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
 		hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit);
 		if (!hardlimit)
-			hardlimit = q->qi_rtbhardlimit;
+			hardlimit = defq->rtbhardlimit;
 		softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit);
 		if (!softlimit)
-			softlimit = q->qi_rtbsoftlimit;
+			softlimit = defq->rtbsoftlimit;
 		timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
 		warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
 		warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
@@ -672,10 +675,10 @@ xfs_trans_dqresv(
 			warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
 			hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
 			if (!hardlimit)
-				hardlimit = q->qi_ihardlimit;
+				hardlimit = defq->ihardlimit;
 			softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
 			if (!softlimit)
-				softlimit = q->qi_isoftlimit;
+				softlimit = defq->isoftlimit;
 
 			if (hardlimit && total_count > hardlimit) {
 				xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index b97f1df910ab..11a3af08b5c7 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -75,18 +75,10 @@ xfs_trans_ichgtime(
 
 	tv = current_fs_time(inode->i_sb);
 
-	if ((flags & XFS_ICHGTIME_MOD) &&
-	    !timespec_equal(&inode->i_mtime, &tv)) {
+	if (flags & XFS_ICHGTIME_MOD)
 		inode->i_mtime = tv;
-		ip->i_d.di_mtime.t_sec = tv.tv_sec;
-		ip->i_d.di_mtime.t_nsec = tv.tv_nsec;
-	}
-	if ((flags & XFS_ICHGTIME_CHG) &&
-	    !timespec_equal(&inode->i_ctime, &tv)) {
+	if (flags & XFS_ICHGTIME_CHG)
 		inode->i_ctime = tv;
-		ip->i_d.di_ctime.t_sec = tv.tv_sec;
-		ip->i_d.di_ctime.t_nsec = tv.tv_nsec;
-	}
 }
 
 /*
@@ -125,7 +117,7 @@ xfs_trans_log_inode(
 	 */
 	if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
 	    IS_I_VERSION(VFS_I(ip))) {
-		ip->i_d.di_changecount = ++VFS_I(ip)->i_version;
+		VFS_I(ip)->i_version++;
 		flags |= XFS_ILOG_CORE;
 	}
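Finally, the xfs_trans_dqresv() hunks replace one global set of default limits with a per-quota-type default (user, group, project) fetched once via xfs_get_defquota(). The fallback rule itself is unchanged: a zero per-dquot limit means "use the default". A compact illustration of that rule (values and names invented for the example):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

enum qtype { QT_USER, QT_GROUP, QT_PROJ };

/* Illustrative per-quota-type defaults, like struct xfs_def_quota. */
struct def_quota { uint64_t bhardlimit; };

static const struct def_quota defaults[] = {
        [QT_USER]  = { .bhardlimit = 1000 },
        [QT_GROUP] = { .bhardlimit = 2000 },
        [QT_PROJ]  = { .bhardlimit = 3000 },
};

static uint64_t effective_bhardlimit(enum qtype type, uint64_t per_dquot)
{
        /* mirrors: if (!hardlimit) hardlimit = defq->bhardlimit; */
        return per_dquot ? per_dquot : defaults[type].bhardlimit;
}

int main(void)
{
        printf("%" PRIu64 "\n", effective_bhardlimit(QT_GROUP, 0));   /* 2000 */
        printf("%" PRIu64 "\n", effective_bhardlimit(QT_USER, 500));  /* 500 */
        return 0;
}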