diff options
Diffstat (limited to 'fs/xfs')
30 files changed, 254 insertions, 593 deletions
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 91f51637f8af..f7dda0c237b0 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -51,7 +51,7 @@ int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); -int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); +int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); /* diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 29ffc0569ce1..2760bc3b2536 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4850,8 +4850,11 @@ xfs_bmap_del_extent_delay( * sb counters as we might have to borrow some blocks for the * indirect block accounting. */ - xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del->br_blockcount), 0, + error = xfs_trans_reserve_quota_nblks(NULL, ip, + -((long)del->br_blockcount), 0, isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + if (error) + return error; ip->i_delayed_blks -= del->br_blockcount; if (whichfork == XFS_COW_FORK) diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index becc926c3e3d..0197590fa7d7 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -157,6 +157,9 @@ extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); +extern void xfs_dir2_data_freescan_int(struct xfs_da_geometry *geo, + const struct xfs_dir_ops *ops, + struct xfs_dir2_data_hdr *hdr, int *loghead); extern void xfs_dir2_data_freescan(struct xfs_inode *dp, struct xfs_dir2_data_hdr *hdr, int *loghead); extern void xfs_dir2_data_log_entry(struct xfs_da_args *args, @@ -177,6 +180,8 @@ extern struct xfs_dir2_data_free *xfs_dir2_data_freefind( struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup); +extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); + extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index e526f5a5f0be..d478065b9544 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -505,8 +505,9 @@ xfs_dir2_data_freeremove( * Given a data block, reconstruct its bestfree map. */ void -xfs_dir2_data_freescan( - struct xfs_inode *dp, +xfs_dir2_data_freescan_int( + struct xfs_da_geometry *geo, + const struct xfs_dir_ops *ops, struct xfs_dir2_data_hdr *hdr, int *loghead) { @@ -516,7 +517,6 @@ xfs_dir2_data_freescan( struct xfs_dir2_data_free *bf; char *endp; /* end of block's data */ char *p; /* current entry pointer */ - struct xfs_da_geometry *geo = dp->i_mount->m_dir_geo; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || @@ -526,13 +526,13 @@ xfs_dir2_data_freescan( /* * Start by clearing the table. */ - bf = dp->d_ops->data_bestfree_p(hdr); + bf = ops->data_bestfree_p(hdr); memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT); *loghead = 1; /* * Set up pointers. */ - p = (char *)dp->d_ops->data_entry_p(hdr); + p = (char *)ops->data_entry_p(hdr); if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { btp = xfs_dir2_block_tail_p(geo, hdr); @@ -559,12 +559,22 @@ xfs_dir2_data_freescan( else { dep = (xfs_dir2_data_entry_t *)p; ASSERT((char *)dep - (char *)hdr == - be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep))); - p += dp->d_ops->data_entsize(dep->namelen); + be16_to_cpu(*ops->data_entry_tag_p(dep))); + p += ops->data_entsize(dep->namelen); } } } +void +xfs_dir2_data_freescan( + struct xfs_inode *dp, + struct xfs_dir2_data_hdr *hdr, + int *loghead) +{ + return xfs_dir2_data_freescan_int(dp->i_mount->m_dir_geo, dp->d_ops, + hdr, loghead); +} + /* * Initialize a data block at the given block number in the directory. * Give back the buffer for the created block. diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index ef9f6ead96a4..d04547fcf274 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -21,7 +21,6 @@ struct dir_context; /* xfs_dir2.c */ -extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, xfs_dir2_db_t *dbp); extern int xfs_dir_cilookup_result(struct xfs_da_args *args, diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index d45c03779dae..f272abff11e1 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2344,7 +2344,8 @@ xfs_imap( imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); imap->im_len = XFS_FSB_TO_BB(mp, 1); - imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + imap->im_boffset = (unsigned short)(offset << + mp->m_sb.sb_inodelog); return 0; } @@ -2372,7 +2373,7 @@ out_map: imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno); imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); - imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); /* * If the inode number maps to a block outside the bounds diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 9004665d679a..dd483e2767f7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -383,7 +383,7 @@ xfs_log_dinode_to_disk( static bool xfs_dinode_verify( struct xfs_mount *mp, - struct xfs_inode *ip, + xfs_ino_t ino, struct xfs_dinode *dip) { uint16_t flags; @@ -409,7 +409,7 @@ xfs_dinode_verify( if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF)) return false; - if (be64_to_cpu(dip->di_ino) != ip->i_ino) + if (be64_to_cpu(dip->di_ino) != ino) return false; if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) return false; @@ -501,7 +501,7 @@ xfs_iread( return error; /* even unallocated inodes are verified */ - if (!xfs_dinode_verify(mp, ip, dip)) { + if (!xfs_dinode_verify(mp, ip->i_ino, dip)) { xfs_alert(mp, "%s: validation failed for inode %lld failed", __func__, ip->i_ino); diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 3cfe12a4f58a..6848a0afbce7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -58,8 +58,8 @@ struct xfs_icdinode { */ struct xfs_imap { xfs_daddr_t im_blkno; /* starting BB of inode chunk */ - ushort im_len; /* length in BBs of inode chunk */ - ushort im_boffset; /* inode offset in block in bytes */ + unsigned short im_len; /* length in BBs of inode chunk */ + unsigned short im_boffset; /* inode offset in block in bytes */ }; int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 083cdd6d6c28..7ae571f8e34a 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -481,8 +481,8 @@ static inline uint xfs_log_dinode_size(int version) typedef struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ unsigned short blf_size; /* size of this item */ - ushort blf_flags; /* misc state */ - ushort blf_len; /* number of blocks in this buf */ + unsigned short blf_flags; /* misc state */ + unsigned short blf_len; /* number of blocks in this buf */ __int64_t blf_blkno; /* starting blkno of this buf */ unsigned int blf_map_size; /* used size of data bitmap in words */ unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 8e385f91d660..d9f65e2d5cc8 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -52,7 +52,7 @@ typedef struct xlog_recover { struct list_head r_itemq; /* q for items */ } xlog_recover_t; -#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr) +#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr) /* * This is the number of entries in the l_buf_cancel_table used during diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index e2e1106c9fad..ea45584a9913 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1016,4 +1016,3 @@ xfs_rtfree_extent( } return 0; } - diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a70aec910626..2580262e4ea0 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -262,6 +262,12 @@ xfs_mount_validate_sb( return -EFSCORRUPTED; } + if (xfs_sb_version_hascrc(&mp->m_sb) && + sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) { + xfs_notice(mp, "v5 SB sanity check failed"); + return -EFSCORRUPTED; + } + /* * Until this is fixed only page-sized or smaller data blocks work. */ @@ -338,13 +344,16 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp) XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD; sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD); - if (sbp->sb_qflags & XFS_PQUOTA_ACCT) { + if (sbp->sb_qflags & XFS_PQUOTA_ACCT && + sbp->sb_gquotino != NULLFSINO) { /* * In older version of superblock, on-disk superblock only * has sb_gquotino, and in-core superblock has both sb_gquotino * and sb_pquotino. But, only one of them is supported at any * point of time. So, if PQUOTA is set in disk superblock, - * copy over sb_gquotino to sb_pquotino. + * copy over sb_gquotino to sb_pquotino. The NULLFSINO test + * above is to make sure we don't do this twice and wipe them + * both out! */ sbp->sb_pquotino = sbp->sb_gquotino; sbp->sb_gquotino = NULLFSINO; diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index f9a1076de911..717909f2f7b7 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -74,11 +74,14 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ * Minimum and maximum blocksize and sectorsize. * The blocksize upper limit is pretty much arbitrary. * The sectorsize upper limit is due to sizeof(sb_sectsize). + * CRC enable filesystems use 512 byte inodes, meaning 512 byte block sizes + * cannot be used. */ #define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */ #define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */ #define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG) #define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG) +#define XFS_MIN_CRC_BLOCKSIZE (1 << (XFS_MIN_BLOCKSIZE_LOG + 1)) #define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */ #define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */ #define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4923b54da556..265000a09327 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -37,11 +37,6 @@ #include <linux/pagevec.h> #include <linux/writeback.h> -/* flags for direct write completions */ -#define XFS_DIO_FLAG_UNWRITTEN (1 << 0) -#define XFS_DIO_FLAG_APPEND (1 << 1) -#define XFS_DIO_FLAG_COW (1 << 2) - /* * structure owned by writepages passed to individual writepage calls */ @@ -1176,45 +1171,6 @@ xfs_vm_releasepage( } /* - * When we map a DIO buffer, we may need to pass flags to - * xfs_end_io_direct_write to tell it what kind of write IO we are doing. - * - * Note that for DIO, an IO to the highest supported file block offset (i.e. - * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64 - * bit variable. Hence if we see this overflow, we have to assume that the IO is - * extending the file size. We won't know for sure until IO completion is run - * and the actual max write offset is communicated to the IO completion - * routine. - */ -static void -xfs_map_direct( - struct inode *inode, - struct buffer_head *bh_result, - struct xfs_bmbt_irec *imap, - xfs_off_t offset, - bool is_cow) -{ - uintptr_t *flags = (uintptr_t *)&bh_result->b_private; - xfs_off_t size = bh_result->b_size; - - trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size, - ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW : - XFS_IO_OVERWRITE, imap); - - if (ISUNWRITTEN(imap)) { - *flags |= XFS_DIO_FLAG_UNWRITTEN; - set_buffer_defer_completion(bh_result); - } else if (is_cow) { - *flags |= XFS_DIO_FLAG_COW; - set_buffer_defer_completion(bh_result); - } - if (offset + size > i_size_read(inode) || offset + size < 0) { - *flags |= XFS_DIO_FLAG_APPEND; - set_buffer_defer_completion(bh_result); - } -} - -/* * If this is O_DIRECT or the mpage code calling tell them how large the mapping * is, so that we can avoid repeated get_blocks calls. * @@ -1254,52 +1210,12 @@ xfs_map_trim_size( bh_result->b_size = mapping_size; } -/* Bounce unaligned directio writes to the page cache. */ static int -xfs_bounce_unaligned_dio_write( - struct xfs_inode *ip, - xfs_fileoff_t offset_fsb, - struct xfs_bmbt_irec *imap) -{ - struct xfs_bmbt_irec irec; - xfs_fileoff_t delta; - bool shared; - bool x; - int error; - - irec = *imap; - if (offset_fsb > irec.br_startoff) { - delta = offset_fsb - irec.br_startoff; - irec.br_blockcount -= delta; - irec.br_startblock += delta; - irec.br_startoff = offset_fsb; - } - error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x); - if (error) - return error; - - /* - * We're here because we're trying to do a directio write to a - * region that isn't aligned to a filesystem block. If any part - * of the extent is shared, fall back to buffered mode to handle - * the RMW. This is done by returning -EREMCHG ("remote addr - * changed"), which is caught further up the call stack. - */ - if (shared) { - trace_xfs_reflink_bounce_dio_write(ip, imap); - return -EREMCHG; - } - return 0; -} - -STATIC int -__xfs_get_blocks( +xfs_get_blocks( struct inode *inode, sector_t iblock, struct buffer_head *bh_result, - int create, - bool direct, - bool dax_fault) + int create) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1310,10 +1226,8 @@ __xfs_get_blocks( int nimaps = 1; xfs_off_t offset; ssize_t size; - int new = 0; - bool is_cow = false; - BUG_ON(create && !direct); + BUG_ON(create); if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1322,7 +1236,7 @@ __xfs_get_blocks( ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; - if (!create && offset >= i_size_read(inode)) + if (offset >= i_size_read(inode)) return 0; /* @@ -1337,73 +1251,12 @@ __xfs_get_blocks( end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); - if (create && direct && xfs_is_reflink_inode(ip)) { - is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap); - ASSERT(!is_cow || !isnullstartblock(imap.br_startblock)); - } - - if (!is_cow) { - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, - &imap, &nimaps, XFS_BMAPI_ENTIRE); - /* - * Truncate an overwrite extent if there's a pending CoW - * reservation before the end of this extent. This - * forces us to come back to get_blocks to take care of - * the CoW. - */ - if (create && direct && nimaps && - imap.br_startblock != HOLESTARTBLOCK && - imap.br_startblock != DELAYSTARTBLOCK && - !ISUNWRITTEN(&imap)) - xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, - &imap); - } + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, XFS_BMAPI_ENTIRE); if (error) goto out_unlock; - /* - * The only time we can ever safely find delalloc blocks on direct I/O - * is a dio write to post-eof speculative preallocation. All other - * scenarios are indicative of a problem or misuse (such as mixing - * direct and mapped I/O). - * - * The file may be unmapped by the time we get here so we cannot - * reliably fail the I/O based on mapping. Instead, fail the I/O if this - * is a read or a write within eof. Otherwise, carry on but warn as a - * precuation if the file happens to be mapped. - */ - if (direct && imap.br_startblock == DELAYSTARTBLOCK) { - if (!create || offset < i_size_read(VFS_I(ip))) { - WARN_ON_ONCE(1); - error = -EIO; - goto out_unlock; - } - WARN_ON_ONCE(mapping_mapped(VFS_I(ip)->i_mapping)); - } - - /* for DAX, we convert unwritten extents directly */ - if (create && - (!nimaps || - (imap.br_startblock == HOLESTARTBLOCK || - imap.br_startblock == DELAYSTARTBLOCK) || - (IS_DAX(inode) && ISUNWRITTEN(&imap)))) { - /* - * xfs_iomap_write_direct() expects the shared lock. It - * is unlocked on return. - */ - if (lockmode == XFS_ILOCK_EXCL) - xfs_ilock_demote(ip, lockmode); - - error = xfs_iomap_write_direct(ip, offset, size, - &imap, nimaps); - if (error) - return error; - new = 1; - - trace_xfs_get_blocks_alloc(ip, offset, size, - ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN - : XFS_IO_DELALLOC, &imap); - } else if (nimaps) { + if (nimaps) { trace_xfs_get_blocks_found(ip, offset, size, ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap); @@ -1413,12 +1266,6 @@ __xfs_get_blocks( goto out_unlock; } - if (IS_DAX(inode) && create) { - ASSERT(!ISUNWRITTEN(&imap)); - /* zeroing is not needed at a higher layer */ - new = 0; - } - /* trim mapping down to size requested */ xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size); @@ -1428,48 +1275,14 @@ __xfs_get_blocks( */ if (imap.br_startblock != HOLESTARTBLOCK && imap.br_startblock != DELAYSTARTBLOCK && - (create || !ISUNWRITTEN(&imap))) { - if (create && direct && !is_cow) { - error = xfs_bounce_unaligned_dio_write(ip, offset_fsb, - &imap); - if (error) - return error; - } - + !ISUNWRITTEN(&imap)) xfs_map_buffer(inode, bh_result, &imap, offset); - if (ISUNWRITTEN(&imap)) - set_buffer_unwritten(bh_result); - /* direct IO needs special help */ - if (create) { - if (dax_fault) - ASSERT(!ISUNWRITTEN(&imap)); - else - xfs_map_direct(inode, bh_result, &imap, offset, - is_cow); - } - } /* * If this is a realtime file, data may be on a different device. * to that pointed to from the buffer_head b_bdev currently. */ bh_result->b_bdev = xfs_find_bdev_for_inode(inode); - - /* - * If we previously allocated a block out beyond eof and we are now - * coming back to use it then we will need to flag it as new even if it - * has a disk address. - * - * With sub-block writes into unwritten extents we also need to mark - * the buffer as new so that the unwritten parts of the buffer gets - * correctly zeroed. - */ - if (create && - ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || - (offset >= i_size_read(inode)) || - (new || ISUNWRITTEN(&imap)))) - set_buffer_new(bh_result); - return 0; out_unlock: @@ -1477,110 +1290,6 @@ out_unlock: return error; } -int -xfs_get_blocks( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - return __xfs_get_blocks(inode, iblock, bh_result, create, false, false); -} - -int -xfs_get_blocks_direct( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - return __xfs_get_blocks(inode, iblock, bh_result, create, true, false); -} - -int -xfs_get_blocks_dax_fault( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - return __xfs_get_blocks(inode, iblock, bh_result, create, true, true); -} - -/* - * Complete a direct I/O write request. - * - * xfs_map_direct passes us some flags in the private data to tell us what to - * do. If no flags are set, then the write IO is an overwrite wholly within - * the existing allocated file size and so there is nothing for us to do. - * - * Note that in this case the completion can be called in interrupt context, - * whereas if we have flags set we will always be called in task context - * (i.e. from a workqueue). - */ -int -xfs_end_io_direct_write( - struct kiocb *iocb, - loff_t offset, - ssize_t size, - void *private) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct xfs_inode *ip = XFS_I(inode); - uintptr_t flags = (uintptr_t)private; - int error = 0; - - trace_xfs_end_io_direct_write(ip, offset, size); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - if (size <= 0) - return size; - - /* - * The flags tell us whether we are doing unwritten extent conversions - * or an append transaction that updates the on-disk file size. These - * cases are the only cases where we should *potentially* be needing - * to update the VFS inode size. - */ - if (flags == 0) { - ASSERT(offset + size <= i_size_read(inode)); - return 0; - } - - /* - * We need to update the in-core inode size here so that we don't end up - * with the on-disk inode size being outside the in-core inode size. We - * have no other method of updating EOF for AIO, so always do it here - * if necessary. - * - * We need to lock the test/set EOF update as we can be racing with - * other IO completions here to update the EOF. Failing to serialise - * here can result in EOF moving backwards and Bad Things Happen when - * that occurs. - */ - spin_lock(&ip->i_flags_lock); - if (offset + size > i_size_read(inode)) - i_size_write(inode, offset + size); - spin_unlock(&ip->i_flags_lock); - - if (flags & XFS_DIO_FLAG_COW) - error = xfs_reflink_end_cow(ip, offset, size); - if (flags & XFS_DIO_FLAG_UNWRITTEN) { - trace_xfs_end_io_direct_write_unwritten(ip, offset, size); - - error = xfs_iomap_write_unwritten(ip, offset, size); - } - if (flags & XFS_DIO_FLAG_APPEND) { - trace_xfs_end_io_direct_write_append(ip, offset, size); - - error = xfs_setfilesize(ip, offset, size); - } - - return error; -} - STATIC ssize_t xfs_vm_direct_IO( struct kiocb *iocb, @@ -1601,7 +1310,6 @@ xfs_vm_bmap( struct xfs_inode *ip = XFS_I(inode); trace_xfs_vm_bmap(XFS_I(inode)); - xfs_ilock(ip, XFS_IOLOCK_SHARED); /* * The swap code (ab-)uses ->bmap to get a block mapping and then @@ -1609,12 +1317,10 @@ xfs_vm_bmap( * that on reflinks inodes, so we have to skip out here. And yes, * 0 is the magic code for a bmap error.. */ - if (xfs_is_reflink_inode(ip)) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); + if (xfs_is_reflink_inode(ip)) return 0; - } + filemap_write_and_wait(mapping); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); return generic_block_bmap(mapping, block, xfs_get_blocks); } diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index b3c6634f9518..cc174ec6c2fd 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -55,15 +55,6 @@ struct xfs_ioend { extern const struct address_space_operations xfs_address_space_operations; -int xfs_get_blocks(struct inode *inode, sector_t offset, - struct buffer_head *map_bh, int create); -int xfs_get_blocks_direct(struct inode *inode, sector_t offset, - struct buffer_head *map_bh, int create); -int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, - struct buffer_head *map_bh, int create); - -int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private); int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); extern void xfs_count_page_state(struct page *, int *, int *); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 0670a8bd5818..b9abce524c33 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1935,8 +1935,8 @@ xfs_swap_extents( * page cache safely. Once we have done this we can take the ilocks and * do the rest of the checks. */ - lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + lock_two_nondirectories(VFS_I(ip), VFS_I(tip)); + lock_flags = XFS_MMAPLOCK_EXCL; xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); /* Verify that both files have the same format */ @@ -2076,15 +2076,13 @@ xfs_swap_extents( trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); +out_unlock: xfs_iunlock(ip, lock_flags); xfs_iunlock(tip, lock_flags); + unlock_two_nondirectories(VFS_I(ip), VFS_I(tip)); return error; out_trans_cancel: xfs_trans_cancel(tp); - -out_unlock: - xfs_iunlock(ip, lock_flags); - xfs_iunlock(tip, lock_flags); - return error; + goto out_unlock; } diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 29816981b50a..003a99b83bd8 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -677,7 +677,6 @@ xfs_readdir( args.dp = dp; args.geo = dp->i_mount->m_dir_geo; - xfs_ilock(dp, XFS_IOLOCK_SHARED); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_getdents(&args, ctx); else if ((rval = xfs_dir2_isblock(&args, &v))) @@ -686,7 +685,6 @@ xfs_readdir( rval = xfs_dir2_block_getdents(&args, ctx); else rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); - xfs_iunlock(dp, XFS_IOLOCK_SHARED); return rval; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9a5d64b5f35a..b837ad8a63e4 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -48,40 +48,6 @@ static const struct vm_operations_struct xfs_file_vm_ops; /* - * Locking primitives for read and write IO paths to ensure we consistently use - * and order the inode->i_mutex, ip->i_lock and ip->i_iolock. - */ -static inline void -xfs_rw_ilock( - struct xfs_inode *ip, - int type) -{ - if (type & XFS_IOLOCK_EXCL) - inode_lock(VFS_I(ip)); - xfs_ilock(ip, type); -} - -static inline void -xfs_rw_iunlock( - struct xfs_inode *ip, - int type) -{ - xfs_iunlock(ip, type); - if (type & XFS_IOLOCK_EXCL) - inode_unlock(VFS_I(ip)); -} - -static inline void -xfs_rw_ilock_demote( - struct xfs_inode *ip, - int type) -{ - xfs_ilock_demote(ip, type); - if (type & XFS_IOLOCK_EXCL) - inode_unlock(VFS_I(ip)); -} - -/* * Clear the specified ranges to zero through either the pagecache or DAX. * Holes and unwritten extents will be left as-is as they already are zeroed. */ @@ -244,62 +210,21 @@ xfs_file_dio_aio_read( struct kiocb *iocb, struct iov_iter *to) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - struct xfs_inode *ip = XFS_I(inode); - loff_t isize = i_size_read(inode); + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); size_t count = iov_iter_count(to); - loff_t end = iocb->ki_pos + count - 1; - struct iov_iter data; - struct xfs_buftarg *target; - ssize_t ret = 0; + ssize_t ret; trace_xfs_file_direct_read(ip, count, iocb->ki_pos); if (!count) return 0; /* skip atime */ - if (XFS_IS_REALTIME_INODE(ip)) - target = ip->i_mount->m_rtdev_targp; - else - target = ip->i_mount->m_ddev_targp; - - /* DIO must be aligned to device logical sector size */ - if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { - if (iocb->ki_pos == isize) - return 0; - return -EINVAL; - } - file_accessed(iocb->ki_filp); - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - if (mapping->nrpages) { - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); - if (ret) - goto out_unlock; - - /* - * Invalidate whole pages. This can return an error if we fail - * to invalidate a page, but this should never happen on XFS. - * Warn if it does fail. - */ - ret = invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - ret = 0; - } - - data = *to; - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, NULL, NULL, 0); - if (ret >= 0) { - iocb->ki_pos += ret; - iov_iter_advance(to, ret); - } + xfs_ilock(ip, XFS_IOLOCK_SHARED); + ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); -out_unlock: - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -317,9 +242,9 @@ xfs_file_dax_read( if (!count) return 0; /* skip atime */ - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops); - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_ilock(ip, XFS_IOLOCK_SHARED); + ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); file_accessed(iocb->ki_filp); return ret; @@ -335,9 +260,9 @@ xfs_file_buffered_aio_read( trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + xfs_ilock(ip, XFS_IOLOCK_SHARED); ret = generic_file_read_iter(iocb, to); - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -418,15 +343,18 @@ restart: if (error <= 0) return error; - error = xfs_break_layouts(inode, iolock, true); + error = xfs_break_layouts(inode, iolock); if (error) return error; - /* For changing security info in file_remove_privs() we need i_mutex */ + /* + * For changing security info in file_remove_privs() we need i_rwsem + * exclusively. + */ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { - xfs_rw_iunlock(ip, *iolock); + xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + xfs_ilock(ip, *iolock); goto restart; } /* @@ -451,9 +379,9 @@ restart: spin_unlock(&ip->i_flags_lock); if (!drained_dio) { if (*iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, *iolock); + xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + xfs_ilock(ip, *iolock); iov_iter_reexpand(from, count); } /* @@ -496,6 +424,58 @@ restart: return 0; } +static int +xfs_dio_write_end_io( + struct kiocb *iocb, + ssize_t size, + unsigned flags) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_inode *ip = XFS_I(inode); + loff_t offset = iocb->ki_pos; + bool update_size = false; + int error = 0; + + trace_xfs_end_io_direct_write(ip, offset, size); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + if (size <= 0) + return size; + + /* + * We need to update the in-core inode size here so that we don't end up + * with the on-disk inode size being outside the in-core inode size. We + * have no other method of updating EOF for AIO, so always do it here + * if necessary. + * + * We need to lock the test/set EOF update as we can be racing with + * other IO completions here to update the EOF. Failing to serialise + * here can result in EOF moving backwards and Bad Things Happen when + * that occurs. + */ + spin_lock(&ip->i_flags_lock); + if (offset + size > i_size_read(inode)) { + i_size_write(inode, offset + size); + update_size = true; + } + spin_unlock(&ip->i_flags_lock); + + if (flags & IOMAP_DIO_COW) { + error = xfs_reflink_end_cow(ip, offset, size); + if (error) + return error; + } + + if (flags & IOMAP_DIO_UNWRITTEN) + error = xfs_iomap_write_unwritten(ip, offset, size); + else if (update_size) + error = xfs_setfilesize(ip, offset, size); + + return error; +} + /* * xfs_file_dio_aio_write - handle direct IO writes * @@ -535,9 +515,7 @@ xfs_file_dio_aio_write( int unaligned_io = 0; int iolock; size_t count = iov_iter_count(from); - loff_t end; - struct iov_iter data; - struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? + struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ @@ -559,29 +537,12 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - xfs_rw_ilock(ip, iolock); + xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; count = iov_iter_count(from); - end = iocb->ki_pos + count - 1; - - if (mapping->nrpages) { - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); - if (ret) - goto out; - - /* - * Invalidate whole pages. This can return an error if we fail - * to invalidate a page, but this should never happen on XFS. - * Warn if it does fail. - */ - ret = invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - ret = 0; - } /* * If we are doing unaligned IO, wait for all other IO to drain, @@ -591,7 +552,7 @@ xfs_file_dio_aio_write( if (unaligned_io) inode_dio_wait(inode); else if (iolock == XFS_IOLOCK_EXCL) { - xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } @@ -604,24 +565,9 @@ xfs_file_dio_aio_write( goto out; } - data = *from; - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, xfs_end_io_direct_write, - NULL, DIO_ASYNC_EXTEND); - - /* see generic_file_direct_write() for why this is necessary */ - if (mapping->nrpages) { - invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, - end >> PAGE_SHIFT); - } - - if (ret > 0) { - iocb->ki_pos += ret; - iov_iter_advance(from, ret); - } + ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); out: - xfs_rw_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); /* * No fallback to buffered IO on errors for XFS, direct IO will either @@ -643,7 +589,7 @@ xfs_file_dax_write( size_t count; loff_t pos; - xfs_rw_ilock(ip, iolock); + xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; @@ -652,15 +598,13 @@ xfs_file_dax_write( count = iov_iter_count(from); trace_xfs_file_dax_write(ip, count, pos); - - ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops); + ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); error = xfs_setfilesize(ip, pos, ret); } - out: - xfs_rw_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); return error ? error : ret; } @@ -677,7 +621,7 @@ xfs_file_buffered_aio_write( int enospc = 0; int iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, iolock); + xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) @@ -721,7 +665,7 @@ write_retry: current->backing_dev_info = NULL; out: - xfs_rw_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); return ret; } @@ -797,7 +741,7 @@ xfs_file_fallocate( return -EOPNOTSUPP; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock, false); + error = xfs_break_layouts(inode, &iolock); if (error) goto out_unlock; @@ -1465,7 +1409,7 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops); } else { ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); @@ -1492,15 +1436,9 @@ xfs_filemap_fault( return xfs_filemap_page_mkwrite(vma, vmf); xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - if (IS_DAX(inode)) { - /* - * we do not want to trigger unwritten extent conversion on read - * faults - that is unnecessary overhead and would also require - * changes to xfs_get_blocks_direct() to map unwritten extent - * ioend for conversion on read-only mappings. - */ - ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops); - } else + if (IS_DAX(inode)) + ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops); + else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1536,7 +1474,7 @@ xfs_filemap_pmd_fault( } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); + ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (flags & FAULT_FLAG_WRITE) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9c3e5c6ddf20..ff4d6311c7f4 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -70,8 +70,6 @@ xfs_inode_alloc( ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - /* initialise the xfs inode */ ip->i_ino = ino; ip->i_mount = mp; @@ -394,8 +392,8 @@ xfs_iget_cache_hit( xfs_inode_clear_reclaim_tag(pag, ip->i_ino); inode->i_state = I_NEW; - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); + init_rwsem(&inode->i_rwsem); spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 512ff13ed66a..b9557795eb74 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -142,31 +142,31 @@ xfs_ilock_attr_map_shared( } /* - * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and - * the i_lock. This routine allows various combinations of the locks to be - * obtained. + * In addition to i_rwsem in the VFS inode, the xfs inode contains 2 + * multi-reader locks: i_mmap_lock and the i_lock. This routine allows + * various combinations of the locks to be obtained. * * The 3 locks should always be ordered so that the IO lock is obtained first, * the mmap lock second and the ilock last in order to prevent deadlock. * * Basic locking order: * - * i_iolock -> i_mmap_lock -> page_lock -> i_ilock + * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock * * mmap_sem locking order: * - * i_iolock -> page lock -> mmap_sem + * i_rwsem -> page lock -> mmap_sem * mmap_sem -> i_mmap_lock -> page_lock * * The difference in mmap_sem locking order mean that we cannot hold the * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can * fault in pages during copy in/out (for buffered IO) or require the mmap_sem * in get_user_pages() to map the user pages into the kernel address space for - * direct IO. Similarly the i_iolock cannot be taken inside a page fault because + * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because * page faults already hold the mmap_sem. * * Hence to serialise fully against both syscall and mmap based IO, we need to - * take both the i_iolock and the i_mmap_lock. These locks should *only* be both + * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both * taken in places where we need to invalidate the page cache in a race * free manner (e.g. truncate, hole punch and other extent manipulation * functions). @@ -191,10 +191,13 @@ xfs_ilock( (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); - if (lock_flags & XFS_IOLOCK_EXCL) - mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - else if (lock_flags & XFS_IOLOCK_SHARED) - mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + if (lock_flags & XFS_IOLOCK_EXCL) { + down_write_nested(&VFS_I(ip)->i_rwsem, + XFS_IOLOCK_DEP(lock_flags)); + } else if (lock_flags & XFS_IOLOCK_SHARED) { + down_read_nested(&VFS_I(ip)->i_rwsem, + XFS_IOLOCK_DEP(lock_flags)); + } if (lock_flags & XFS_MMAPLOCK_EXCL) mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); @@ -240,10 +243,10 @@ xfs_ilock_nowait( ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); if (lock_flags & XFS_IOLOCK_EXCL) { - if (!mrtryupdate(&ip->i_iolock)) + if (!down_write_trylock(&VFS_I(ip)->i_rwsem)) goto out; } else if (lock_flags & XFS_IOLOCK_SHARED) { - if (!mrtryaccess(&ip->i_iolock)) + if (!down_read_trylock(&VFS_I(ip)->i_rwsem)) goto out; } @@ -271,9 +274,9 @@ out_undo_mmaplock: mrunlock_shared(&ip->i_mmaplock); out_undo_iolock: if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); + up_write(&VFS_I(ip)->i_rwsem); else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); + up_read(&VFS_I(ip)->i_rwsem); out: return 0; } @@ -310,9 +313,9 @@ xfs_iunlock( ASSERT(lock_flags != 0); if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); + up_write(&VFS_I(ip)->i_rwsem); else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); + up_read(&VFS_I(ip)->i_rwsem); if (lock_flags & XFS_MMAPLOCK_EXCL) mrunlock_excl(&ip->i_mmaplock); @@ -345,7 +348,7 @@ xfs_ilock_demote( if (lock_flags & XFS_MMAPLOCK_EXCL) mrdemote(&ip->i_mmaplock); if (lock_flags & XFS_IOLOCK_EXCL) - mrdemote(&ip->i_iolock); + downgrade_write(&VFS_I(ip)->i_rwsem); trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); } @@ -370,8 +373,9 @@ xfs_isilocked( if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { if (!(lock_flags & XFS_IOLOCK_SHARED)) - return !!ip->i_iolock.mr_writer; - return rwsem_is_locked(&ip->i_iolock.mr_lock); + return !debug_locks || + lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0); + return rwsem_is_locked(&VFS_I(ip)->i_rwsem); } ASSERT(0); @@ -421,11 +425,7 @@ xfs_lock_inumorder(int lock_mode, int subclass) if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS); - ASSERT(xfs_lockdep_subclass_ok(subclass + - XFS_IOLOCK_PARENT_VAL)); class += subclass << XFS_IOLOCK_SHIFT; - if (lock_mode & XFS_IOLOCK_PARENT) - class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT; } if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { @@ -477,8 +477,6 @@ xfs_lock_inodes( XFS_ILOCK_EXCL)); ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED | XFS_ILOCK_SHARED))); - ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) || - inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1); ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) || inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1); ASSERT(!(lock_mode & XFS_ILOCK_EXCL) || @@ -581,10 +579,8 @@ xfs_lock_two_inodes( int attempts = 0; xfs_log_item_t *lp; - if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { - ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); - ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) + ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); + if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); ASSERT(ip0->i_ino != ip1->i_ino); @@ -715,7 +711,6 @@ xfs_lookup( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - xfs_ilock(dp, XFS_IOLOCK_SHARED); error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); if (error) goto out_unlock; @@ -724,14 +719,12 @@ xfs_lookup( if (error) goto out_free_name; - xfs_iunlock(dp, XFS_IOLOCK_SHARED); return 0; out_free_name: if (ci_name) kmem_free(ci_name->name); out_unlock: - xfs_iunlock(dp, XFS_IOLOCK_SHARED); *ipp = NULL; return error; } @@ -1215,8 +1208,7 @@ xfs_create( if (error) goto out_release_inode; - xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | - XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; xfs_defer_init(&dfops, &first_block); @@ -1252,7 +1244,7 @@ xfs_create( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, @@ -1325,7 +1317,7 @@ xfs_create( xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } @@ -1466,11 +1458,10 @@ xfs_link( if (error) goto std_return; - xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow hard link @@ -2577,10 +2568,9 @@ xfs_remove( goto std_return; } - xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* @@ -2961,12 +2951,6 @@ xfs_rename( * whether the target directory is the same as the source * directory, we can lock from 2 to 4 inodes. */ - if (!new_parent) - xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); - else - xfs_lock_two_inodes(src_dp, target_dp, - XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); - xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); /* @@ -2974,9 +2958,9 @@ xfs_rename( * we can rely on either trans_commit or trans_cancel to unlock * them. */ - xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); if (new_parent) - xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); if (target_ip) xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 71e8a81c91a3..10dcf27b4c85 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -56,7 +56,6 @@ typedef struct xfs_inode { /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ - mrlock_t i_iolock; /* inode IO lock */ mrlock_t i_mmaplock; /* inode mmap IO lock */ atomic_t i_pincount; /* inode pin count */ spinlock_t i_flags_lock; /* inode i_flags lock */ @@ -333,7 +332,7 @@ static inline void xfs_ifunlock(struct xfs_inode *ip) * IOLOCK values * * 0-3 subclass value - * 4-7 PARENT subclass values + * 4-7 unused * * MMAPLOCK values * @@ -348,10 +347,8 @@ static inline void xfs_ifunlock(struct xfs_inode *ip) * */ #define XFS_IOLOCK_SHIFT 16 -#define XFS_IOLOCK_PARENT_VAL 4 -#define XFS_IOLOCK_MAX_SUBCLASS (XFS_IOLOCK_PARENT_VAL - 1) +#define XFS_IOLOCK_MAX_SUBCLASS 3 #define XFS_IOLOCK_DEP_MASK 0x000f0000 -#define XFS_IOLOCK_PARENT (XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT) #define XFS_MMAPLOCK_SHIFT 20 #define XFS_MMAPLOCK_NUMORDER 0 diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index a39197501a7c..fc563b82aea6 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -639,7 +639,7 @@ xfs_ioc_space( return error; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock, false); + error = xfs_break_layouts(inode, &iolock); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 15a83813b708..0d147428971e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -950,6 +950,19 @@ static inline bool imap_needs_alloc(struct inode *inode, (IS_DAX(inode) && ISUNWRITTEN(imap)); } +static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags) +{ + /* + * COW writes will allocate delalloc space, so we need to make sure + * to take the lock exclusively here. + */ + if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) + return true; + if ((flags & IOMAP_DIRECT) && (flags & IOMAP_WRITE)) + return true; + return false; +} + static int xfs_file_iomap_begin( struct inode *inode, @@ -969,18 +982,14 @@ xfs_file_iomap_begin( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if ((flags & IOMAP_WRITE) && !IS_DAX(inode) && - !xfs_get_extsz_hint(ip)) { + if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) && + !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { /* Reserve delalloc blocks for regular writeback. */ return xfs_file_iomap_begin_delay(inode, offset, length, flags, iomap); } - /* - * COW writes will allocate delalloc space, so we need to make sure - * to take the lock exclusively here. - */ - if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { + if (need_excl_ilock(ip, flags)) { lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); } else { @@ -993,17 +1002,41 @@ xfs_file_iomap_begin( offset_fsb = XFS_B_TO_FSBT(mp, offset); end_fsb = XFS_B_TO_FSB(mp, offset + length); + if (xfs_is_reflink_inode(ip) && + (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT)) { + shared = xfs_reflink_find_cow_mapping(ip, offset, &imap); + if (shared) { + xfs_iunlock(ip, lockmode); + goto alloc_done; + } + ASSERT(!isnullstartblock(imap.br_startblock)); + } + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); if (error) goto out_unlock; - if (flags & IOMAP_REPORT) { + if ((flags & IOMAP_REPORT) || + (xfs_is_reflink_inode(ip) && + (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT))) { /* Trim the mapping to the nearest shared extent boundary. */ error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); if (error) goto out_unlock; + + /* + * We're here because we're trying to do a directio write to a + * region that isn't aligned to a filesystem block. If the + * extent is shared, fall back to buffered mode to handle the + * RMW. + */ + if (!(flags & IOMAP_REPORT) && shared) { + trace_xfs_reflink_bounce_dio_write(ip, &imap); + error = -EREMCHG; + goto out_unlock; + } } if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { @@ -1038,6 +1071,7 @@ xfs_file_iomap_begin( if (error) return error; +alloc_done: iomap->flags = IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); } else { diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 405a65cd9d6b..c962999a87ab 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -983,15 +983,13 @@ xfs_vn_setattr( struct xfs_inode *ip = XFS_I(d_inode(dentry)); uint iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, iolock); - error = xfs_break_layouts(d_inode(dentry), &iolock, true); - if (!error) { - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - iolock |= XFS_MMAPLOCK_EXCL; + error = xfs_break_layouts(d_inode(dentry), &iolock); + if (error) + return error; - error = xfs_vn_setattr_size(dentry, iattr); - } - xfs_iunlock(ip, iolock); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + error = xfs_setattr_size(ip, iattr); + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); } else { error = xfs_vn_setattr_nonsize(dentry, iattr); } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 56b7a2f6aaf2..4a98762ec8b4 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2025,7 +2025,7 @@ xlog_peek_buffer_cancelled( struct xlog *log, xfs_daddr_t blkno, uint len, - ushort flags) + unsigned short flags) { struct list_head *bucket; struct xfs_buf_cancel *bcp; @@ -2065,7 +2065,7 @@ xlog_check_buffer_cancelled( struct xlog *log, xfs_daddr_t blkno, uint len, - ushort flags) + unsigned short flags) { struct xfs_buf_cancel *bcp; diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 93a7aafa56d6..2f2dc3c09ad0 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -32,8 +32,7 @@ int xfs_break_layouts( struct inode *inode, - uint *iolock, - bool with_imutex) + uint *iolock) { struct xfs_inode *ip = XFS_I(inode); int error; @@ -42,12 +41,8 @@ xfs_break_layouts( while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { xfs_iunlock(ip, *iolock); - if (with_imutex && (*iolock & XFS_IOLOCK_EXCL)) - inode_unlock(inode); error = break_layout(inode, true); *iolock = XFS_IOLOCK_EXCL; - if (with_imutex) - inode_lock(inode); xfs_ilock(ip, *iolock); } diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h index e8339f74966b..b587cb99b2b7 100644 --- a/fs/xfs/xfs_pnfs.h +++ b/fs/xfs/xfs_pnfs.h @@ -8,10 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, struct iattr *iattr); -int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex); +int xfs_break_layouts(struct inode *inode, uint *iolock); #else static inline int -xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex) +xfs_break_layouts(struct inode *inode, uint *iolock) { return 0; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index becf2465dd23..88fd03c66e99 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1251,13 +1251,11 @@ xfs_reflink_remap_range( return -EIO; /* Lock both files against IO */ - if (same_inode) { - xfs_ilock(src, XFS_IOLOCK_EXCL); + lock_two_nondirectories(inode_in, inode_out); + if (same_inode) xfs_ilock(src, XFS_MMAPLOCK_EXCL); - } else { - xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL); + else xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); - } /* Don't touch certain kinds of inodes */ ret = -EPERM; @@ -1402,11 +1400,9 @@ xfs_reflink_remap_range( out_unlock: xfs_iunlock(src, XFS_MMAPLOCK_EXCL); - xfs_iunlock(src, XFS_IOLOCK_EXCL); - if (src->i_ino != dest->i_ino) { + if (!same_inode) xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); - xfs_iunlock(dest, XFS_IOLOCK_EXCL); - } + unlock_two_nondirectories(inode_in, inode_out); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return ret; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ade4691e3f74..563d1d146b8c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -943,7 +943,7 @@ xfs_fs_destroy_inode( trace_xfs_destroy_inode(ip); - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); XFS_STATS_INC(ip->i_mount, vn_rele); XFS_STATS_INC(ip->i_mount, vn_remove); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 58142aeeeea6..f2cb45ed1d54 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -238,8 +238,7 @@ xfs_symlink( if (error) goto out_release_inode; - xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | - XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; /* @@ -287,7 +286,7 @@ xfs_symlink( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = false; /* @@ -412,7 +411,7 @@ out_release_inode: xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } |