From ec56b1f1fdc69599963574ce94cc5693d535dd64 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 4 Jun 2015 09:18:18 +1000 Subject: xfs: mmap lock needs to be inside freeze protection Lock ordering for the new mmap lock needs to be: mmap_sem sb_start_pagefault i_mmap_lock page lock Right now xfs_vm_page_mkwrite gets this the wrong way around. While technically it cannot deadlock due to the current freeze ordering, it's still a landmine that might explode if we change anything in future. Hence we need to nest the locks correctly. Signed-off-by: Dave Chinner Reviewed-by: Jan Kara Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 8121e75352ee..0b4e79fd8d05 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1487,15 +1487,20 @@ xfs_filemap_page_mkwrite( struct vm_fault *vmf) { struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); - int error; + int ret; trace_xfs_filemap_page_mkwrite(ip); + sb_start_pagefault(VFS_I(ip)->i_sb); + file_update_time(vma->vm_file); xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - error = block_page_mkwrite(vma, vmf, xfs_get_blocks); + + ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks); + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + sb_end_pagefault(VFS_I(ip)->i_sb); - return error; + return block_page_mkwrite_return(ret); } const struct file_operations xfs_file_operations = { -- cgit v1.2.3 From e842f2903908934187af7232fb5b21da527d1757 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 4 Jun 2015 09:18:18 +1000 Subject: dax: don't abuse get_block mapping for endio callbacks dax_fault() currently relies on the get_block callback to attach an io completion callback to the mapping buffer head so that it can run unwritten extent conversion after zeroing allocated blocks. Instead of this hack, pass the conversion callback directly into dax_fault() similar to the get_block callback. 
When the filesystem allocates unwritten extents, it will set the buffer_unwritten() flag, and hence the dax_fault code can call the completion function in the contexts where it is necessary without overloading the mapping buffer head. Note: The changes to ext4 to use this interface are suspect at best. In fact, the way ext4 did this end_io assignment in the first place looks suspect because it only set a completion callback when there wasn't already some other write() call taking place on the same inode. The ext4 end_io code looks rather intricate and fragile with all its reference counting and passing to different contexts for modification via inode private pointers that aren't protected by locks... Signed-off-by: Dave Chinner Acked-by: Jan Kara Signed-off-by: Dave Chinner --- fs/dax.c | 21 +++++++++++++++------ fs/ext2/file.c | 4 ++-- fs/ext4/file.c | 16 ++++++++++++++-- fs/ext4/inode.c | 21 +++++++-------------- include/linux/fs.h | 6 ++++-- 5 files changed, 42 insertions(+), 26 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 6f65f00e58ec..4bb5b7cd5dfd 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -309,14 +309,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, out: i_mmap_unlock_read(mapping); - if (bh->b_end_io) - bh->b_end_io(bh, 1); - return error; } static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) + get_block_t get_block, dax_iodone_t complete_unwritten) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; @@ -417,7 +414,19 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, page_cache_release(page); } + /* + * If we successfully insert the new mapping over an unwritten extent, + * we need to ensure we convert the unwritten extent. 
If there is an + * error inserting the mapping, the filesystem needs to leave it as + * unwritten to prevent exposure of the stale underlying data to + * userspace, but we still need to call the completion function so + * the private resources on the mapping buffer can be released. We + * indicate what the callback should do via the uptodate variable, same + * as for normal BH based IO completions. + */ error = dax_insert_mapping(inode, &bh, vma, vmf); + if (buffer_unwritten(&bh)) + complete_unwritten(&bh, !error); out: if (error == -ENOMEM) @@ -445,7 +454,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, * fault handler for DAX files. */ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) + get_block_t get_block, dax_iodone_t complete_unwritten) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; @@ -454,7 +463,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, sb_start_pagefault(sb); file_update_time(vma->vm_file); } - result = do_dax_fault(vma, vmf, get_block); + result = do_dax_fault(vma, vmf, get_block, complete_unwritten); if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 3a0a6c6406d0..3b57c9f83c9b 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -28,12 +28,12 @@ #ifdef CONFIG_FS_DAX static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_fault(vma, vmf, ext2_get_block); + return dax_fault(vma, vmf, ext2_get_block, NULL); } static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext2_get_block); + return dax_mkwrite(vma, vmf, ext2_get_block, NULL); } static const struct vm_operations_struct ext2_dax_vm_ops = { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 0613c256c344..f713cfcc43a2 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -192,15 +192,27 @@ out: } #ifdef CONFIG_FS_DAX +static void 
ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) +{ + struct inode *inode = bh->b_assoc_map->host; + /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ + loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; + int err; + if (!uptodate) + return; + WARN_ON(!buffer_unwritten(bh)); + err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); +} + static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_fault(vma, vmf, ext4_get_block); + return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten); /* Is this the right get_block? */ } static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext4_get_block); + return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten); } static const struct vm_operations_struct ext4_dax_vm_ops = { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 55b187c3bac1..7c38ed3494cb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -656,18 +656,6 @@ has_zeroout: return retval; } -static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) -{ - struct inode *inode = bh->b_assoc_map->host; - /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ - loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; - int err; - if (!uptodate) - return; - WARN_ON(!buffer_unwritten(bh)); - err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); -} - /* Maximum number of blocks we map for direct IO at once. 
*/ #define DIO_MAX_BLOCKS 4096 @@ -705,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; - if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { + if (IS_DAX(inode) && buffer_unwritten(bh)) { + /* + * dgc: I suspect unwritten conversion on ext4+DAX is + * fundamentally broken here when there are concurrent + * read/write in progress on this inode. + */ + WARN_ON_ONCE(io_end); bh->b_assoc_map = inode->i_mapping; bh->b_private = (void *)(unsigned long)iblock; - bh->b_end_io = ext4_end_io_unwritten; } if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) set_buffer_defer_completion(bh); diff --git a/include/linux/fs.h b/include/linux/fs.h index 35ec87e490b1..c9b4cca9e08d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -70,6 +70,7 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private); +typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate); #define MAY_EXEC 0x00000001 #define MAY_WRITE 0x00000002 @@ -2627,9 +2628,10 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t, int dax_clear_blocks(struct inode *, sector_t block, long size); int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); -int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); +int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, + dax_iodone_t); int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); -#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) +#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod) #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, -- cgit v1.2.3 From 
ce5c5d554dc47a4fb4360c84b72231fea081e7a0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 4 Jun 2015 09:18:18 +1000 Subject: dax: expose __dax_fault for filesystems with locking constraints Some filesystems cannot call dax_fault() directly because they have different locking and/or allocation constraints in the page fault IO path. To handle this, we need to follow the same model as the generic block_page_mkwrite code, where the internals are exposed via __block_page_mkwrite() so that filesystems can wrap the correct locking and operations around the outside. This is loosely based on a patch originally from Matthew Wilcox. Unlike the original patch, it does not change ext4 code, error returns or unwritten extent conversion handling. It also adds a __dax_mkwrite() wrapper for .page_mkwrite implementations to do the right thing, too. Signed-off-by: Dave Chinner Reviewed-by: Jan Kara Signed-off-by: Dave Chinner --- fs/dax.c | 15 +++++++++++++-- include/linux/fs.h | 5 ++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 4bb5b7cd5dfd..99b5fbc38992 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -312,7 +312,17 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, return error; } -static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, +/** + * __dax_fault - handle a page fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks + * + * When a page fault occurs, filesystems may call this helper in their + * fault handler for DAX files. __dax_fault() assumes the caller has done all + * the necessary locking for the page fault to proceed successfully. 
+ */ +int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block, dax_iodone_t complete_unwritten) { struct file *file = vma->vm_file; @@ -443,6 +453,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } goto out; } +EXPORT_SYMBOL(__dax_fault); /** * dax_fault - handle a page fault on a DAX file @@ -463,7 +474,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, sb_start_pagefault(sb); file_update_time(vma->vm_file); } - result = do_dax_fault(vma, vmf, get_block, complete_unwritten); + result = __dax_fault(vma, vmf, get_block, complete_unwritten); if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); diff --git a/include/linux/fs.h b/include/linux/fs.h index c9b4cca9e08d..5784377e7c56 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2630,8 +2630,11 @@ int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, dax_iodone_t); +int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, + dax_iodone_t); int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); -#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod) +#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod) +#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod) #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, -- cgit v1.2.3 From 6b698edeeef00c127d73501b386590299f01327a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 4 Jun 2015 09:18:53 +1000 Subject: xfs: add DAX file operations support Add the initial support for DAX file operations to XFS. This includes the necessary block allocation and mmap page fault hooks for DAX to function. 
Note that there are changes to the splice interfaces to ensure that for DAX splice avoids direct page cache manipulations and instead takes the DAX IO paths for read/write operations. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 116 ++++++++++++++++++++++++++++++++++++++--------------- fs/xfs/xfs_aops.h | 7 +++- fs/xfs/xfs_file.c | 118 +++++++++++++++++++++++++++++++----------------------- 3 files changed, 158 insertions(+), 83 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a56960dd1684..1d195e80d62e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1349,7 +1349,7 @@ __xfs_get_blocks( sector_t iblock, struct buffer_head *bh_result, int create, - int direct) + bool direct) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1414,6 +1414,7 @@ __xfs_get_blocks( if (error) return error; new = 1; + } else { /* * Delalloc reservations do not require a transaction, @@ -1508,49 +1509,29 @@ xfs_get_blocks( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, 0); + return __xfs_get_blocks(inode, iblock, bh_result, create, false); } -STATIC int +int xfs_get_blocks_direct( struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, 1); + return __xfs_get_blocks(inode, iblock, bh_result, create, true); } -/* - * Complete a direct I/O write request. - * - * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. - * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite - * wholly within the EOF and so there is nothing for us to do. Note that in this - * case the completion can be called in interrupt context, whereas if we have an - * ioend we will always be called in task context (i.e. from a workqueue). 
- */ -STATIC void -xfs_end_io_direct_write( - struct kiocb *iocb, +static void +__xfs_end_io_direct_write( + struct inode *inode, + struct xfs_ioend *ioend, loff_t offset, - ssize_t size, - void *private) + ssize_t size) { - struct inode *inode = file_inode(iocb->ki_filp); - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - struct xfs_ioend *ioend = private; + struct xfs_mount *mp = XFS_I(inode)->i_mount; - trace_xfs_gbmap_direct_endio(ip, offset, size, - ioend ? ioend->io_type : 0, NULL); - - if (!ioend) { - ASSERT(offset + size <= i_size_read(inode)); - return; - } - - if (XFS_FORCED_SHUTDOWN(mp)) + if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error) goto out_end_io; /* @@ -1587,10 +1568,10 @@ xfs_end_io_direct_write( * here can result in EOF moving backwards and Bad Things Happen when * that occurs. */ - spin_lock(&ip->i_flags_lock); + spin_lock(&XFS_I(inode)->i_flags_lock); if (offset + size > i_size_read(inode)) i_size_write(inode, offset + size); - spin_unlock(&ip->i_flags_lock); + spin_unlock(&XFS_I(inode)->i_flags_lock); /* * If we are doing an append IO that needs to update the EOF on disk, @@ -1607,6 +1588,75 @@ out_end_io: return; } +/* + * Complete a direct I/O write request. + * + * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. + * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite + * wholly within the EOF and so there is nothing for us to do. Note that in this + * case the completion can be called in interrupt context, whereas if we have an + * ioend we will always be called in task context (i.e. from a workqueue). + */ +STATIC void +xfs_end_io_direct_write( + struct kiocb *iocb, + loff_t offset, + ssize_t size, + void *private) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_ioend *ioend = private; + + trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size, + ioend ? 
ioend->io_type : 0, NULL); + + if (!ioend) { + ASSERT(offset + size <= i_size_read(inode)); + return; + } + + __xfs_end_io_direct_write(inode, ioend, offset, size); +} + +/* + * For DAX we need a mapping buffer callback for unwritten extent conversion + * when page faults allocate blocks and then zero them. Note that in this + * case the mapping indicated by the ioend may extend beyond EOF. We most + * definitely do not want to extend EOF here, so we trim back the ioend size to + * EOF. + */ +#ifdef CONFIG_FS_DAX +void +xfs_end_io_dax_write( + struct buffer_head *bh, + int uptodate) +{ + struct xfs_ioend *ioend = bh->b_private; + struct inode *inode = ioend->io_inode; + ssize_t size = ioend->io_size; + + ASSERT(IS_DAX(ioend->io_inode)); + + /* if there was an error zeroing, then don't convert it */ + if (!uptodate) + ioend->io_error = -EIO; + + /* + * Trim update to EOF, so we don't extend EOF during unwritten extent + * conversion of partial EOF blocks. + */ + spin_lock(&XFS_I(inode)->i_flags_lock); + if (ioend->io_offset + size > i_size_read(inode)) + size = i_size_read(inode) - ioend->io_offset; + spin_unlock(&XFS_I(inode)->i_flags_lock); + + __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size); + +} +#else +void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { } +#endif + STATIC ssize_t xfs_vm_direct_IO( struct kiocb *iocb, diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index ac644e0137a4..86afd1ac7895 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -53,7 +53,12 @@ typedef struct xfs_ioend { } xfs_ioend_t; extern const struct address_space_operations xfs_address_space_operations; -extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); + +int xfs_get_blocks(struct inode *inode, sector_t offset, + struct buffer_head *map_bh, int create); +int xfs_get_blocks_direct(struct inode *inode, sector_t offset, + struct buffer_head *map_bh, int create); +void xfs_end_io_dax_write(struct buffer_head *bh, int 
uptodate); extern void xfs_count_page_state(struct page *, int *, int *); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 0b4e79fd8d05..a629dce4903e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -284,7 +284,7 @@ xfs_file_read_iter( if (file->f_mode & FMODE_NOCMTIME) ioflags |= XFS_IO_INVIS; - if (unlikely(ioflags & XFS_IO_ISDIRECT)) { + if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -378,7 +378,11 @@ xfs_file_splice_read( trace_xfs_file_splice_read(ip, count, *ppos, ioflags); - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); + /* for dax, we need to avoid the page cache */ + if (IS_DAX(VFS_I(ip))) + ret = default_file_splice_read(infilp, ppos, pipe, count, flags); + else + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -672,7 +676,7 @@ xfs_file_dio_aio_write( mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ - if ((pos | count) & target->bt_logical_sectormask) + if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask)) return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ @@ -758,8 +762,11 @@ xfs_file_dio_aio_write( out: xfs_rw_iunlock(ip, iolock); - /* No fallback to buffered IO on errors for XFS. */ - ASSERT(ret < 0 || ret == count); + /* + * No fallback to buffered IO on errors for XFS. DAX can result in + * partial writes, but direct IO will either complete fully or fail. 
+ */ + ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); return ret; } @@ -842,7 +849,7 @@ xfs_file_write_iter( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if (unlikely(iocb->ki_flags & IOCB_DIRECT)) + if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) ret = xfs_file_dio_aio_write(iocb, from); else ret = xfs_file_buffered_aio_write(iocb, from); @@ -1063,17 +1070,6 @@ xfs_file_readdir( return xfs_readdir(ip, ctx, bufsize); } -STATIC int -xfs_file_mmap( - struct file *filp, - struct vm_area_struct *vma) -{ - vma->vm_ops = &xfs_file_vm_ops; - - file_accessed(filp); - return 0; -} - /* * This type is designed to indicate the type of offset we would like * to search from page cache for xfs_seek_hole_data(). @@ -1454,26 +1450,11 @@ xfs_file_llseek( * ordering of: * * mmap_sem (MM) - * i_mmap_lock (XFS - truncate serialisation) - * page_lock (MM) - * i_lock (XFS - extent map serialisation) + * sb_start_pagefault(vfs, freeze) + * i_mmap_lock (XFS - truncate serialisation) + * page_lock (MM) + * i_lock (XFS - extent map serialisation) */ -STATIC int -xfs_filemap_fault( - struct vm_area_struct *vma, - struct vm_fault *vmf) -{ - struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); - int error; - - trace_xfs_filemap_fault(ip); - - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - error = filemap_fault(vma, vmf); - xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); - - return error; -} /* * mmap()d file has taken write protection fault and is being made writable. 
We @@ -1486,21 +1467,66 @@ xfs_filemap_page_mkwrite( struct vm_area_struct *vma, struct vm_fault *vmf) { - struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); + struct inode *inode = file_inode(vma->vm_file); int ret; - trace_xfs_filemap_page_mkwrite(ip); + trace_xfs_filemap_page_mkwrite(XFS_I(inode)); - sb_start_pagefault(VFS_I(ip)->i_sb); + sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + + if (IS_DAX(inode)) { + ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct, + xfs_end_io_dax_write); + } else { + ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks); + ret = block_page_mkwrite_return(ret); + } + + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + sb_end_pagefault(inode->i_sb); + + return ret; +} + +STATIC int +xfs_filemap_fault( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct xfs_inode *ip = XFS_I(file_inode(vma->vm_file)); + int ret; - ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks); + trace_xfs_filemap_fault(ip); + + /* DAX can shortcut the normal fault path on write faults! 
*/ + if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip))) + return xfs_filemap_page_mkwrite(vma, vmf); + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + ret = filemap_fault(vma, vmf); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); - sb_end_pagefault(VFS_I(ip)->i_sb); - return block_page_mkwrite_return(ret); + return ret; +} + +static const struct vm_operations_struct xfs_file_vm_ops = { + .fault = xfs_filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = xfs_filemap_page_mkwrite, +}; + +STATIC int +xfs_file_mmap( + struct file *filp, + struct vm_area_struct *vma) +{ + file_accessed(filp); + vma->vm_ops = &xfs_file_vm_ops; + if (IS_DAX(file_inode(filp))) + vma->vm_flags |= VM_MIXEDMAP; + return 0; } const struct file_operations xfs_file_operations = { @@ -1531,9 +1557,3 @@ const struct file_operations xfs_dir_file_operations = { #endif .fsync = xfs_dir_fsync, }; - -static const struct vm_operations_struct xfs_file_vm_ops = { - .fault = xfs_filemap_fault, - .map_pages = filemap_map_pages, - .page_mkwrite = xfs_filemap_page_mkwrite, -}; -- cgit v1.2.3 From 4f69f578a87d39c20b1ff70005a125e4594c3de8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 4 Jun 2015 09:19:08 +1000 Subject: xfs: add DAX block zeroing support Add initial support for DAX block zeroing operations to XFS. DAX cannot use buffered IO through the page cache for zeroing, nor do we need to issue IO for uncached block zeroing. In both cases, we can simply call out to the dax block zeroing function. 
Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 23 +++++++++++++++++++---- fs/xfs/xfs_file.c | 45 +++++++++++++++++++++++++++------------------ 2 files changed, 46 insertions(+), 22 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index a52bbd3abc7d..4a2965515ca8 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1133,14 +1133,29 @@ xfs_zero_remaining_bytes( break; ASSERT(imap.br_blockcount >= 1); ASSERT(imap.br_startoff == offset_fsb); + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + + if (imap.br_startblock == HOLESTARTBLOCK || + imap.br_state == XFS_EXT_UNWRITTEN) { + /* skip the entire extent */ + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + + imap.br_blockcount) - 1; + continue; + } + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; if (lastoffset > endoff) lastoffset = endoff; - if (imap.br_startblock == HOLESTARTBLOCK) - continue; - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - if (imap.br_state == XFS_EXT_UNWRITTEN) + + /* DAX can just zero the backing device directly */ + if (IS_DAX(VFS_I(ip))) { + error = dax_zero_page_range(VFS_I(ip), offset, + lastoffset - offset + 1, + xfs_get_blocks_direct); + if (error) + return error; continue; + } error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index a629dce4903e..cfd9b4f5ad6e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -79,14 +79,15 @@ xfs_rw_ilock_demote( } /* - * xfs_iozero + * xfs_iozero clears the specified range supplied via the page cache (except in + * the DAX case). Writes through the page cache will allocate blocks over holes, + * though the callers usually map the holes first and avoid them. If a block is + * not completely zeroed, then it will be read from disk before being partially + * zeroed. 
* - * xfs_iozero clears the specified range of buffer supplied, - * and marks all the affected blocks as valid and modified. If - * an affected block is not allocated, it will be allocated. If - * an affected block is not completely overwritten, and is not - * valid before the operation, it will be read from disk before - * being partially zeroed. + * In the DAX case, we can just directly write to the underlying pages. This + * will not allocate blocks, but will avoid holes and unwritten extents and so + * not do unnecessary work. */ int xfs_iozero( @@ -96,7 +97,8 @@ xfs_iozero( { struct page *page; struct address_space *mapping; - int status; + int status = 0; + mapping = VFS_I(ip)->i_mapping; do { @@ -108,20 +110,27 @@ xfs_iozero( if (bytes > count) bytes = count; - status = pagecache_write_begin(NULL, mapping, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (status) - break; + if (IS_DAX(VFS_I(ip))) { + status = dax_zero_page_range(VFS_I(ip), pos, bytes, + xfs_get_blocks_direct); + if (status) + break; + } else { + status = pagecache_write_begin(NULL, mapping, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (status) + break; - zero_user(page, offset, bytes); + zero_user(page, offset, bytes); - status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, - page, fsdata); - WARN_ON(status <= 0); /* can't return less than zero! */ + status = pagecache_write_end(NULL, mapping, pos, bytes, + bytes, page, fsdata); + WARN_ON(status <= 0); /* can't return less than zero! */ + status = 0; + } pos += bytes; count -= bytes; - status = 0; } while (count); return (-status); -- cgit v1.2.3 From 9969441f9f86a8a7de8c36514fa789e5f5d83145 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 4 Jun 2015 09:19:10 +1000 Subject: xfs: add DAX truncate support When we truncate a DAX file, we need to call through the DAX page truncation path rather than through block_truncate_page() so that mappings and block zeroing are all handled correctly. 
Otherwise, truncate does not need to change. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_iops.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f4cd7204e236..0994f95c368f 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -851,7 +851,11 @@ xfs_setattr_size( * to hope that the caller sees ENOMEM and retries the truncate * operation. */ - error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); + if (IS_DAX(inode)) + error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct); + else + error = block_truncate_page(inode->i_mapping, newsize, + xfs_get_blocks); if (error) return error; truncate_setsize(inode, newsize); -- cgit v1.2.3 From 6e1ba0bcb84b3f97616feb07c27f974509ba57be Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 4 Jun 2015 09:19:15 +1000 Subject: xfs: add DAX IO path support DAX does not do buffered IO (can't buffer direct access!) and hence all read/write IO is vectored through the direct IO path. Hence we need to add the DAX IO path callouts to the direct IO infrastructure. 
Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 1d195e80d62e..e5e9fc23f230 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1657,6 +1657,29 @@ xfs_end_io_dax_write( void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { } #endif +static inline ssize_t +xfs_vm_do_dio( + struct inode *inode, + struct kiocb *iocb, + struct iov_iter *iter, + loff_t offset, + void (*endio)(struct kiocb *iocb, + loff_t offset, + ssize_t size, + void *private), + int flags) +{ + struct block_device *bdev; + + if (IS_DAX(inode)) + return dax_do_io(iocb, inode, iter, offset, + xfs_get_blocks_direct, endio, 0); + + bdev = xfs_find_bdev_for_inode(inode); + return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, + xfs_get_blocks_direct, endio, NULL, flags); +} + STATIC ssize_t xfs_vm_direct_IO( struct kiocb *iocb, @@ -1664,16 +1687,11 @@ xfs_vm_direct_IO( loff_t offset) { struct inode *inode = iocb->ki_filp->f_mapping->host; - struct block_device *bdev = xfs_find_bdev_for_inode(inode); - if (iov_iter_rw(iter) == WRITE) { - return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, - xfs_get_blocks_direct, - xfs_end_io_direct_write, NULL, - DIO_ASYNC_EXTEND); - } - return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, - xfs_get_blocks_direct, NULL, NULL, 0); + if (iov_iter_rw(iter) == WRITE) + return xfs_vm_do_dio(inode, iocb, iter, offset, + xfs_end_io_direct_write, DIO_ASYNC_EXTEND); + return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0); } /* -- cgit v1.2.3 From cbe4dab119f211ff6642d617f541087894e99e4f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 4 Jun 2015 09:19:18 +1000 Subject: xfs: add initial DAX support Add initial DAX support to XFS. 
To do this we need a new mount option to turn DAX on for the filesystem, and we need to propagate this into the inode flags whenever an inode is instantiated so that the per-inode checks throughout the code Do The Right Thing. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_iops.c | 24 ++++++++++++------------ fs/xfs/xfs_mount.h | 2 ++ fs/xfs/xfs_super.c | 25 +++++++++++++++++++++++-- 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 0994f95c368f..3e8d32d41f35 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1195,22 +1195,22 @@ xfs_diflags_to_iflags( struct inode *inode, struct xfs_inode *ip) { - if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) + uint16_t flags = ip->i_d.di_flags; + + inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | + S_NOATIME | S_DAX); + + if (flags & XFS_DIFLAG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= ~S_IMMUTABLE; - if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) + if (flags & XFS_DIFLAG_APPEND) inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; - if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) + if (flags & XFS_DIFLAG_SYNC) inode->i_flags |= S_SYNC; - else - inode->i_flags &= ~S_SYNC; - if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) + if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; - else - inode->i_flags &= ~S_NOATIME; + /* XXX: Also needs an on-disk per inode flag! */ + if (ip->i_mount->m_flags & XFS_MOUNT_DAX) + inode->i_flags |= S_DAX; } /* diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 8c995a2ccb6f..cd44e88efa53 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -179,6 +179,8 @@ typedef struct xfs_mount { allocator */ #define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ +#define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */ + /* * Default minimum read and write sizes. 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 858e1e62bbaa..1fb16562c159 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -112,6 +112,8 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #define MNTOPT_DISCARD "discard" /* Discard unused blocks */ #define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ +#define MNTOPT_DAX "dax" /* Enable direct access to bdev pages */ + /* * Table driven mount option parser. * @@ -363,6 +365,10 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DISCARD; } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { mp->m_flags &= ~XFS_MOUNT_DISCARD; +#ifdef CONFIG_FS_DAX + } else if (!strcmp(this_char, MNTOPT_DAX)) { + mp->m_flags |= XFS_MOUNT_DAX; +#endif } else { xfs_warn(mp, "unknown mount option [%s].", this_char); return -EINVAL; @@ -452,8 +458,8 @@ done: } struct proc_xfs_info { - int flag; - char *str; + uint64_t flag; + char *str; }; STATIC int @@ -474,6 +480,7 @@ xfs_showargs( { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE }, + { XFS_MOUNT_DAX, "," MNTOPT_DAX }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { @@ -1507,6 +1514,20 @@ xfs_fs_fill_super( if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) sb->s_flags |= MS_I_VERSION; + if (mp->m_flags & XFS_MOUNT_DAX) { + xfs_warn(mp, + "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + if (sb->s_blocksize != PAGE_SIZE) { + xfs_alert(mp, + "Filesystem block size invalid for DAX Turning DAX off."); + mp->m_flags &= ~XFS_MOUNT_DAX; + } else if (!sb->s_bdev->bd_disk->fops->direct_access) { + xfs_alert(mp, + "Block device does not support DAX Turning DAX off."); + mp->m_flags &= ~XFS_MOUNT_DAX; + } + } + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; -- cgit v1.2.3