author		Dave Chinner <david@fromorbit.com>	2015-06-04 05:01:49 +0200
committer	Dave Chinner <david@fromorbit.com>	2015-06-04 05:01:49 +0200
commit		66e8ac7bfa0a591b8cfaa447db2daa5b6e9221d0 (patch)
tree		e21331c5c8ad224d1b8d7f8762f78f4377c881ab /fs/xfs/xfs_file.c
parent		Merge branch 'xfs-sparse-inode' into for-next (diff)
parent		xfs: add initial DAX support (diff)
download	linux-66e8ac7bfa0a591b8cfaa447db2daa5b6e9221d0.tar.xz
		linux-66e8ac7bfa0a591b8cfaa447db2daa5b6e9221d0.zip
Merge branch 'xfs-dax-support' into for-next
Diffstat (limited to 'fs/xfs/xfs_file.c')
-rw-r--r--	fs/xfs/xfs_file.c	162
1 file changed, 98 insertions(+), 64 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 3b7591224f4a..84b2421b22ae 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -79,14 +79,15 @@ xfs_rw_ilock_demote(
}
/*
- * xfs_iozero
+ * xfs_iozero clears the specified range supplied via the page cache (except in
+ * the DAX case). Writes through the page cache will allocate blocks over holes,
+ * though the callers usually map the holes first and avoid them. If a block is
+ * not completely zeroed, then it will be read from disk before being partially
+ * zeroed.
*
- * xfs_iozero clears the specified range of buffer supplied,
- * and marks all the affected blocks as valid and modified. If
- * an affected block is not allocated, it will be allocated. If
- * an affected block is not completely overwritten, and is not
- * valid before the operation, it will be read from disk before
- * being partially zeroed.
+ * In the DAX case, we can just directly write to the underlying pages. This
+ * will not allocate blocks, but will avoid holes and unwritten extents and so
+ * not do unnecessary work.
*/
int
xfs_iozero(
@@ -96,7 +97,8 @@ xfs_iozero(
{
struct page *page;
struct address_space *mapping;
- int status;
+ int status = 0;
+
mapping = VFS_I(ip)->i_mapping;
do {
@@ -108,20 +110,27 @@ xfs_iozero(
if (bytes > count)
bytes = count;
- status = pagecache_write_begin(NULL, mapping, pos, bytes,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
- if (status)
- break;
+ if (IS_DAX(VFS_I(ip))) {
+ status = dax_zero_page_range(VFS_I(ip), pos, bytes,
+ xfs_get_blocks_direct);
+ if (status)
+ break;
+ } else {
+ status = pagecache_write_begin(NULL, mapping, pos, bytes,
+ AOP_FLAG_UNINTERRUPTIBLE,
+ &page, &fsdata);
+ if (status)
+ break;
- zero_user(page, offset, bytes);
+ zero_user(page, offset, bytes);
- status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
- page, fsdata);
- WARN_ON(status <= 0); /* can't return less than zero! */
+ status = pagecache_write_end(NULL, mapping, pos, bytes,
+ bytes, page, fsdata);
+ WARN_ON(status <= 0); /* can't return less than zero! */
+ status = 0;
+ }
pos += bytes;
count -= bytes;
- status = 0;
} while (count);
return status;
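
The loop above walks the range one page-sized chunk at a time, peeling
off a possibly partial first chunk before zeroing. A minimal userspace
sketch of the same arithmetic, assuming a 4 KiB chunk size and a plain
in-memory buffer standing in for the page cache / DAX pages:

#include <stddef.h>
#include <string.h>

#define CHUNK_SIZE 4096		/* stands in for PAGE_CACHE_SIZE */

/* Zero [pos, pos + count) of buf, one chunk at a time. */
static void zero_range(unsigned char *buf, size_t pos, size_t count)
{
        while (count) {
                size_t offset = pos & (CHUNK_SIZE - 1); /* into the chunk */
                size_t bytes = CHUNK_SIZE - offset;     /* rest of chunk */

                if (bytes > count)
                        bytes = count;
                /* the DAX branch writes the backing pages directly; the
                 * page cache branch zeroes via write_begin/write_end */
                memset(buf + pos, 0, bytes);
                pos += bytes;
                count -= bytes;
        }
}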
@@ -284,7 +293,7 @@ xfs_file_read_iter(
if (file->f_mode & FMODE_NOCMTIME)
ioflags |= XFS_IO_INVIS;
- if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
+ if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
xfs_buftarg_t *target =
XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -378,7 +387,11 @@ xfs_file_splice_read(
trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
- ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+ /* for dax, we need to avoid the page cache */
+ if (IS_DAX(VFS_I(ip)))
+ ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
+ else
+ ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
if (ret > 0)
XFS_STATS_ADD(xs_read_bytes, ret);
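
DAX pages are not page cache pages, so they cannot be linked into a
pipe by reference; the copying fallback preserves splice semantics at
the cost of a data copy. A userspace sketch of exercising this path
with splice(2); the file path is hypothetical and assumes a DAX mount:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int pipefd[2];
        int fd = open("/mnt/pmem/file", O_RDONLY); /* hypothetical DAX file */

        if (fd < 0 || pipe(pipefd) < 0)
                return 1;
        /* move up to 64 KiB from the file into the pipe; on a DAX
         * inode the kernel takes the copying path added above */
        ssize_t n = splice(fd, NULL, pipefd[1], NULL, 65536, 0);
        return n < 0;
}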
@@ -672,7 +685,7 @@ xfs_file_dio_aio_write(
mp->m_rtdev_targp : mp->m_ddev_targp;
/* DIO must be aligned to device logical sector size */
- if ((pos | count) & target->bt_logical_sectormask)
+ if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
return -EINVAL;
/* "unaligned" here means not aligned to a filesystem block */
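
The relaxed check is a power-of-two mask test: (pos | count) has a low
bit set iff either value is misaligned. A small sketch with illustrative
names; only the IS_DAX exemption itself comes from the hunk above:

#include <stdbool.h>
#include <stdint.h>

/* Classic direct I/O must be aligned to the device's logical sector
 * size; DAX bypasses the block layer and is byte-granular. */
static bool dio_range_ok(uint64_t pos, uint64_t count,
                         uint64_t sector_size, bool is_dax)
{
        if (is_dax)
                return true;
        /* sector_size is a power of two, so the mask test suffices */
        return ((pos | count) & (sector_size - 1)) == 0;
}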
@@ -758,8 +771,11 @@ xfs_file_dio_aio_write(
out:
xfs_rw_iunlock(ip, iolock);
- /* No fallback to buffered IO on errors for XFS. */
- ASSERT(ret < 0 || ret == count);
+ /*
+ * No fallback to buffered IO on errors for XFS. DAX can result in
+ * partial writes, but direct IO will either complete fully or fail.
+ */
+ ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
return ret;
}
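
Since DAX writes may now complete partially, callers need the usual
short-write handling rather than assuming all-or-nothing semantics. A
userspace sketch of the corresponding retry loop; nothing XFS-specific:

#include <errno.h>
#include <unistd.h>

/* Write all of buf, retrying short writes and EINTR. */
static ssize_t write_all(int fd, const char *buf, size_t count)
{
        size_t done = 0;

        while (done < count) {
                ssize_t n = write(fd, buf + done, count - done);

                if (n < 0) {
                        if (errno == EINTR)
                                continue;
                        return -1;
                }
                done += n;	/* a DAX write may be partial */
        }
        return done;
}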
@@ -842,7 +858,7 @@ xfs_file_write_iter(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
- if (unlikely(iocb->ki_flags & IOCB_DIRECT))
+ if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
ret = xfs_file_dio_aio_write(iocb, from);
else
ret = xfs_file_buffered_aio_write(iocb, from);
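
The dispatch change is the heart of the write path: a DAX inode has no
page cache to buffer into, so it always takes the direct route even
without O_DIRECT. A toy predicate with illustrative names:

#include <stdbool.h>

enum write_path { WRITE_PATH_DIRECT, WRITE_PATH_BUFFERED };

/* DAX writes always go direct, whether or not O_DIRECT was used. */
static enum write_path pick_write_path(bool iocb_direct, bool is_dax)
{
        return (iocb_direct || is_dax) ? WRITE_PATH_DIRECT
                                       : WRITE_PATH_BUFFERED;
}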
@@ -1063,17 +1079,6 @@ xfs_file_readdir(
return xfs_readdir(ip, ctx, bufsize);
}
-STATIC int
-xfs_file_mmap(
- struct file *filp,
- struct vm_area_struct *vma)
-{
- vma->vm_ops = &xfs_file_vm_ops;
-
- file_accessed(filp);
- return 0;
-}
-
/*
* This type is designed to indicate the type of offset we would like
* to search from page cache for xfs_seek_hole_data().
@@ -1454,48 +1459,83 @@ xfs_file_llseek(
* ordering of:
*
 * mmap_sem (MM)
- *   i_mmap_lock (XFS - truncate serialisation)
- *     page_lock (MM)
- *       i_lock (XFS - extent map serialisation)
+ *   sb_start_pagefault(vfs, freeze)
+ *     i_mmap_lock (XFS - truncate serialisation)
+ *       page_lock (MM)
+ *         i_lock (XFS - extent map serialisation)
+ */
+
+/*
+ * mmap()d file has taken write protection fault and is being made writable. We
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
*/
STATIC int
-xfs_filemap_fault(
+xfs_filemap_page_mkwrite(
struct vm_area_struct *vma,
struct vm_fault *vmf)
{
- struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host);
- int error;
+ struct inode *inode = file_inode(vma->vm_file);
+ int ret;
- trace_xfs_filemap_fault(ip);
+ trace_xfs_filemap_page_mkwrite(XFS_I(inode));
- xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
- error = filemap_fault(vma, vmf);
- xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- return error;
+ if (IS_DAX(inode)) {
+ ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
+ xfs_end_io_dax_write);
+ } else {
+ ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+ ret = block_page_mkwrite_return(ret);
+ }
+
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ sb_end_pagefault(inode->i_sb);
+
+ return ret;
}
-/*
- * mmap()d file has taken write protection fault and is being made writable. We
- * can set the page state up correctly for a writable page, which means we can
- * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
- * mapping.
- */
STATIC int
-xfs_filemap_page_mkwrite(
+xfs_filemap_fault(
struct vm_area_struct *vma,
struct vm_fault *vmf)
{
- struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host);
- int error;
+ struct xfs_inode *ip = XFS_I(file_inode(vma->vm_file));
+ int ret;
+
+ trace_xfs_filemap_fault(ip);
- trace_xfs_filemap_page_mkwrite(ip);
+ /* DAX can shortcut the normal fault path on write faults! */
+ if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip)))
+ return xfs_filemap_page_mkwrite(vma, vmf);
xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
- error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+ ret = filemap_fault(vma, vmf);
xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
- return error;
+ return ret;
+}
+
+static const struct vm_operations_struct xfs_file_vm_ops = {
+ .fault = xfs_filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = xfs_filemap_page_mkwrite,
+};
+
+STATIC int
+xfs_file_mmap(
+ struct file *filp,
+ struct vm_area_struct *vma)
+{
+ file_accessed(filp);
+ vma->vm_ops = &xfs_file_vm_ops;
+ if (IS_DAX(file_inode(filp)))
+ vma->vm_flags |= VM_MIXEDMAP;
+ return 0;
}
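
Together, the reordered handlers give page faults freeze protection and
correct delalloc/ENOSPC accounting, and VM_MIXEDMAP lets the VM install
raw PFN mappings for persistent memory that may lack a struct page.
From userspace the result is plain mmap(2); a minimal sketch assuming a
file on a hypothetical DAX mount at /mnt/pmem:

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/pmem/file", O_RDWR); /* hypothetical DAX file */
        char *p;

        if (fd < 0)
                return 1;
        p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;
        /* the first store write-faults; on DAX the shortcut above sends
         * it straight to xfs_filemap_page_mkwrite() */
        memcpy(p, "hello", 5);
        msync(p, 4096, MS_SYNC);	/* persist data and metadata */
        munmap(p, 4096);
        close(fd);
        return 0;
}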
const struct file_operations xfs_file_operations = {
@@ -1526,9 +1566,3 @@ const struct file_operations xfs_dir_file_operations = {
#endif
.fsync = xfs_dir_fsync,
};
-
-static const struct vm_operations_struct xfs_file_vm_ops = {
- .fault = xfs_filemap_fault,
- .map_pages = filemap_map_pages,
- .page_mkwrite = xfs_filemap_page_mkwrite,
-};