diff options
Diffstat (limited to 'fs')
33 files changed, 1010 insertions, 870 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index b8fcb416be72..4524916fa200 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -10,6 +10,9 @@ config DCACHE_WORD_ACCESS if BLOCK +config FS_IOMAP + bool + source "fs/ext2/Kconfig" source "fs/ext4/Kconfig" source "fs/jbd2/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index 85b6e13b62d3..ed2b63257ba9 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -49,6 +49,7 @@ obj-$(CONFIG_COREDUMP) += coredump.o obj-$(CONFIG_SYSCTL) += drop_caches.o obj-$(CONFIG_FHANDLE) += fhandle.o +obj-$(CONFIG_FS_IOMAP) += iomap.o obj-y += quota/ diff --git a/fs/buffer.c b/fs/buffer.c index 754813a6962b..228288a7de38 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -21,6 +21,7 @@ #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/fs.h> +#include <linux/iomap.h> #include <linux/mm.h> #include <linux/percpu.h> #include <linux/slab.h> @@ -1891,8 +1892,62 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) } EXPORT_SYMBOL(page_zero_new_buffers); -int __block_write_begin(struct page *page, loff_t pos, unsigned len, - get_block_t *get_block) +static void +iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, + struct iomap *iomap) +{ + loff_t offset = block << inode->i_blkbits; + + bh->b_bdev = iomap->bdev; + + /* + * Block points to offset in file we need to map, iomap contains + * the offset at which the map starts. If the map ends before the + * current block, then do not map the buffer and let the caller + * handle it. + */ + BUG_ON(offset >= iomap->offset + iomap->length); + + switch (iomap->type) { + case IOMAP_HOLE: + /* + * If the buffer is not up to date or beyond the current EOF, + * we need to mark it as new to ensure sub-block zeroing is + * executed if necessary. + */ + if (!buffer_uptodate(bh) || + (offset >= i_size_read(inode))) + set_buffer_new(bh); + break; + case IOMAP_DELALLOC: + if (!buffer_uptodate(bh) || + (offset >= i_size_read(inode))) + set_buffer_new(bh); + set_buffer_uptodate(bh); + set_buffer_mapped(bh); + set_buffer_delay(bh); + break; + case IOMAP_UNWRITTEN: + /* + * For unwritten regions, we always need to ensure that + * sub-block writes cause the regions in the block we are not + * writing to are zeroed. Set the buffer as new to ensure this. + */ + set_buffer_new(bh); + set_buffer_unwritten(bh); + /* FALLTHRU */ + case IOMAP_MAPPED: + if (offset >= i_size_read(inode)) + set_buffer_new(bh); + bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) + + ((offset - iomap->offset) >> inode->i_blkbits); + set_buffer_mapped(bh); + break; + } +} + +int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block, struct iomap *iomap) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; @@ -1928,9 +1983,14 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); - err = get_block(inode, block, bh, 1); - if (err) - break; + if (get_block) { + err = get_block(inode, block, bh, 1); + if (err) + break; + } else { + iomap_to_bh(inode, block, bh, iomap); + } + if (buffer_new(bh)) { unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); @@ -1971,6 +2031,12 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, page_zero_new_buffers(page, from, to); return err; } + +int __block_write_begin(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block) +{ + return __block_write_begin_int(page, pos, len, get_block, NULL); +} EXPORT_SYMBOL(__block_write_begin); static int __block_commit_write(struct inode *inode, struct page *page, diff --git a/fs/internal.h b/fs/internal.h index b71deeecea17..c0c6f493ab8a 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -11,6 +11,7 @@ struct super_block; struct file_system_type; +struct iomap; struct linux_binprm; struct path; struct mount; @@ -39,6 +40,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) * buffer.c */ extern void guard_bio_eod(int rw, struct bio *bio); +extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block, struct iomap *iomap); /* * char_dev.c diff --git a/fs/iomap.c b/fs/iomap.c new file mode 100644 index 000000000000..48141b8eff5f --- /dev/null +++ b/fs/iomap.c @@ -0,0 +1,497 @@ +/* + * Copyright (C) 2010 Red Hat, Inc. + * Copyright (c) 2016 Christoph Hellwig. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/module.h> +#include <linux/compiler.h> +#include <linux/fs.h> +#include <linux/iomap.h> +#include <linux/uaccess.h> +#include <linux/gfp.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include <linux/file.h> +#include <linux/uio.h> +#include <linux/backing-dev.h> +#include <linux/buffer_head.h> +#include <linux/dax.h> +#include "internal.h" + +typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, + void *data, struct iomap *iomap); + +/* + * Execute a iomap write on a segment of the mapping that spans a + * contiguous range of pages that have identical block mapping state. + * + * This avoids the need to map pages individually, do individual allocations + * for each page and most importantly avoid the need for filesystem specific + * locking per page. Instead, all the operations are amortised over the entire + * range of pages. It is assumed that the filesystems will lock whatever + * resources they require in the iomap_begin call, and release them in the + * iomap_end call. + */ +static loff_t +iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, + struct iomap_ops *ops, void *data, iomap_actor_t actor) +{ + struct iomap iomap = { 0 }; + loff_t written = 0, ret; + + /* + * Need to map a range from start position for length bytes. This can + * span multiple pages - it is only guaranteed to return a range of a + * single type of pages (e.g. all into a hole, all mapped or all + * unwritten). Failure at this point has nothing to undo. + * + * If allocation is required for this range, reserve the space now so + * that the allocation is guaranteed to succeed later on. Once we copy + * the data into the page cache pages, then we cannot fail otherwise we + * expose transient stale data. If the reserve fails, we can safely + * back out at this point as there is nothing to undo. + */ + ret = ops->iomap_begin(inode, pos, length, flags, &iomap); + if (ret) + return ret; + if (WARN_ON(iomap.offset > pos)) + return -EIO; + + /* + * Cut down the length to the one actually provided by the filesystem, + * as it might not be able to give us the whole size that we requested. + */ + if (iomap.offset + iomap.length < pos + length) + length = iomap.offset + iomap.length - pos; + + /* + * Now that we have guaranteed that the space allocation will succeed. + * we can do the copy-in page by page without having to worry about + * failures exposing transient data. + */ + written = actor(inode, pos, length, data, &iomap); + + /* + * Now the data has been copied, commit the range we've copied. This + * should not fail unless the filesystem has had a fatal error. + */ + ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0, + flags, &iomap); + + return written ? written : ret; +} + +static void +iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) +{ + loff_t i_size = i_size_read(inode); + + /* + * Only truncate newly allocated pages beyoned EOF, even if the + * write started inside the existing inode size. + */ + if (pos + len > i_size) + truncate_pagecache_range(inode, max(pos, i_size), pos + len); +} + +static int +iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, + struct page **pagep, struct iomap *iomap) +{ + pgoff_t index = pos >> PAGE_SHIFT; + struct page *page; + int status = 0; + + BUG_ON(pos + len > iomap->offset + iomap->length); + + page = grab_cache_page_write_begin(inode->i_mapping, index, flags); + if (!page) + return -ENOMEM; + + status = __block_write_begin_int(page, pos, len, NULL, iomap); + if (unlikely(status)) { + unlock_page(page); + put_page(page); + page = NULL; + + iomap_write_failed(inode, pos, len); + } + + *pagep = page; + return status; +} + +static int +iomap_write_end(struct inode *inode, loff_t pos, unsigned len, + unsigned copied, struct page *page) +{ + int ret; + + ret = generic_write_end(NULL, inode->i_mapping, pos, len, + copied, page, NULL); + if (ret < len) + iomap_write_failed(inode, pos, len); + return ret; +} + +static loff_t +iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + struct iov_iter *i = data; + long status = 0; + ssize_t written = 0; + unsigned int flags = AOP_FLAG_NOFS; + + /* + * Copies from kernel address space cannot fail (NFSD is a big user). + */ + if (!iter_is_iovec(i)) + flags |= AOP_FLAG_UNINTERRUPTIBLE; + + do { + struct page *page; + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ + + offset = (pos & (PAGE_SIZE - 1)); + bytes = min_t(unsigned long, PAGE_SIZE - offset, + iov_iter_count(i)); +again: + if (bytes > length) + bytes = length; + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + status = -EFAULT; + break; + } + + status = iomap_write_begin(inode, pos, bytes, flags, &page, + iomap); + if (unlikely(status)) + break; + + if (mapping_writably_mapped(inode->i_mapping)) + flush_dcache_page(page); + + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); + pagefault_enable(); + + flush_dcache_page(page); + mark_page_accessed(page); + + status = iomap_write_end(inode, pos, bytes, copied, page); + if (unlikely(status < 0)) + break; + copied = status; + + cond_resched(); + + iov_iter_advance(i, copied); + if (unlikely(copied == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + pos += copied; + written += copied; + length -= copied; + + balance_dirty_pages_ratelimited(inode->i_mapping); + } while (iov_iter_count(i) && length); + + return written ? written : status; +} + +ssize_t +iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, + struct iomap_ops *ops) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + loff_t pos = iocb->ki_pos, ret = 0, written = 0; + + while (iov_iter_count(iter)) { + ret = iomap_apply(inode, pos, iov_iter_count(iter), + IOMAP_WRITE, ops, iter, iomap_write_actor); + if (ret <= 0) + break; + pos += ret; + written += ret; + } + + return written ? written : ret; +} +EXPORT_SYMBOL_GPL(iomap_file_buffered_write); + +static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, + unsigned bytes, struct iomap *iomap) +{ + struct page *page; + int status; + + status = iomap_write_begin(inode, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap); + if (status) + return status; + + zero_user(page, offset, bytes); + mark_page_accessed(page); + + return iomap_write_end(inode, pos, bytes, bytes, page); +} + +static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, + struct iomap *iomap) +{ + sector_t sector = iomap->blkno + + (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9); + + return __dax_zero_page_range(iomap->bdev, sector, offset, bytes); +} + +static loff_t +iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, + void *data, struct iomap *iomap) +{ + bool *did_zero = data; + loff_t written = 0; + int status; + + /* already zeroed? we're done. */ + if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) + return count; + + do { + unsigned offset, bytes; + + offset = pos & (PAGE_SIZE - 1); /* Within page */ + bytes = min_t(unsigned, PAGE_SIZE - offset, count); + + if (IS_DAX(inode)) + status = iomap_dax_zero(pos, offset, bytes, iomap); + else + status = iomap_zero(inode, pos, offset, bytes, iomap); + if (status < 0) + return status; + + pos += bytes; + count -= bytes; + written += bytes; + if (did_zero) + *did_zero = true; + } while (count > 0); + + return written; +} + +int +iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, + struct iomap_ops *ops) +{ + loff_t ret; + + while (len > 0) { + ret = iomap_apply(inode, pos, len, IOMAP_ZERO, + ops, did_zero, iomap_zero_range_actor); + if (ret <= 0) + return ret; + + pos += ret; + len -= ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(iomap_zero_range); + +int +iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, + struct iomap_ops *ops) +{ + unsigned blocksize = (1 << inode->i_blkbits); + unsigned off = pos & (blocksize - 1); + + /* Block boundary? Nothing to do */ + if (!off) + return 0; + return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); +} +EXPORT_SYMBOL_GPL(iomap_truncate_page); + +static loff_t +iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, + void *data, struct iomap *iomap) +{ + struct page *page = data; + int ret; + + ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length, + NULL, iomap); + if (ret) + return ret; + + block_commit_write(page, 0, length); + return length; +} + +int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + struct iomap_ops *ops) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); + unsigned long length; + loff_t offset, size; + ssize_t ret; + + lock_page(page); + size = i_size_read(inode); + if ((page->mapping != inode->i_mapping) || + (page_offset(page) > size)) { + /* We overload EFAULT to mean page got truncated */ + ret = -EFAULT; + goto out_unlock; + } + + /* page is wholly or partially inside EOF */ + if (((page->index + 1) << PAGE_SHIFT) > size) + length = size & ~PAGE_MASK; + else + length = PAGE_SIZE; + + offset = page_offset(page); + while (length > 0) { + ret = iomap_apply(inode, offset, length, IOMAP_WRITE, + ops, page, iomap_page_mkwrite_actor); + if (unlikely(ret <= 0)) + goto out_unlock; + offset += ret; + length -= ret; + } + + set_page_dirty(page); + wait_for_stable_page(page); + return 0; +out_unlock: + unlock_page(page); + return ret; +} +EXPORT_SYMBOL_GPL(iomap_page_mkwrite); + +struct fiemap_ctx { + struct fiemap_extent_info *fi; + struct iomap prev; +}; + +static int iomap_to_fiemap(struct fiemap_extent_info *fi, + struct iomap *iomap, u32 flags) +{ + switch (iomap->type) { + case IOMAP_HOLE: + /* skip holes */ + return 0; + case IOMAP_DELALLOC: + flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN; + break; + case IOMAP_UNWRITTEN: + flags |= FIEMAP_EXTENT_UNWRITTEN; + break; + case IOMAP_MAPPED: + break; + } + + return fiemap_fill_next_extent(fi, iomap->offset, + iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0, + iomap->length, flags | FIEMAP_EXTENT_MERGED); + +} + +static loff_t +iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + struct fiemap_ctx *ctx = data; + loff_t ret = length; + + if (iomap->type == IOMAP_HOLE) + return length; + + ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0); + ctx->prev = *iomap; + switch (ret) { + case 0: /* success */ + return length; + case 1: /* extent array full */ + return 0; + default: + return ret; + } +} + +int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, + loff_t start, loff_t len, struct iomap_ops *ops) +{ + struct fiemap_ctx ctx; + loff_t ret; + + memset(&ctx, 0, sizeof(ctx)); + ctx.fi = fi; + ctx.prev.type = IOMAP_HOLE; + + ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + ret = filemap_write_and_wait(inode->i_mapping); + if (ret) + return ret; + + while (len > 0) { + ret = iomap_apply(inode, start, len, 0, ops, &ctx, + iomap_fiemap_actor); + if (ret < 0) + return ret; + if (ret == 0) + break; + + start += ret; + len -= ret; + } + + if (ctx.prev.type != IOMAP_HOLE) { + ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST); + if (ret < 0) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(iomap_fiemap); diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index e55b5242614d..4df16ae1633a 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -2,6 +2,7 @@ * Copyright (c) 2014-2016 Christoph Hellwig. */ #include <linux/exportfs.h> +#include <linux/iomap.h> #include <linux/genhd.h> #include <linux/slab.h> #include <linux/pr.h> diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 6c3b316f932e..4ebaaf4b8d8a 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -3,6 +3,7 @@ */ #include <linux/sunrpc/svc.h> #include <linux/exportfs.h> +#include <linux/iomap.h> #include <linux/nfs4.h> #include "nfsd.h" diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 5d47b4df61ea..35faf128f36d 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -4,6 +4,7 @@ config XFS_FS depends on (64BIT || LBDAF) select EXPORTFS select LIBCRC32C + select FS_IOMAP help XFS is a high performance journaling filesystem which originated on the SGI IRIX platform. It is completely multi-threaded, can diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index e56991d1a970..88c26b827a2d 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -84,7 +84,7 @@ xfs_alloc_lookup_ge( * Lookup the first record less than or equal to [bno, len] * in the btree given by cur. */ -int /* error */ +static int /* error */ xfs_alloc_lookup_le( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index f5b35dc594de..cf268b2d0b6c 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -212,13 +212,6 @@ xfs_free_extent( xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len); /* length of extent */ -int /* error */ -xfs_alloc_lookup_le( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agblock_t bno, /* starting block of extent */ - xfs_extlen_t len, /* length of extent */ - int *stat); /* success/failure */ - int /* error */ xfs_alloc_lookup_ge( struct xfs_btree_cur *cur, /* btree cursor */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 882c8d338891..4f2aed04f827 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -50,7 +50,6 @@ int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_shortform_remove(struct xfs_da_args *args); -int xfs_attr_shortform_list(struct xfs_attr_list_context *context); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); @@ -88,8 +87,6 @@ int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval); void xfs_attr3_leaf_unbalance(struct xfs_da_state *state, struct xfs_da_state_blk *drop_blk, struct xfs_da_state_blk *save_blk); -int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); - /* * Utility routines. */ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index fffe3d01bd9f..f5ec9c5ccae6 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -521,12 +521,8 @@ typedef struct xfs_swapext #define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) /* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ -/* XFS_IOC_FREEZE -- FIFREEZE 119 */ -/* XFS_IOC_THAW -- FITHAW 120 */ -#ifndef FIFREEZE -#define XFS_IOC_FREEZE _IOWR('X', 119, int) -#define XFS_IOC_THAW _IOWR('X', 120, int) -#endif +#define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */ +#define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */ #define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 951c044e24e4..e2e1106c9fad 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -70,7 +70,7 @@ const struct xfs_buf_ops xfs_rtbuf_ops = { * Get a buffer for the bitmap or summary file block specified. * The buffer is returned read and locked. */ -int +static int xfs_rtbuf_get( xfs_mount_t *mp, /* file system mount structure */ xfs_trans_t *tp, /* transaction pointer */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4c463b99fe57..80714ebd54c0 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1143,6 +1143,8 @@ __xfs_get_blocks( ssize_t size; int new = 0; + BUG_ON(create && !direct); + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1150,22 +1152,14 @@ __xfs_get_blocks( ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; - if (!create && direct && offset >= i_size_read(inode)) + if (!create && offset >= i_size_read(inode)) return 0; /* * Direct I/O is usually done on preallocated files, so try getting - * a block mapping without an exclusive lock first. For buffered - * writes we already have the exclusive iolock anyway, so avoiding - * a lock roundtrip here by taking the ilock exclusive from the - * beginning is a useful micro optimization. + * a block mapping without an exclusive lock first. */ - if (create && !direct) { - lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, lockmode); - } else { - lockmode = xfs_ilock_data_map_shared(ip); - } + lockmode = xfs_ilock_data_map_shared(ip); ASSERT(offset <= mp->m_super->s_maxbytes); if (offset + size > mp->m_super->s_maxbytes) @@ -1184,37 +1178,19 @@ __xfs_get_blocks( (imap.br_startblock == HOLESTARTBLOCK || imap.br_startblock == DELAYSTARTBLOCK) || (IS_DAX(inode) && ISUNWRITTEN(&imap)))) { - if (direct || xfs_get_extsz_hint(ip)) { - /* - * xfs_iomap_write_direct() expects the shared lock. It - * is unlocked on return. - */ - if (lockmode == XFS_ILOCK_EXCL) - xfs_ilock_demote(ip, lockmode); - - error = xfs_iomap_write_direct(ip, offset, size, - &imap, nimaps); - if (error) - return error; - new = 1; + /* + * xfs_iomap_write_direct() expects the shared lock. It + * is unlocked on return. + */ + if (lockmode == XFS_ILOCK_EXCL) + xfs_ilock_demote(ip, lockmode); - } else { - /* - * Delalloc reservations do not require a transaction, - * we can go on without dropping the lock here. If we - * are allocating a new delalloc block, make sure that - * we set the new flag so that we mark the buffer new so - * that we know that it is newly allocated if the write - * fails. - */ - if (nimaps && imap.br_startblock == HOLESTARTBLOCK) - new = 1; - error = xfs_iomap_write_delay(ip, offset, size, &imap); - if (error) - goto out_unlock; + error = xfs_iomap_write_direct(ip, offset, size, + &imap, nimaps); + if (error) + return error; + new = 1; - xfs_iunlock(ip, lockmode); - } trace_xfs_get_blocks_alloc(ip, offset, size, ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN : XFS_IO_DELALLOC, &imap); @@ -1235,9 +1211,7 @@ __xfs_get_blocks( } /* trim mapping down to size requested */ - if (direct || size > (1 << inode->i_blkbits)) - xfs_map_trim_size(inode, iblock, bh_result, - &imap, offset, size); + xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size); /* * For unwritten extents do not report a disk address in the buffered @@ -1250,7 +1224,7 @@ __xfs_get_blocks( if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); /* direct IO needs special help */ - if (create && direct) { + if (create) { if (dax_fault) ASSERT(!ISUNWRITTEN(&imap)); else @@ -1279,14 +1253,7 @@ __xfs_get_blocks( (new || ISUNWRITTEN(&imap)))) set_buffer_new(bh_result); - if (imap.br_startblock == DELAYSTARTBLOCK) { - BUG_ON(direct); - if (create) { - set_buffer_uptodate(bh_result); - set_buffer_mapped(bh_result); - set_buffer_delay(bh_result); - } - } + BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK); return 0; @@ -1427,216 +1394,6 @@ xfs_vm_direct_IO( xfs_get_blocks_direct, endio, NULL, flags); } -/* - * Punch out the delalloc blocks we have already allocated. - * - * Don't bother with xfs_setattr given that nothing can have made it to disk yet - * as the page is still locked at this point. - */ -STATIC void -xfs_vm_kill_delalloc_range( - struct inode *inode, - loff_t start, - loff_t end) -{ - struct xfs_inode *ip = XFS_I(inode); - xfs_fileoff_t start_fsb; - xfs_fileoff_t end_fsb; - int error; - - start_fsb = XFS_B_TO_FSB(ip->i_mount, start); - end_fsb = XFS_B_TO_FSB(ip->i_mount, end); - if (end_fsb <= start_fsb) - return; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_alert(ip->i_mount, - "xfs_vm_write_failed: unable to clean up ino %lld", - ip->i_ino); - } - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); -} - -STATIC void -xfs_vm_write_failed( - struct inode *inode, - struct page *page, - loff_t pos, - unsigned len) -{ - loff_t block_offset; - loff_t block_start; - loff_t block_end; - loff_t from = pos & (PAGE_SIZE - 1); - loff_t to = from + len; - struct buffer_head *bh, *head; - struct xfs_mount *mp = XFS_I(inode)->i_mount; - - /* - * The request pos offset might be 32 or 64 bit, this is all fine - * on 64-bit platform. However, for 64-bit pos request on 32-bit - * platform, the high 32-bit will be masked off if we evaluate the - * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is - * 0xfffff000 as an unsigned long, hence the result is incorrect - * which could cause the following ASSERT failed in most cases. - * In order to avoid this, we can evaluate the block_offset of the - * start of the page by using shifts rather than masks the mismatch - * problem. - */ - block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT; - - ASSERT(block_offset + from == pos); - - head = page_buffers(page); - block_start = 0; - for (bh = head; bh != head || !block_start; - bh = bh->b_this_page, block_start = block_end, - block_offset += bh->b_size) { - block_end = block_start + bh->b_size; - - /* skip buffers before the write */ - if (block_end <= from) - continue; - - /* if the buffer is after the write, we're done */ - if (block_start >= to) - break; - - /* - * Process delalloc and unwritten buffers beyond EOF. We can - * encounter unwritten buffers in the event that a file has - * post-EOF unwritten extents and an extending write happens to - * fail (e.g., an unaligned write that also involves a delalloc - * to the same page). - */ - if (!buffer_delay(bh) && !buffer_unwritten(bh)) - continue; - - if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) && - block_offset < i_size_read(inode)) - continue; - - if (buffer_delay(bh)) - xfs_vm_kill_delalloc_range(inode, block_offset, - block_offset + bh->b_size); - - /* - * This buffer does not contain data anymore. make sure anyone - * who finds it knows that for certain. - */ - clear_buffer_delay(bh); - clear_buffer_uptodate(bh); - clear_buffer_mapped(bh); - clear_buffer_new(bh); - clear_buffer_dirty(bh); - clear_buffer_unwritten(bh); - } - -} - -/* - * This used to call block_write_begin(), but it unlocks and releases the page - * on error, and we need that page to be able to punch stale delalloc blocks out - * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at - * the appropriate point. - */ -STATIC int -xfs_vm_write_begin( - struct file *file, - struct address_space *mapping, - loff_t pos, - unsigned len, - unsigned flags, - struct page **pagep, - void **fsdata) -{ - pgoff_t index = pos >> PAGE_SHIFT; - struct page *page; - int status; - struct xfs_mount *mp = XFS_I(mapping->host)->i_mount; - - ASSERT(len <= PAGE_SIZE); - - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; - - status = __block_write_begin(page, pos, len, xfs_get_blocks); - if (xfs_mp_fail_writes(mp)) - status = -EIO; - if (unlikely(status)) { - struct inode *inode = mapping->host; - size_t isize = i_size_read(inode); - - xfs_vm_write_failed(inode, page, pos, len); - unlock_page(page); - - /* - * If the write is beyond EOF, we only want to kill blocks - * allocated in this write, not blocks that were previously - * written successfully. - */ - if (xfs_mp_fail_writes(mp)) - isize = 0; - if (pos + len > isize) { - ssize_t start = max_t(ssize_t, pos, isize); - - truncate_pagecache_range(inode, start, pos + len); - } - - put_page(page); - page = NULL; - } - - *pagep = page; - return status; -} - -/* - * On failure, we only need to kill delalloc blocks beyond EOF in the range of - * this specific write because they will never be written. Previous writes - * beyond EOF where block allocation succeeded do not need to be trashed, so - * only new blocks from this write should be trashed. For blocks within - * EOF, generic_write_end() zeros them so they are safe to leave alone and be - * written with all the other valid data. - */ -STATIC int -xfs_vm_write_end( - struct file *file, - struct address_space *mapping, - loff_t pos, - unsigned len, - unsigned copied, - struct page *page, - void *fsdata) -{ - int ret; - - ASSERT(len <= PAGE_SIZE); - - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (unlikely(ret < len)) { - struct inode *inode = mapping->host; - size_t isize = i_size_read(inode); - loff_t to = pos + len; - - if (to > isize) { - /* only kill blocks in this write beyond EOF */ - if (pos > isize) - isize = pos; - xfs_vm_kill_delalloc_range(inode, isize, to); - truncate_pagecache_range(inode, isize, to); - } - } - return ret; -} - STATIC sector_t xfs_vm_bmap( struct address_space *mapping, @@ -1747,8 +1504,6 @@ const struct address_space_operations xfs_address_space_operations = { .set_page_dirty = xfs_vm_set_page_dirty, .releasepage = xfs_vm_releasepage, .invalidatepage = xfs_vm_invalidatepage, - .write_begin = xfs_vm_write_begin, - .write_end = xfs_vm_write_end, .bmap = xfs_vm_bmap, .direct_IO = xfs_vm_direct_IO, .migratepage = buffer_migrate_page, diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 55d214981ed2..be0b79d8900f 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -322,7 +322,7 @@ xfs_attr3_node_inactive( * Recurse (gasp!) through the attribute nodes until we find leaves. * We're doing a depth-first traversal in order to invalidate everything. */ -int +static int xfs_attr3_root_inactive( struct xfs_trans **trans, struct xfs_inode *dp) diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index d25f26b22ac9..25e76cd6c053 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -65,7 +65,7 @@ xfs_attr_shortform_compare(const void *a, const void *b) * we have to calculate each entries' hashvalue and sort them before * we can begin returning them to the user. */ -int +static int xfs_attr_shortform_list(xfs_attr_list_context_t *context) { attrlist_cursor_kern_t *cursor; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index c53e07a87cd7..cd4a850564f2 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -143,9 +143,7 @@ xfs_bmap_finish( if (committed) { xfs_efi_release(efi); xfs_force_shutdown((*tp)->t_mountp, - (error == -EFSCORRUPTED) ? - SHUTDOWN_CORRUPT_INCORE : - SHUTDOWN_META_IO_ERROR); + SHUTDOWN_META_IO_ERROR); } return error; } @@ -427,7 +425,7 @@ xfs_bmap_count_tree( /* * Count fsblocks of the given fork. */ -int /* error */ +static int /* error */ xfs_bmap_count_blocks( xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode */ @@ -1107,99 +1105,120 @@ error1: /* Just cancel transaction */ return error; } -/* - * Zero file bytes between startoff and endoff inclusive. - * The iolock is held exclusive and no blocks are buffered. - * - * This function is used by xfs_free_file_space() to zero - * partial blocks when the range to free is not block aligned. - * When unreserving space with boundaries that are not block - * aligned we round up the start and round down the end - * boundaries and then use this function to zero the parts of - * the blocks that got dropped during the rounding. - */ -STATIC int -xfs_zero_remaining_bytes( - xfs_inode_t *ip, - xfs_off_t startoff, - xfs_off_t endoff) +static int +xfs_unmap_extent( + struct xfs_inode *ip, + xfs_fileoff_t startoffset_fsb, + xfs_filblks_t len_fsb, + int *done) { - xfs_bmbt_irec_t imap; - xfs_fileoff_t offset_fsb; - xfs_off_t lastoffset; - xfs_off_t offset; - xfs_buf_t *bp; - xfs_mount_t *mp = ip->i_mount; - int nimap; - int error = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_bmap_free free_list; + xfs_fsblock_t firstfsb; + uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + int error; - /* - * Avoid doing I/O beyond eof - it's not necessary - * since nothing can read beyond eof. The space will - * be zeroed when the file is extended anyway. - */ - if (startoff >= XFS_ISIZE(ip)) - return 0; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + if (error) { + ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + return error; + } - if (endoff > XFS_ISIZE(ip)) - endoff = XFS_ISIZE(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot, + ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS); + if (error) + goto out_trans_cancel; - for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { - uint lock_mode; + xfs_trans_ijoin(tp, ip, 0); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - nimap = 1; + xfs_bmap_init(&free_list, &firstfsb); + error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb, + &free_list, done); + if (error) + goto out_bmap_cancel; - lock_mode = xfs_ilock_data_map_shared(ip); - error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); - xfs_iunlock(ip, lock_mode); + error = xfs_bmap_finish(&tp, &free_list, NULL); + if (error) + goto out_bmap_cancel; - if (error || nimap < 1) - break; - ASSERT(imap.br_blockcount >= 1); - ASSERT(imap.br_startoff == offset_fsb); - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + error = xfs_trans_commit(tp); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; - if (imap.br_startblock == HOLESTARTBLOCK || - imap.br_state == XFS_EXT_UNWRITTEN) { - /* skip the entire extent */ - lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + - imap.br_blockcount) - 1; - continue; - } +out_bmap_cancel: + xfs_bmap_cancel(&free_list); +out_trans_cancel: + xfs_trans_cancel(tp); + goto out_unlock; +} - lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; - if (lastoffset > endoff) - lastoffset = endoff; +static int +xfs_adjust_extent_unmap_boundaries( + struct xfs_inode *ip, + xfs_fileoff_t *startoffset_fsb, + xfs_fileoff_t *endoffset_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + int nimap, error; + xfs_extlen_t mod = 0; - /* DAX can just zero the backing device directly */ - if (IS_DAX(VFS_I(ip))) { - error = dax_zero_page_range(VFS_I(ip), offset, - lastoffset - offset + 1, - xfs_get_blocks_direct); - if (error) - return error; - continue; - } + nimap = 1; + error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0); + if (error) + return error; - error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp, - xfs_fsb_to_db(ip, imap.br_startblock), - BTOBB(mp->m_sb.sb_blocksize), - 0, &bp, NULL); - if (error) - return error; + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + xfs_daddr_t block; - memset(bp->b_addr + - (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), - 0, lastoffset - offset + 1); + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + block = imap.br_startblock; + mod = do_div(block, mp->m_sb.sb_rextsize); + if (mod) + *startoffset_fsb += mp->m_sb.sb_rextsize - mod; + } - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - return error; + nimap = 1; + error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0); + if (error) + return error; + + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + mod++; + if (mod && mod != mp->m_sb.sb_rextsize) + *endoffset_fsb -= mod; } - return error; + + return 0; +} + +static int +xfs_flush_unmap_range( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + xfs_off_t rounding, start, end; + int error; + + /* wait for the completion of any pending DIOs */ + inode_dio_wait(inode); + + rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); + start = round_down(offset, rounding); + end = round_up(offset + len, rounding) - 1; + + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (error) + return error; + truncate_pagecache_range(inode, start, end); + return 0; } int @@ -1208,24 +1227,10 @@ xfs_free_file_space( xfs_off_t offset, xfs_off_t len) { - int done; - xfs_fileoff_t endoffset_fsb; - int error; - xfs_fsblock_t firstfsb; - xfs_bmap_free_t free_list; - xfs_bmbt_irec_t imap; - xfs_off_t ioffset; - xfs_off_t iendoffset; - xfs_extlen_t mod=0; - xfs_mount_t *mp; - int nimap; - uint resblks; - xfs_off_t rounding; - int rt; + struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t startoffset_fsb; - xfs_trans_t *tp; - - mp = ip->i_mount; + xfs_fileoff_t endoffset_fsb; + int done = 0, error; trace_xfs_free_file_space(ip); @@ -1233,135 +1238,45 @@ xfs_free_file_space( if (error) return error; - error = 0; if (len <= 0) /* if nothing being freed */ - return error; - rt = XFS_IS_REALTIME_INODE(ip); - startoffset_fsb = XFS_B_TO_FSB(mp, offset); - endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); - - /* wait for the completion of any pending DIOs */ - inode_dio_wait(VFS_I(ip)); + return 0; - rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); - ioffset = round_down(offset, rounding); - iendoffset = round_up(offset + len, rounding) - 1; - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset, - iendoffset); + error = xfs_flush_unmap_range(ip, offset, len); if (error) - goto out; - truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset); + return error; + + startoffset_fsb = XFS_B_TO_FSB(mp, offset); + endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); /* - * Need to zero the stuff we're not freeing, on disk. - * If it's a realtime file & can't use unwritten extents then we - * actually need to zero the extent edges. Otherwise xfs_bunmapi - * will take care of it for us. + * Need to zero the stuff we're not freeing, on disk. If it's a RT file + * and we can't use unwritten extents then we actually need to ensure + * to zero the whole extent, otherwise we just need to take of block + * boundaries, and xfs_bunmapi will handle the rest. */ - if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { - nimap = 1; - error = xfs_bmapi_read(ip, startoffset_fsb, 1, - &imap, &nimap, 0); + if (XFS_IS_REALTIME_INODE(ip) && + !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb, + &endoffset_fsb); if (error) - goto out; - ASSERT(nimap == 0 || nimap == 1); - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - xfs_daddr_t block; - - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - block = imap.br_startblock; - mod = do_div(block, mp->m_sb.sb_rextsize); - if (mod) - startoffset_fsb += mp->m_sb.sb_rextsize - mod; - } - nimap = 1; - error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1, - &imap, &nimap, 0); - if (error) - goto out; - ASSERT(nimap == 0 || nimap == 1); - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - mod++; - if (mod && (mod != mp->m_sb.sb_rextsize)) - endoffset_fsb -= mod; - } - } - if ((done = (endoffset_fsb <= startoffset_fsb))) - /* - * One contiguous piece to clear - */ - error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1); - else { - /* - * Some full blocks, possibly two pieces to clear - */ - if (offset < XFS_FSB_TO_B(mp, startoffset_fsb)) - error = xfs_zero_remaining_bytes(ip, offset, - XFS_FSB_TO_B(mp, startoffset_fsb) - 1); - if (!error && - XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len) - error = xfs_zero_remaining_bytes(ip, - XFS_FSB_TO_B(mp, endoffset_fsb), - offset + len - 1); + return error; } - /* - * free file space until done or until there is an error - */ - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - while (!error && !done) { - - /* - * allocate and setup the transaction. Allow this - * transaction to dip into the reserve blocks to ensure - * the freeing of the space succeeds at ENOSPC. - */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, - &tp); - if (error) { - ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - break; + if (endoffset_fsb > startoffset_fsb) { + while (!done) { + error = xfs_unmap_extent(ip, startoffset_fsb, + endoffset_fsb - startoffset_fsb, &done); + if (error) + return error; } - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota(tp, mp, - ip->i_udquot, ip->i_gdquot, ip->i_pdquot, - resblks, 0, XFS_QMOPT_RES_REGBLKS); - if (error) - goto error1; - - xfs_trans_ijoin(tp, ip, 0); - - /* - * issue the bunmapi() call to free the blocks - */ - xfs_bmap_init(&free_list, &firstfsb); - error = xfs_bunmapi(tp, ip, startoffset_fsb, - endoffset_fsb - startoffset_fsb, - 0, 2, &firstfsb, &free_list, &done); - if (error) - goto error0; - - /* - * complete the transaction - */ - error = xfs_bmap_finish(&tp, &free_list, NULL); - if (error) - goto error0; - - error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); } - out: - return error; - - error0: - xfs_bmap_cancel(&free_list); - error1: - xfs_trans_cancel(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out; + /* + * Now that we've unmap all full blocks we'll have to zero out any + * partial block at the beginning and/or end. xfs_zero_range is + * smart enough to skip any holes, including those we just created. + */ + return xfs_zero_range(ip, offset, len, NULL); } /* diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 4ec85d1043a0..f20071432ca6 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -31,8 +31,6 @@ struct xfs_bmalloca; int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, int whichfork, int *eof); -int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, - int whichfork, int *count); int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, xfs_fileoff_t start_fsb, xfs_fileoff_t length); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index f14daebbc531..4665ff6e5153 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1776,18 +1776,33 @@ xfs_buf_cmp( return 0; } +/* + * submit buffers for write. + * + * When we have a large buffer list, we do not want to hold all the buffers + * locked while we block on the request queue waiting for IO dispatch. To avoid + * this problem, we lock and submit buffers in groups of 50, thereby minimising + * the lock hold times for lists which may contain thousands of objects. + * + * To do this, we sort the buffer list before we walk the list to lock and + * submit buffers, and we plug and unplug around each group of buffers we + * submit. + */ static int -__xfs_buf_delwri_submit( +xfs_buf_delwri_submit_buffers( struct list_head *buffer_list, - struct list_head *io_list, - bool wait) + struct list_head *wait_list) { - struct blk_plug plug; struct xfs_buf *bp, *n; + LIST_HEAD (submit_list); int pinned = 0; + struct blk_plug plug; + + list_sort(NULL, buffer_list, xfs_buf_cmp); + blk_start_plug(&plug); list_for_each_entry_safe(bp, n, buffer_list, b_list) { - if (!wait) { + if (!wait_list) { if (xfs_buf_ispinned(bp)) { pinned++; continue; @@ -1810,25 +1825,21 @@ __xfs_buf_delwri_submit( continue; } - list_move_tail(&bp->b_list, io_list); trace_xfs_buf_delwri_split(bp, _RET_IP_); - } - - list_sort(NULL, io_list, xfs_buf_cmp); - - blk_start_plug(&plug); - list_for_each_entry_safe(bp, n, io_list, b_list) { - bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); - bp->b_flags |= XBF_WRITE | XBF_ASYNC; /* - * we do all Io submission async. This means if we need to wait - * for IO completion we need to take an extra reference so the - * buffer is still valid on the other side. + * We do all IO submission async. This means if we need + * to wait for IO completion we need to take an extra + * reference so the buffer is still valid on the other + * side. We need to move the buffer onto the io_list + * at this point so the caller can still access it. */ - if (wait) + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); + bp->b_flags |= XBF_WRITE | XBF_ASYNC; + if (wait_list) { xfs_buf_hold(bp); - else + list_move_tail(&bp->b_list, wait_list); + } else list_del_init(&bp->b_list); xfs_buf_submit(bp); @@ -1851,8 +1862,7 @@ int xfs_buf_delwri_submit_nowait( struct list_head *buffer_list) { - LIST_HEAD (io_list); - return __xfs_buf_delwri_submit(buffer_list, &io_list, false); + return xfs_buf_delwri_submit_buffers(buffer_list, NULL); } /* @@ -1867,15 +1877,15 @@ int xfs_buf_delwri_submit( struct list_head *buffer_list) { - LIST_HEAD (io_list); + LIST_HEAD (wait_list); int error = 0, error2; struct xfs_buf *bp; - __xfs_buf_delwri_submit(buffer_list, &io_list, true); + xfs_buf_delwri_submit_buffers(buffer_list, &wait_list); /* Wait for IO to complete. */ - while (!list_empty(&io_list)) { - bp = list_first_entry(&io_list, struct xfs_buf, b_list); + while (!list_empty(&wait_list)) { + bp = list_first_entry(&wait_list, struct xfs_buf, b_list); list_del_init(&bp->b_list); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 34257992934c..2e95ad036316 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -359,7 +359,7 @@ xfs_buf_item_format( for (i = 0; i < bip->bli_format_count; i++) { xfs_buf_item_format_segment(bip, lv, &vecp, offset, &bip->bli_formats[i]); - offset += bp->b_maps[i].bm_len; + offset += BBTOB(bp->b_maps[i].bm_len); } /* @@ -915,20 +915,28 @@ xfs_buf_item_log( for (i = 0; i < bip->bli_format_count; i++) { if (start > last) break; - end = start + BBTOB(bp->b_maps[i].bm_len); + end = start + BBTOB(bp->b_maps[i].bm_len) - 1; + + /* skip to the map that includes the first byte to log */ if (first > end) { start += BBTOB(bp->b_maps[i].bm_len); continue; } + + /* + * Trim the range to this segment and mark it in the bitmap. + * Note that we must convert buffer offsets to segment relative + * offsets (e.g., the first byte of each segment is byte 0 of + * that segment). + */ if (first < start) first = start; if (end > last) end = last; - - xfs_buf_item_log_segment(first, end, + xfs_buf_item_log_segment(first - start, end - start, &bip->bli_formats[i].blf_data_map[0]); - start += bp->b_maps[i].bm_len; + start += BBTOB(bp->b_maps[i].bm_len); } } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 47fc63295422..713991c22781 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -37,6 +37,7 @@ #include "xfs_log.h" #include "xfs_icache.h" #include "xfs_pnfs.h" +#include "xfs_iomap.h" #include <linux/dcache.h> #include <linux/falloc.h> @@ -80,61 +81,17 @@ xfs_rw_ilock_demote( } /* - * xfs_iozero clears the specified range supplied via the page cache (except in - * the DAX case). Writes through the page cache will allocate blocks over holes, - * though the callers usually map the holes first and avoid them. If a block is - * not completely zeroed, then it will be read from disk before being partially - * zeroed. - * - * In the DAX case, we can just directly write to the underlying pages. This - * will not allocate blocks, but will avoid holes and unwritten extents and so - * not do unnecessary work. + * Clear the specified ranges to zero through either the pagecache or DAX. + * Holes and unwritten extents will be left as-is as they already are zeroed. */ int -xfs_iozero( - struct xfs_inode *ip, /* inode */ - loff_t pos, /* offset in file */ - size_t count) /* size of data to zero */ +xfs_zero_range( + struct xfs_inode *ip, + xfs_off_t pos, + xfs_off_t count, + bool *did_zero) { - struct page *page; - struct address_space *mapping; - int status = 0; - - - mapping = VFS_I(ip)->i_mapping; - do { - unsigned offset, bytes; - void *fsdata; - - offset = (pos & (PAGE_SIZE -1)); /* Within page */ - bytes = PAGE_SIZE - offset; - if (bytes > count) - bytes = count; - - if (IS_DAX(VFS_I(ip))) { - status = dax_zero_page_range(VFS_I(ip), pos, bytes, - xfs_get_blocks_direct); - if (status) - break; - } else { - status = pagecache_write_begin(NULL, mapping, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (status) - break; - - zero_user(page, offset, bytes); - - status = pagecache_write_end(NULL, mapping, pos, bytes, - bytes, page, fsdata); - WARN_ON(status <= 0); /* can't return less than zero! */ - status = 0; - } - pos += bytes; - count -= bytes; - } while (count); - - return status; + return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops); } int @@ -424,49 +381,6 @@ out: } /* - * This routine is called to handle zeroing any space in the last block of the - * file that is beyond the EOF. We do this since the size is being increased - * without writing anything to that block and we don't want to read the - * garbage on the disk. - */ -STATIC int /* error (positive) */ -xfs_zero_last_block( - struct xfs_inode *ip, - xfs_fsize_t offset, - xfs_fsize_t isize, - bool *did_zeroing) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); - int zero_offset = XFS_B_FSB_OFFSET(mp, isize); - int zero_len; - int nimaps = 1; - int error = 0; - struct xfs_bmbt_irec imap; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; - - ASSERT(nimaps > 0); - - /* - * If the block underlying isize is just a hole, then there - * is nothing to zero. - */ - if (imap.br_startblock == HOLESTARTBLOCK) - return 0; - - zero_len = mp->m_sb.sb_blocksize - zero_offset; - if (isize + zero_len > offset) - zero_len = offset - isize; - *did_zeroing = true; - return xfs_iozero(ip, isize, zero_len); -} - -/* * Zero any on disk space between the current EOF and the new, larger EOF. * * This handles the normal case of zeroing the remainder of the last block in @@ -484,94 +398,11 @@ xfs_zero_eof( xfs_fsize_t isize, /* current inode size */ bool *did_zeroing) { - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t start_zero_fsb; - xfs_fileoff_t end_zero_fsb; - xfs_fileoff_t zero_count_fsb; - xfs_fileoff_t last_fsb; - xfs_fileoff_t zero_off; - xfs_fsize_t zero_len; - int nimaps; - int error = 0; - struct xfs_bmbt_irec imap; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(offset > isize); trace_xfs_zero_eof(ip, isize, offset - isize); - - /* - * First handle zeroing the block on which isize resides. - * - * We only zero a part of that block so it is handled specially. - */ - if (XFS_B_FSB_OFFSET(mp, isize) != 0) { - error = xfs_zero_last_block(ip, offset, isize, did_zeroing); - if (error) - return error; - } - - /* - * Calculate the range between the new size and the old where blocks - * needing to be zeroed may exist. - * - * To get the block where the last byte in the file currently resides, - * we need to subtract one from the size and truncate back to a block - * boundary. We subtract 1 in case the size is exactly on a block - * boundary. - */ - last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; - start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); - end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); - ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); - if (last_fsb == end_zero_fsb) { - /* - * The size was only incremented on its last block. - * We took care of that above, so just return. - */ - return 0; - } - - ASSERT(start_zero_fsb <= end_zero_fsb); - while (start_zero_fsb <= end_zero_fsb) { - nimaps = 1; - zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, - &imap, &nimaps, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; - - ASSERT(nimaps > 0); - - if (imap.br_state == XFS_EXT_UNWRITTEN || - imap.br_startblock == HOLESTARTBLOCK) { - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - continue; - } - - /* - * There are blocks we need to zero. - */ - zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); - zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); - - if ((zero_off + zero_len) > offset) - zero_len = offset - zero_off; - - error = xfs_iozero(ip, zero_off, zero_len); - if (error) - return error; - - *did_zeroing = true; - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - } - - return 0; + return xfs_zero_range(ip, isize, offset - isize, did_zeroing); } /* @@ -841,7 +672,7 @@ xfs_file_buffered_aio_write( write_retry: trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos, 0); - ret = generic_perform_write(file, from, iocb->ki_pos); + ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); if (likely(ret >= 0)) iocb->ki_pos += ret; @@ -1553,7 +1384,7 @@ xfs_filemap_page_mkwrite( if (IS_DAX(inode)) { ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); } else { - ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); + ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ee6799e0476f..8825bcfd314c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -431,7 +431,7 @@ xfs_lock_inumorder(int lock_mode, int subclass) * lock more than one at a time, lockdep will report false positives saying we * have violated locking orders. */ -void +static void xfs_lock_inodes( xfs_inode_t **ips, int inodes, @@ -667,14 +667,6 @@ xfs_ip2xflags( return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip)); } -uint -xfs_dic2xflags( - struct xfs_dinode *dip) -{ - return _xfs_dic2xflags(be16_to_cpu(dip->di_flags), - be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip)); -} - /* * Lookups up an inode from "name". If ci_name is not NULL, then a CI match * is allowed, otherwise it has to be an exact match. If a CI match is found, @@ -748,7 +740,7 @@ out_unlock: * are not linked into the directory structure - they are attached * directly to the superblock - and so have no parent. */ -int +static int xfs_ialloc( xfs_trans_t *tp, xfs_inode_t *pip, @@ -1085,7 +1077,7 @@ xfs_dir_ialloc( * link count to go to zero, move the inode to AGI unlinked list so that it can * be freed when the last active reference goes away via xfs_inactive(). */ -int /* error */ +static int /* error */ xfs_droplink( xfs_trans_t *tp, xfs_inode_t *ip) @@ -1104,7 +1096,7 @@ xfs_droplink( /* * Increment the link count on an inode & log the change. */ -int +static int xfs_bumplink( xfs_trans_t *tp, xfs_inode_t *ip) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e52d7c7aeb5b..0c19d3d05a91 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -395,12 +395,8 @@ void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); uint xfs_ilock_data_map_shared(struct xfs_inode *); uint xfs_ilock_attr_map_shared(struct xfs_inode *); -int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, - xfs_nlink_t, xfs_dev_t, prid_t, int, - struct xfs_buf **, xfs_inode_t **); uint xfs_ip2xflags(struct xfs_inode *); -uint xfs_dic2xflags(struct xfs_dinode *); int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_bmap_free *); int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, @@ -411,7 +407,6 @@ void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) int xfs_iflush(struct xfs_inode *, struct xfs_buf **); -void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); @@ -419,8 +414,6 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_inode **, int *); -int xfs_droplink(struct xfs_trans *, struct xfs_inode *); -int xfs_bumplink(struct xfs_trans *, struct xfs_inode *); /* from xfs_file.c */ enum xfs_prealloc_flags { @@ -434,7 +427,8 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip, enum xfs_prealloc_flags flags); int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, xfs_fsize_t isize, bool *did_zeroing); -int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); +int xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count, + bool *did_zero); loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start, loff_t eof, int whence); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 58391355a44d..620fc9120444 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -15,6 +15,7 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +#include <linux/iomap.h> #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" @@ -940,3 +941,173 @@ error_on_bmapi_transaction: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } + +void +xfs_bmbt_to_iomap( + struct xfs_inode *ip, + struct iomap *iomap, + struct xfs_bmbt_irec *imap) +{ + struct xfs_mount *mp = ip->i_mount; + + if (imap->br_startblock == HOLESTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + } else if (imap->br_startblock == DELAYSTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_DELALLOC; + } else { + iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock); + if (imap->br_state == XFS_EXT_UNWRITTEN) + iomap->type = IOMAP_UNWRITTEN; + else + iomap->type = IOMAP_MAPPED; + } + iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); + iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); + iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); +} + +static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps) +{ + return !nimaps || + imap->br_startblock == HOLESTARTBLOCK || + imap->br_startblock == DELAYSTARTBLOCK; +} + +static int +xfs_file_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + xfs_fileoff_t offset_fsb, end_fsb; + int nimaps = 1, error = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + ASSERT(offset <= mp->m_super->s_maxbytes); + if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) + length = mp->m_super->s_maxbytes - offset; + offset_fsb = XFS_B_TO_FSBT(mp, offset); + end_fsb = XFS_B_TO_FSB(mp, offset + length); + + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, + &nimaps, XFS_BMAPI_ENTIRE); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + } + + if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) { + /* + * We cap the maximum length we map here to MAX_WRITEBACK_PAGES + * pages to keep the chunks of work done where somewhat symmetric + * with the work writeback does. This is a completely arbitrary + * number pulled out of thin air as a best guess for initial + * testing. + * + * Note that the values needs to be less than 32-bits wide until + * the lower level functions are updated. + */ + length = min_t(loff_t, length, 1024 * PAGE_SIZE); + if (xfs_get_extsz_hint(ip)) { + /* + * xfs_iomap_write_direct() expects the shared lock. It + * is unlocked on return. + */ + xfs_ilock_demote(ip, XFS_ILOCK_EXCL); + error = xfs_iomap_write_direct(ip, offset, length, &imap, + nimaps); + } else { + error = xfs_iomap_write_delay(ip, offset, length, &imap); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + if (error) + return error; + + trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); + xfs_bmbt_to_iomap(ip, iomap, &imap); + } else if (nimaps) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + trace_xfs_iomap_found(ip, offset, length, 0, &imap); + xfs_bmbt_to_iomap(ip, iomap, &imap); + } else { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + trace_xfs_iomap_not_found(ip, offset, length, 0, &imap); + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + iomap->offset = offset; + iomap->length = length; + } + + return 0; +} + +static int +xfs_file_iomap_end_delalloc( + struct xfs_inode *ip, + loff_t offset, + loff_t length, + ssize_t written) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t start_fsb; + xfs_fileoff_t end_fsb; + int error = 0; + + start_fsb = XFS_B_TO_FSB(mp, offset + written); + end_fsb = XFS_B_TO_FSB(mp, offset + length); + + /* + * Trim back delalloc blocks if we didn't manage to write the whole + * range reserved. + * + * We don't need to care about racing delalloc as we hold i_mutex + * across the reserve/allocate/unreserve calls. If there are delalloc + * blocks in the range, they are ours. + */ + if (start_fsb < end_fsb) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + if (error && !XFS_FORCED_SHUTDOWN(mp)) { + xfs_alert(mp, "%s: unable to clean up ino %lld", + __func__, ip->i_ino); + return error; + } + } + + return 0; +} + +static int +xfs_file_iomap_end( + struct inode *inode, + loff_t offset, + loff_t length, + ssize_t written, + unsigned flags, + struct iomap *iomap) +{ + if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) + return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, + length, written); + return 0; +} + +struct iomap_ops xfs_iomap_ops = { + .iomap_begin = xfs_file_iomap_begin, + .iomap_end = xfs_file_iomap_end, +}; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 8688e663d744..e066d045e2ff 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -18,6 +18,8 @@ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ +#include <linux/iomap.h> + struct xfs_inode; struct xfs_bmbt_irec; @@ -29,4 +31,9 @@ int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, struct xfs_bmbt_irec *); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); +void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, + struct xfs_bmbt_irec *); + +extern struct iomap_ops xfs_iomap_ops; + #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index c5d4eba6972e..ab820f84ed50 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -38,12 +38,13 @@ #include "xfs_dir2.h" #include "xfs_trans_space.h" #include "xfs_pnfs.h" +#include "xfs_iomap.h" #include <linux/capability.h> #include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/security.h> -#include <linux/fiemap.h> +#include <linux/iomap.h> #include <linux/slab.h> /* @@ -801,20 +802,30 @@ xfs_setattr_size( return error; /* + * Wait for all direct I/O to complete. + */ + inode_dio_wait(inode); + + /* * File data changes must be complete before we start the transaction to * modify the inode. This needs to be done before joining the inode to * the transaction because the inode cannot be unlocked once it is a * part of the transaction. * - * Start with zeroing any data block beyond EOF that we may expose on - * file extension. + * Start with zeroing any data beyond EOF that we may expose on file + * extension, or zeroing out the rest of the block on a downward + * truncate. */ if (newsize > oldsize) { error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); - if (error) - return error; + } else { + error = iomap_truncate_page(inode, newsize, &did_zeroing, + &xfs_iomap_ops); } + if (error) + return error; + /* * We are going to log the inode size change in this transaction so * any previous writes that are beyond the on disk EOF and the new @@ -823,17 +834,14 @@ xfs_setattr_size( * problem. Note that this includes any block zeroing we did above; * otherwise those blocks may not be zeroed after a crash. */ - if (newsize > ip->i_d.di_size && - (oldsize != ip->i_d.di_size || did_zeroing)) { + if (did_zeroing || + (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) { error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ip->i_d.di_size, newsize); if (error) return error; } - /* Now wait for all direct I/O to complete. */ - inode_dio_wait(inode); - /* * We've already locked out new page faults, so now we can safely remove * pages from the page cache knowing they won't get refaulted until we @@ -851,13 +859,6 @@ xfs_setattr_size( * to hope that the caller sees ENOMEM and retries the truncate * operation. */ - if (IS_DAX(inode)) - error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct); - else - error = block_truncate_page(inode->i_mapping, newsize, - xfs_get_blocks); - if (error) - return error; truncate_setsize(inode, newsize); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); @@ -998,51 +999,6 @@ xfs_vn_update_time( return xfs_trans_commit(tp); } -#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) - -/* - * Call fiemap helper to fill in user data. - * Returns positive errors to xfs_getbmap. - */ -STATIC int -xfs_fiemap_format( - void **arg, - struct getbmapx *bmv, - int *full) -{ - int error; - struct fiemap_extent_info *fieinfo = *arg; - u32 fiemap_flags = 0; - u64 logical, physical, length; - - /* Do nothing for a hole */ - if (bmv->bmv_block == -1LL) - return 0; - - logical = BBTOB(bmv->bmv_offset); - physical = BBTOB(bmv->bmv_block); - length = BBTOB(bmv->bmv_length); - - if (bmv->bmv_oflags & BMV_OF_PREALLOC) - fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; - else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { - fiemap_flags |= (FIEMAP_EXTENT_DELALLOC | - FIEMAP_EXTENT_UNKNOWN); - physical = 0; /* no block yet */ - } - if (bmv->bmv_oflags & BMV_OF_LAST) - fiemap_flags |= FIEMAP_EXTENT_LAST; - - error = fiemap_fill_next_extent(fieinfo, logical, physical, - length, fiemap_flags); - if (error > 0) { - error = 0; - *full = 1; /* user array now full */ - } - - return error; -} - STATIC int xfs_vn_fiemap( struct inode *inode, @@ -1050,38 +1006,13 @@ xfs_vn_fiemap( u64 start, u64 length) { - xfs_inode_t *ip = XFS_I(inode); - struct getbmapx bm; int error; - error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS); - if (error) - return error; - - /* Set up bmap header for xfs internal routine */ - bm.bmv_offset = BTOBBT(start); - /* Special case for whole file */ - if (length == FIEMAP_MAX_OFFSET) - bm.bmv_length = -1LL; - else - bm.bmv_length = BTOBB(start + length) - bm.bmv_offset; - - /* We add one because in getbmap world count includes the header */ - bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM : - fieinfo->fi_extents_max + 1; - bm.bmv_count = min_t(__s32, bm.bmv_count, - (PAGE_SIZE * 16 / sizeof(struct getbmapx))); - bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES; - if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) - bm.bmv_iflags |= BMV_IF_ATTRFORK; - if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) - bm.bmv_iflags |= BMV_IF_DELALLOC; - - error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo); - if (error) - return error; + xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED); + error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops); + xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED); - return 0; + return error; } STATIC int diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index bde02f1fba73..63dad9ea9016 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -788,7 +788,7 @@ xfs_log_mount_cancel( * As far as I know, there weren't any dependencies on the old behaviour. */ -int +static int xfs_log_unmount_write(xfs_mount_t *mp) { struct xlog *log = mp->m_log; @@ -1036,7 +1036,7 @@ xfs_log_space_wake( * there's no point in running a dummy transaction at this point because we * can't start trying to idle the log until both the CIL and AIL are empty. */ -int +static int xfs_log_need_covered(xfs_mount_t *mp) { struct xlog *log = mp->m_log; @@ -1177,7 +1177,7 @@ xlog_space_left( * The log manager needs its own routine, in order to control what * happens with the buffer after the write completes. */ -void +static void xlog_iodone(xfs_buf_t *bp) { struct xlog_in_core *iclog = bp->b_fspriv; @@ -1302,7 +1302,7 @@ xfs_log_work_queue( * disk. If there is nothing dirty, then we might need to cover the log to * indicate that the filesystem is idle. */ -void +static void xfs_log_worker( struct work_struct *work) { diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 80ba0c047090..b5e71072fde5 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -163,12 +163,8 @@ int xfs_log_reserve(struct xfs_mount *mp, __uint8_t clientid, bool permanent); int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); -int xfs_log_unmount_write(struct xfs_mount *mp); void xfs_log_unmount(struct xfs_mount *mp); int xfs_log_force_umount(struct xfs_mount *mp, int logerror); -int xfs_log_need_covered(struct xfs_mount *mp); - -void xlog_iodone(struct xfs_buf *); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); @@ -178,7 +174,6 @@ void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); -void xfs_log_worker(struct work_struct *work); void xfs_log_quiesce(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index d5b756669fb5..0f14b2e4bf6c 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2014 Christoph Hellwig. */ +#include <linux/iomap.h> #include "xfs.h" #include "xfs_format.h" #include "xfs_log_format.h" @@ -79,32 +80,6 @@ xfs_fs_get_uuid( return 0; } -static void -xfs_bmbt_to_iomap( - struct xfs_inode *ip, - struct iomap *iomap, - struct xfs_bmbt_irec *imap) -{ - struct xfs_mount *mp = ip->i_mount; - - if (imap->br_startblock == HOLESTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->type = IOMAP_HOLE; - } else if (imap->br_startblock == DELAYSTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->type = IOMAP_DELALLOC; - } else { - iomap->blkno = - XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock); - if (imap->br_state == XFS_EXT_UNWRITTEN) - iomap->type = IOMAP_UNWRITTEN; - else - iomap->type = IOMAP_MAPPED; - } - iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); - iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); -} - /* * Get a layout for the pNFS client. */ diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 76c0a4a9bb17..355dd9e1cb64 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -98,8 +98,6 @@ xfs_growfs_rt( /* * From xfs_rtbitmap.c */ -int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t block, int issum, struct xfs_buf **bpp); int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, int val, xfs_rtblock_t *new, int *stat); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 5f3c7299532b..258b594f5e61 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -546,7 +546,7 @@ xfs_showargs( return 0; } -__uint64_t +static __uint64_t xfs_max_file_offset( unsigned int blockshift) { diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 2dfb1ce4585f..529bce9fc37e 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -61,8 +61,6 @@ struct xfs_mount; struct xfs_buftarg; struct block_device; -extern __uint64_t xfs_max_file_offset(unsigned int); - extern void xfs_flush_inodes(struct xfs_mount *mp); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 68f27f70e1ed..6787a9f96526 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1296,6 +1296,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_found); DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct); +DEFINE_IOMAP_EVENT(xfs_iomap_alloc); +DEFINE_IOMAP_EVENT(xfs_iomap_found); +DEFINE_IOMAP_EVENT(xfs_iomap_not_found); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), |