diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/afs/rxrpc.c | 11 | ||||
-rw-r--r-- | fs/dax.c | 60 | ||||
-rw-r--r-- | fs/exec.c | 5 | ||||
-rw-r--r-- | fs/iomap.c | 53 | ||||
-rw-r--r-- | fs/nfs/callback_proc.c | 22 | ||||
-rw-r--r-- | fs/nfs/flexfilelayout/flexfilelayout.c | 21 | ||||
-rw-r--r-- | fs/nfs/flexfilelayout/flexfilelayout.h | 4 | ||||
-rw-r--r-- | fs/nfs/flexfilelayout/flexfilelayoutdev.c | 19 | ||||
-rw-r--r-- | fs/nfs/nfs42proc.c | 19 | ||||
-rw-r--r-- | fs/nfs/nfs4_fs.h | 2 | ||||
-rw-r--r-- | fs/nfs/nfs4state.c | 16 | ||||
-rw-r--r-- | fs/nilfs2/btnode.c | 4 | ||||
-rw-r--r-- | fs/read_write.c | 15 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_bmap.c | 5 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_ialloc_btree.c | 11 | ||||
-rw-r--r-- | fs/xfs/xfs_bmap_util.c | 10 | ||||
-rw-r--r-- | fs/xfs/xfs_bmap_util.h | 3 | ||||
-rw-r--r-- | fs/xfs/xfs_buf_item.c | 28 | ||||
-rw-r--r-- | fs/xfs/xfs_file.c | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_reflink.c | 18 | ||||
-rw-r--r-- | fs/xfs/xfs_trace.h | 5 |
21 files changed, 219 insertions, 114 deletions
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 59970886690f..a7b44863d502 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -576,6 +576,7 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, { signed long rtt2, timeout; long ret; + bool stalled = false; u64 rtt; u32 life, last_life; @@ -609,12 +610,20 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, life = rxrpc_kernel_check_life(call->net->socket, call->rxcall); if (timeout == 0 && - life == last_life && signal_pending(current)) + life == last_life && signal_pending(current)) { + if (stalled) break; + __set_current_state(TASK_RUNNING); + rxrpc_kernel_probe_life(call->net->socket, call->rxcall); + timeout = rtt2; + stalled = true; + continue; + } if (life != last_life) { timeout = rtt2; last_life = life; + stalled = false; } timeout = schedule_timeout(timeout); @@ -98,12 +98,6 @@ static void *dax_make_entry(pfn_t pfn, unsigned long flags) return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT)); } -static void *dax_make_page_entry(struct page *page) -{ - pfn_t pfn = page_to_pfn_t(page); - return dax_make_entry(pfn, PageHead(page) ? DAX_PMD : 0); -} - static bool dax_is_locked(void *entry) { return xa_to_value(entry) & DAX_LOCKED; @@ -116,12 +110,12 @@ static unsigned int dax_entry_order(void *entry) return 0; } -static int dax_is_pmd_entry(void *entry) +static unsigned long dax_is_pmd_entry(void *entry) { return xa_to_value(entry) & DAX_PMD; } -static int dax_is_pte_entry(void *entry) +static bool dax_is_pte_entry(void *entry) { return !(xa_to_value(entry) & DAX_PMD); } @@ -222,9 +216,8 @@ static void *get_unlocked_entry(struct xa_state *xas) ewait.wait.func = wake_exceptional_entry_func; for (;;) { - entry = xas_load(xas); - if (!entry || xa_is_internal(entry) || - WARN_ON_ONCE(!xa_is_value(entry)) || + entry = xas_find_conflict(xas); + if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) || !dax_is_locked(entry)) return entry; @@ -255,6 +248,7 @@ static void dax_unlock_entry(struct xa_state *xas, void *entry) { void *old; + BUG_ON(dax_is_locked(entry)); xas_reset(xas); xas_lock_irq(xas); old = xas_store(xas, entry); @@ -352,16 +346,27 @@ static struct page *dax_busy_page(void *entry) return NULL; } +/* + * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page + * @page: The page whose entry we want to lock + * + * Context: Process context. + * Return: %true if the entry was locked or does not need to be locked. + */ bool dax_lock_mapping_entry(struct page *page) { XA_STATE(xas, NULL, 0); void *entry; + bool locked; + /* Ensure page->mapping isn't freed while we look at it */ + rcu_read_lock(); for (;;) { struct address_space *mapping = READ_ONCE(page->mapping); + locked = false; if (!dax_mapping(mapping)) - return false; + break; /* * In the device-dax case there's no need to lock, a @@ -370,8 +375,9 @@ bool dax_lock_mapping_entry(struct page *page) * otherwise we would not have a valid pfn_to_page() * translation. */ + locked = true; if (S_ISCHR(mapping->host->i_mode)) - return true; + break; xas.xa = &mapping->i_pages; xas_lock_irq(&xas); @@ -382,28 +388,35 @@ bool dax_lock_mapping_entry(struct page *page) xas_set(&xas, page->index); entry = xas_load(&xas); if (dax_is_locked(entry)) { + rcu_read_unlock(); entry = get_unlocked_entry(&xas); - /* Did the page move while we slept? */ - if (dax_to_pfn(entry) != page_to_pfn(page)) { - xas_unlock_irq(&xas); - continue; - } + xas_unlock_irq(&xas); + put_unlocked_entry(&xas, entry); + rcu_read_lock(); + continue; } dax_lock_entry(&xas, entry); xas_unlock_irq(&xas); - return true; + break; } + rcu_read_unlock(); + return locked; } void dax_unlock_mapping_entry(struct page *page) { struct address_space *mapping = page->mapping; XA_STATE(xas, &mapping->i_pages, page->index); + void *entry; if (S_ISCHR(mapping->host->i_mode)) return; - dax_unlock_entry(&xas, dax_make_page_entry(page)); + rcu_read_lock(); + entry = xas_load(&xas); + rcu_read_unlock(); + entry = dax_make_entry(page_to_pfn_t(page), dax_is_pmd_entry(entry)); + dax_unlock_entry(&xas, entry); } /* @@ -445,11 +458,9 @@ static void *grab_mapping_entry(struct xa_state *xas, retry: xas_lock_irq(xas); entry = get_unlocked_entry(xas); - if (xa_is_internal(entry)) - goto fallback; if (entry) { - if (WARN_ON_ONCE(!xa_is_value(entry))) { + if (!xa_is_value(entry)) { xas_set_err(xas, EIO); goto out_unlock; } @@ -1628,8 +1639,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) /* Did we race with someone splitting entry or so? */ if (!entry || (order == 0 && !dax_is_pte_entry(entry)) || - (order == PMD_ORDER && (xa_is_internal(entry) || - !dax_is_pmd_entry(entry)))) { + (order == PMD_ORDER && !dax_is_pmd_entry(entry))) { put_unlocked_entry(&xas, entry); xas_unlock_irq(&xas); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, diff --git a/fs/exec.c b/fs/exec.c index fc281b738a98..acc3a5536384 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -62,6 +62,7 @@ #include <linux/oom.h> #include <linux/compat.h> #include <linux/vmalloc.h> +#include <linux/freezer.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -1083,7 +1084,7 @@ static int de_thread(struct task_struct *tsk) while (sig->notify_count) { __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); - schedule(); + freezable_schedule(); if (unlikely(__fatal_signal_pending(tsk))) goto killed; spin_lock_irq(lock); @@ -1111,7 +1112,7 @@ static int de_thread(struct task_struct *tsk) __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); - schedule(); + freezable_schedule(); if (unlikely(__fatal_signal_pending(tsk))) goto killed; } diff --git a/fs/iomap.c b/fs/iomap.c index 64ce240217a1..3ffb776fbebe 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -142,13 +142,14 @@ static void iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp) { + loff_t orig_pos = *pos; + loff_t isize = i_size_read(inode); unsigned block_bits = inode->i_blkbits; unsigned block_size = (1 << block_bits); unsigned poff = offset_in_page(*pos); unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length); unsigned first = poff >> block_bits; unsigned last = (poff + plen - 1) >> block_bits; - unsigned end = offset_in_page(i_size_read(inode)) >> block_bits; /* * If the block size is smaller than the page size we need to check the @@ -183,8 +184,12 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, * handle both halves separately so that we properly zero data in the * page cache for blocks that are entirely outside of i_size. */ - if (first <= end && last > end) - plen -= (last - end) * block_size; + if (orig_pos <= isize && orig_pos + length > isize) { + unsigned end = offset_in_page(isize - 1) >> block_bits; + + if (first <= end && last > end) + plen -= (last - end) * block_size; + } *offp = poff; *lenp = plen; @@ -1580,7 +1585,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, struct bio *bio; bool need_zeroout = false; bool use_fua = false; - int nr_pages, ret; + int nr_pages, ret = 0; size_t copied = 0; if ((pos | length | align) & ((1 << blkbits) - 1)) @@ -1596,12 +1601,13 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, if (iomap->flags & IOMAP_F_NEW) { need_zeroout = true; - } else { + } else if (iomap->type == IOMAP_MAPPED) { /* - * Use a FUA write if we need datasync semantics, this - * is a pure data IO that doesn't require any metadata - * updates and the underlying device supports FUA. This - * allows us to avoid cache flushes on IO completion. + * Use a FUA write if we need datasync semantics, this is a pure + * data IO that doesn't require any metadata updates (including + * after IO completion such as unwritten extent conversion) and + * the underlying device supports FUA. This allows us to avoid + * cache flushes on IO completion. */ if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && (dio->flags & IOMAP_DIO_WRITE_FUA) && @@ -1644,8 +1650,14 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, ret = bio_iov_iter_get_pages(bio, &iter); if (unlikely(ret)) { + /* + * We have to stop part way through an IO. We must fall + * through to the sub-block tail zeroing here, otherwise + * this short IO may expose stale data in the tail of + * the block we haven't written data to. + */ bio_put(bio); - return copied ? copied : ret; + goto zero_tail; } n = bio->bi_iter.bi_size; @@ -1676,13 +1688,21 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, dio->submit.cookie = submit_bio(bio); } while (nr_pages); - if (need_zeroout) { + /* + * We need to zeroout the tail of a sub-block write if the extent type + * requires zeroing or the write extends beyond EOF. If we don't zero + * the block tail in the latter case, we can expose stale data via mmap + * reads of the EOF block. + */ +zero_tail: + if (need_zeroout || + ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { /* zero out from the end of the write to the end of the block */ pad = pos & (fs_block_size - 1); if (pad) iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); } - return copied; + return copied ? copied : ret; } static loff_t @@ -1857,6 +1877,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->wait_for_completion = true; ret = 0; } + + /* + * Splicing to pipes can fail on a full pipe. We have to + * swallow this to make it look like a short IO + * otherwise the higher splice layers will completely + * mishandle the error and stop moving data. + */ + if (ret == -EFAULT) + ret = 0; break; } pos += ret; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 7b861bbc0b43..315967354954 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -686,20 +686,24 @@ __be32 nfs4_callback_offload(void *data, void *dummy, { struct cb_offloadargs *args = data; struct nfs_server *server; - struct nfs4_copy_state *copy; + struct nfs4_copy_state *copy, *tmp_copy; bool found = false; + copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS); + if (!copy) + return htonl(NFS4ERR_SERVERFAULT); + spin_lock(&cps->clp->cl_lock); rcu_read_lock(); list_for_each_entry_rcu(server, &cps->clp->cl_superblocks, client_link) { - list_for_each_entry(copy, &server->ss_copies, copies) { + list_for_each_entry(tmp_copy, &server->ss_copies, copies) { if (memcmp(args->coa_stateid.other, - copy->stateid.other, + tmp_copy->stateid.other, sizeof(args->coa_stateid.other))) continue; - nfs4_copy_cb_args(copy, args); - complete(©->completion); + nfs4_copy_cb_args(tmp_copy, args); + complete(&tmp_copy->completion); found = true; goto out; } @@ -707,15 +711,11 @@ __be32 nfs4_callback_offload(void *data, void *dummy, out: rcu_read_unlock(); if (!found) { - copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS); - if (!copy) { - spin_unlock(&cps->clp->cl_lock); - return htonl(NFS4ERR_SERVERFAULT); - } memcpy(©->stateid, &args->coa_stateid, NFS4_STATEID_SIZE); nfs4_copy_cb_args(copy, args); list_add_tail(©->copies, &cps->clp->pending_cb_stateids); - } + } else + kfree(copy); spin_unlock(&cps->clp->cl_lock); return 0; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 86bcba40ca61..74b36ed883ca 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1361,12 +1361,7 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) task)) return; - if (ff_layout_read_prepare_common(task, hdr)) - return; - - if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, - hdr->args.lock_context, FMODE_READ) == -EIO) - rpc_exit(task, -EIO); /* lost lock, terminate I/O */ + ff_layout_read_prepare_common(task, hdr); } static void ff_layout_read_call_done(struct rpc_task *task, void *data) @@ -1542,12 +1537,7 @@ static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data) task)) return; - if (ff_layout_write_prepare_common(task, hdr)) - return; - - if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, - hdr->args.lock_context, FMODE_WRITE) == -EIO) - rpc_exit(task, -EIO); /* lost lock, terminate I/O */ + ff_layout_write_prepare_common(task, hdr); } static void ff_layout_write_call_done(struct rpc_task *task, void *data) @@ -1742,6 +1732,10 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) fh = nfs4_ff_layout_select_ds_fh(lseg, idx); if (fh) hdr->args.fh = fh; + + if (!nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid)) + goto out_failed; + /* * Note that if we ever decide to split across DSes, * then we may need to handle dense-like offsets. @@ -1804,6 +1798,9 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) if (fh) hdr->args.fh = fh; + if (!nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid)) + goto out_failed; + /* * Note that if we ever decide to split across DSes, * then we may need to handle dense-like offsets. diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 411798346e48..de50a342d5a5 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -215,6 +215,10 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, unsigned int maxnum); struct nfs_fh * nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx); +int +nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg, + u32 mirror_idx, + nfs4_stateid *stateid); struct nfs4_pnfs_ds * nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 74d8d5352438..d23347389626 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -370,6 +370,25 @@ out: return fh; } +int +nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg, + u32 mirror_idx, + nfs4_stateid *stateid) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); + + if (!ff_layout_mirror_valid(lseg, mirror, false)) { + pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n", + __func__, mirror_idx); + goto out; + } + + nfs4_stateid_copy(stateid, &mirror->stateid); + return 1; +out: + return 0; +} + /** * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call * @lseg: the layout segment we're operating on diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index ac5b784a1de0..fed06fd9998d 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -137,31 +137,32 @@ static int handle_async_copy(struct nfs42_copy_res *res, struct file *dst, nfs4_stateid *src_stateid) { - struct nfs4_copy_state *copy; + struct nfs4_copy_state *copy, *tmp_copy; int status = NFS4_OK; bool found_pending = false; struct nfs_open_context *ctx = nfs_file_open_context(dst); + copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS); + if (!copy) + return -ENOMEM; + spin_lock(&server->nfs_client->cl_lock); - list_for_each_entry(copy, &server->nfs_client->pending_cb_stateids, + list_for_each_entry(tmp_copy, &server->nfs_client->pending_cb_stateids, copies) { - if (memcmp(&res->write_res.stateid, ©->stateid, + if (memcmp(&res->write_res.stateid, &tmp_copy->stateid, NFS4_STATEID_SIZE)) continue; found_pending = true; - list_del(©->copies); + list_del(&tmp_copy->copies); break; } if (found_pending) { spin_unlock(&server->nfs_client->cl_lock); + kfree(copy); + copy = tmp_copy; goto out; } - copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS); - if (!copy) { - spin_unlock(&server->nfs_client->cl_lock); - return -ENOMEM; - } memcpy(©->stateid, &res->write_res.stateid, NFS4_STATEID_SIZE); init_completion(©->completion); copy->parent_state = ctx->state; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 8d59c9655ec4..1b994b527518 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -41,6 +41,8 @@ enum nfs4_client_state { NFS4CLNT_MOVED, NFS4CLNT_LEASE_MOVED, NFS4CLNT_DELEGATION_EXPIRED, + NFS4CLNT_RUN_MANAGER, + NFS4CLNT_DELEGRETURN_RUNNING, }; #define NFS4_RENEW_TIMEOUT 0x01 diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index ffea57885394..d8decf2ec48f 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1210,6 +1210,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) struct task_struct *task; char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) return; __module_get(THIS_MODULE); @@ -2503,6 +2504,7 @@ static void nfs4_state_manager(struct nfs_client *clp) /* Ensure exclusive access to NFSv4 state */ do { + clear_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) { section = "purge state"; status = nfs4_purge_lease(clp); @@ -2593,14 +2595,18 @@ static void nfs4_state_manager(struct nfs_client *clp) } nfs4_end_drain_session(clp); - if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { - nfs_client_return_marked_delegations(clp); - continue; + nfs4_clear_state_manager_bit(clp); + + if (!test_and_set_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state)) { + if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { + nfs_client_return_marked_delegations(clp); + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); + } + clear_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state); } - nfs4_clear_state_manager_bit(clp); /* Did we race with an attempt to give us more work? */ - if (clp->cl_state == 0) + if (!test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) return; if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) return; diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index de99db518571..f2129a5d9f23 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -266,9 +266,7 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc, return; if (nbh == NULL) { /* blocksize == pagesize */ - xa_lock_irq(&btnc->i_pages); - __xa_erase(&btnc->i_pages, newkey); - xa_unlock_irq(&btnc->i_pages); + xa_erase_irq(&btnc->i_pages, newkey); unlock_page(ctxt->bh->b_page); } else brelse(nbh); diff --git a/fs/read_write.c b/fs/read_write.c index bfcb4ced5664..4dae0399c75a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -2094,17 +2094,18 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) off = same->src_offset; len = same->src_length; - ret = -EISDIR; if (S_ISDIR(src->i_mode)) - goto out; + return -EISDIR; - ret = -EINVAL; if (!S_ISREG(src->i_mode)) - goto out; + return -EINVAL; + + if (!file->f_op->remap_file_range) + return -EOPNOTSUPP; ret = remap_verify_area(file, off, len, false); if (ret < 0) - goto out; + return ret; ret = 0; if (off + len > i_size_read(src)) @@ -2147,10 +2148,8 @@ next_fdput: fdput(dst_fd); next_loop: if (fatal_signal_pending(current)) - goto out; + break; } - -out: return ret; } EXPORT_SYMBOL(vfs_dedupe_file_range); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 74d7228e755b..19e921d1586f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1694,10 +1694,13 @@ xfs_bmap_add_extent_delay_real( case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Filling in all of a previously delayed allocation extent. - * The right neighbor is contiguous, the left is not. + * The right neighbor is contiguous, the left is not. Take care + * with delay -> unwritten extent allocation here because the + * delalloc record we are overwriting is always written. */ PREV.br_startblock = new->br_startblock; PREV.br_blockcount += RIGHT.br_blockcount; + PREV.br_state = new->br_state; xfs_iext_next(ifp, &bma->icur); xfs_iext_remove(bma->ip, &bma->icur, state); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 86c50208a143..7fbf8af0b159 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -538,15 +538,18 @@ xfs_inobt_rec_check_count( static xfs_extlen_t xfs_inobt_max_size( - struct xfs_mount *mp) + struct xfs_mount *mp, + xfs_agnumber_t agno) { + xfs_agblock_t agblocks = xfs_ag_block_count(mp, agno); + /* Bail out if we're uninitialized, which can happen in mkfs. */ if (mp->m_inobt_mxr[0] == 0) return 0; return xfs_btree_calc_size(mp->m_inobt_mnr, - (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock / - XFS_INODES_PER_CHUNK); + (uint64_t)agblocks * mp->m_sb.sb_inopblock / + XFS_INODES_PER_CHUNK); } static int @@ -594,7 +597,7 @@ xfs_finobt_calc_reserves( if (error) return error; - *ask += xfs_inobt_max_size(mp); + *ask += xfs_inobt_max_size(mp, agno); *used += tree_len; return 0; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 5d263dfdb3bc..404e581f1ea1 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1042,7 +1042,7 @@ out_trans_cancel: goto out_unlock; } -static int +int xfs_flush_unmap_range( struct xfs_inode *ip, xfs_off_t offset, @@ -1195,13 +1195,7 @@ xfs_prepare_shift( * Writeback and invalidate cache for the remainder of the file as we're * about to shift down every extent from offset to EOF. */ - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, offset, -1); - if (error) - return error; - error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, - offset >> PAGE_SHIFT, -1); - if (error) - return error; + error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip)); /* * Clean out anything hanging around in the cow fork now that diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 87363d136bb6..7a78229cf1a7 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -80,4 +80,7 @@ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, xfs_extnum_t *nextents, xfs_filblks_t *count); +int xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len); + #endif /* __XFS_BMAP_UTIL_H__ */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 12d8455bfbb2..010db5f8fb00 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1233,9 +1233,23 @@ xfs_buf_iodone( } /* - * Requeue a failed buffer for writeback + * Requeue a failed buffer for writeback. * - * Return true if the buffer has been re-queued properly, false otherwise + * We clear the log item failed state here as well, but we have to be careful + * about reference counts because the only active reference counts on the buffer + * may be the failed log items. Hence if we clear the log item failed state + * before queuing the buffer for IO we can release all active references to + * the buffer and free it, leading to use after free problems in + * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which + * order we process them in - the buffer is locked, and we own the buffer list + * so nothing on them is going to change while we are performing this action. + * + * Hence we can safely queue the buffer for IO before we clear the failed log + * item state, therefore always having an active reference to the buffer and + * avoiding the transient zero-reference state that leads to use-after-free. + * + * Return true if the buffer was added to the buffer list, false if it was + * already on the buffer list. */ bool xfs_buf_resubmit_failed_buffers( @@ -1243,16 +1257,16 @@ xfs_buf_resubmit_failed_buffers( struct list_head *buffer_list) { struct xfs_log_item *lip; + bool ret; + + ret = xfs_buf_delwri_queue(bp, buffer_list); /* - * Clear XFS_LI_FAILED flag from all items before resubmit - * - * XFS_LI_FAILED set/clear is protected by ail_lock, caller this + * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this * function already have it acquired */ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) xfs_clear_li_failed(lip); - /* Add this buffer back to the delayed write list */ - return xfs_buf_delwri_queue(bp, buffer_list); + return ret; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 53c9ab8fb777..e47425071e65 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -920,7 +920,7 @@ out_unlock: } -loff_t +STATIC loff_t xfs_file_remap_range( struct file *file_in, loff_t pos_in, diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index ecdb086bc23e..322a852ce284 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -296,6 +296,7 @@ xfs_reflink_reserve_cow( if (error) return error; + xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); trace_xfs_reflink_cow_alloc(ip, &got); return 0; } @@ -1351,10 +1352,19 @@ xfs_reflink_remap_prep( if (ret) goto out_unlock; - /* Zap any page cache for the destination file's range. */ - truncate_inode_pages_range(&inode_out->i_data, - round_down(pos_out, PAGE_SIZE), - round_up(pos_out + *len, PAGE_SIZE) - 1); + /* + * If pos_out > EOF, we may have dirtied blocks between EOF and + * pos_out. In that case, we need to extend the flush and unmap to cover + * from EOF to the end of the copy length. + */ + if (pos_out > XFS_ISIZE(dest)) { + loff_t flen = *len + (pos_out - XFS_ISIZE(dest)); + ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen); + } else { + ret = xfs_flush_unmap_range(dest, pos_out, *len); + } + if (ret) + goto out_unlock; return 1; out_unlock: diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 3043e5ed6495..8a6532aae779 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -280,7 +280,10 @@ DECLARE_EVENT_CLASS(xfs_buf_class, ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; - __entry->bno = bp->b_bn; + if (bp->b_bn == XFS_BUF_DADDR_NULL) + __entry->bno = bp->b_maps[0].bm_bn; + else + __entry->bno = bp->b_bn; __entry->nblks = bp->b_length; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); |