diff options
author | Arnaldo Carvalho de Melo <acme@redhat.com> | 2021-03-08 14:11:33 +0100 |
---|---|---|
committer | Arnaldo Carvalho de Melo <acme@redhat.com> | 2021-03-08 14:11:33 +0100 |
commit | 009ef05f98129aa91c62c3baab859ba593a15bb2 (patch) | |
tree | f3414f08d636a597545b1e4f443b373b9d6d8f4b /mm/filemap.c | |
parent | perf annotate: Show full source location with 'l' hotkey (diff) | |
parent | Merge tag 'perf-tools-fixes-for-v5.12-2020-03-07' of git://git.kernel.org/pub... (diff) | |
download | linux-009ef05f98129aa91c62c3baab859ba593a15bb2.tar.xz linux-009ef05f98129aa91c62c3baab859ba593a15bb2.zip |
Merge remote-tracking branch 'torvalds/master' into perf/core
To pick up the fixes sent for v5.12 and continue development based on
v5.12-rc2, i.e. without the swap on file bug.
This also gets a slightly newer and better tools/perf/arch/arm/util/cs-etm.c
patch version, using the BIT() macro, that had already been slated to
v5.13 but ended up going to v5.12-rc1 on an older version.
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Diffstat (limited to 'mm/filemap.c')
-rw-r--r-- | mm/filemap.c | 926 |
1 files changed, 499 insertions, 427 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 6ff2a3fb0dc7..43700480d897 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -206,9 +206,9 @@ static void unaccount_page_cache_page(struct address_space *mapping, if (PageSwapBacked(page)) { __mod_lruvec_page_state(page, NR_SHMEM, -nr); if (PageTransHuge(page)) - __dec_lruvec_page_state(page, NR_SHMEM_THPS); + __mod_lruvec_page_state(page, NR_SHMEM_THPS, -nr); } else if (PageTransHuge(page)) { - __dec_lruvec_page_state(page, NR_FILE_THPS); + __mod_lruvec_page_state(page, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } @@ -777,7 +777,6 @@ EXPORT_SYMBOL(file_write_and_wait_range); * replace_page_cache_page - replace a pagecache page with a new one * @old: page to be replaced * @new: page to replace with - * @gfp_mask: allocation mode * * This function replaces a page in the pagecache with a new one. On * success it acquires the pagecache reference for the new page and @@ -786,10 +785,8 @@ EXPORT_SYMBOL(file_write_and_wait_range); * caller must do that. * * The remove + add is atomic. This function cannot fail. - * - * Return: %0 */ -int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) +void replace_page_cache_page(struct page *old, struct page *new) { struct address_space *mapping = old->mapping; void (*freepage)(struct page *) = mapping->a_ops->freepage; @@ -824,8 +821,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (freepage) freepage(old); put_page(old); - - return 0; } EXPORT_SYMBOL_GPL(replace_page_cache_page); @@ -1348,61 +1343,26 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) } EXPORT_SYMBOL(wait_on_page_bit_killable); -static int __wait_on_page_locked_async(struct page *page, - struct wait_page_queue *wait, bool set) -{ - struct wait_queue_head *q = page_waitqueue(page); - int ret = 0; - - wait->page = page; - wait->bit_nr = PG_locked; - - spin_lock_irq(&q->lock); - __add_wait_queue_entry_tail(q, &wait->wait); - SetPageWaiters(page); - if (set) - ret = !trylock_page(page); - else - ret = PageLocked(page); - /* - * If we were successful now, we know we're still on the - * waitqueue as we're still under the lock. This means it's - * safe to remove and return success, we know the callback - * isn't going to trigger. - */ - if (!ret) - __remove_wait_queue(q, &wait->wait); - else - ret = -EIOCBQUEUED; - spin_unlock_irq(&q->lock); - return ret; -} - -static int wait_on_page_locked_async(struct page *page, - struct wait_page_queue *wait) -{ - if (!PageLocked(page)) - return 0; - return __wait_on_page_locked_async(compound_head(page), wait, false); -} - /** * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked * @page: The page to wait for. + * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc). * * The caller should hold a reference on @page. They expect the page to * become unlocked relatively soon, but do not wish to hold up migration * (for example) by holding the reference while waiting for the page to * come unlocked. After this function returns, the caller should not * dereference @page. + * + * Return: 0 if the page was unlocked or -EINTR if interrupted by a signal. */ -void put_and_wait_on_page_locked(struct page *page) +int put_and_wait_on_page_locked(struct page *page, int state) { wait_queue_head_t *q; page = compound_head(page); q = page_waitqueue(page); - wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP); + return wait_on_page_bit_common(q, page, PG_locked, state, DROP); } /** @@ -1558,7 +1518,28 @@ EXPORT_SYMBOL_GPL(__lock_page_killable); int __lock_page_async(struct page *page, struct wait_page_queue *wait) { - return __wait_on_page_locked_async(page, wait, true); + struct wait_queue_head *q = page_waitqueue(page); + int ret = 0; + + wait->page = page; + wait->bit_nr = PG_locked; + + spin_lock_irq(&q->lock); + __add_wait_queue_entry_tail(q, &wait->wait); + SetPageWaiters(page); + ret = !trylock_page(page); + /* + * If we were successful now, we know we're still on the + * waitqueue as we're still under the lock. This means it's + * safe to remove and return success, we know the callback + * isn't going to trigger. + */ + if (!ret) + __remove_wait_queue(q, &wait->wait); + else + ret = -EIOCBQUEUED; + spin_unlock_irq(&q->lock); + return ret; } /* @@ -1677,8 +1658,8 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, } EXPORT_SYMBOL(page_cache_prev_miss); -/** - * find_get_entry - find and get a page cache entry +/* + * mapping_get_entry - Get a page cache entry. * @mapping: the address_space to search * @index: The page cache index. * @@ -1690,7 +1671,8 @@ EXPORT_SYMBOL(page_cache_prev_miss); * * Return: The head page or shadow entry, %NULL if nothing is found. */ -struct page *find_get_entry(struct address_space *mapping, pgoff_t index) +static struct page *mapping_get_entry(struct address_space *mapping, + pgoff_t index) { XA_STATE(xas, &mapping->i_pages, index); struct page *page; @@ -1727,39 +1709,6 @@ out: } /** - * find_lock_entry - Locate and lock a page cache entry. - * @mapping: The address_space to search. - * @index: The page cache index. - * - * Looks up the page at @mapping & @index. If there is a page in the - * cache, the head page is returned locked and with an increased refcount. - * - * If the slot holds a shadow entry of a previously evicted page, or a - * swap entry from shmem/tmpfs, it is returned. - * - * Context: May sleep. - * Return: The head page or shadow entry, %NULL if nothing is found. - */ -struct page *find_lock_entry(struct address_space *mapping, pgoff_t index) -{ - struct page *page; - -repeat: - page = find_get_entry(mapping, index); - if (page && !xa_is_value(page)) { - lock_page(page); - /* Has the page been truncated? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto repeat; - } - VM_BUG_ON_PAGE(!thp_contains(page, index), page); - } - return page; -} - -/** * pagecache_get_page - Find and get a reference to a page. * @mapping: The address_space to search. * @index: The page index. @@ -1774,6 +1723,8 @@ repeat: * * %FGP_LOCK - The page is returned locked. * * %FGP_HEAD - If the page is present and a THP, return the head page * rather than the exact page specified by the index. + * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it + * instead of allocating a new page to replace it. * * %FGP_CREAT - If no page is present then a new page is allocated using * @gfp_mask and added to the page cache and the VM's LRU list. * The page is returned locked and with an increased refcount. @@ -1797,9 +1748,12 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, struct page *page; repeat: - page = find_get_entry(mapping, index); - if (xa_is_value(page)) + page = mapping_get_entry(mapping, index); + if (xa_is_value(page)) { + if (fgp_flags & FGP_ENTRY) + return page; page = NULL; + } if (!page) goto no_page; @@ -1871,18 +1825,53 @@ no_page: } EXPORT_SYMBOL(pagecache_get_page); +static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max, + xa_mark_t mark) +{ + struct page *page; + +retry: + if (mark == XA_PRESENT) + page = xas_find(xas, max); + else + page = xas_find_marked(xas, max, mark); + + if (xas_retry(xas, page)) + goto retry; + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + if (!page || xa_is_value(page)) + return page; + + if (!page_cache_get_speculative(page)) + goto reset; + + /* Has the page moved or been split? */ + if (unlikely(page != xas_reload(xas))) { + put_page(page); + goto reset; + } + + return page; +reset: + xas_reset(xas); + goto retry; +} + /** * find_get_entries - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page cache index - * @nr_entries: The maximum number of entries - * @entries: Where the resulting entries are placed + * @end: The final page index (inclusive). + * @pvec: Where the resulting entries are placed. * @indices: The cache indices corresponding to the entries in @entries * - * find_get_entries() will search for and return a group of up to - * @nr_entries entries in the mapping. The entries are placed at - * @entries. find_get_entries() takes a reference against any actual - * pages it returns. + * find_get_entries() will search for and return a batch of entries in + * the mapping. The entries are placed in @pvec. find_get_entries() + * takes a reference on any actual pages it returns. * * The search returns a group of mapping-contiguous page cache entries * with ascending indexes. There may be holes in the indices due to @@ -1898,60 +1887,97 @@ EXPORT_SYMBOL(pagecache_get_page); * * Return: the number of pages and shadow entries which were found. */ -unsigned find_get_entries(struct address_space *mapping, - pgoff_t start, unsigned int nr_entries, - struct page **entries, pgoff_t *indices) +unsigned find_get_entries(struct address_space *mapping, pgoff_t start, + pgoff_t end, struct pagevec *pvec, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, start); struct page *page; unsigned int ret = 0; - - if (!nr_entries) - return 0; + unsigned nr_entries = PAGEVEC_SIZE; rcu_read_lock(); - xas_for_each(&xas, page, ULONG_MAX) { - if (xas_retry(&xas, page)) - continue; - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ - if (xa_is_value(page)) - goto export; - - if (!page_cache_get_speculative(page)) - goto retry; - - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) - goto put_page; - + while ((page = find_get_entry(&xas, end, XA_PRESENT))) { /* * Terminate early on finding a THP, to allow the caller to * handle it all at once; but continue if this is hugetlbfs. */ - if (PageTransHuge(page) && !PageHuge(page)) { + if (!xa_is_value(page) && PageTransHuge(page) && + !PageHuge(page)) { page = find_subpage(page, xas.xa_index); nr_entries = ret + 1; } -export: + indices[ret] = xas.xa_index; - entries[ret] = page; + pvec->pages[ret] = page; if (++ret == nr_entries) break; - continue; -put_page: - put_page(page); -retry: - xas_reset(&xas); } rcu_read_unlock(); + + pvec->nr = ret; return ret; } /** + * find_lock_entries - Find a batch of pagecache entries. + * @mapping: The address_space to search. + * @start: The starting page cache index. + * @end: The final page index (inclusive). + * @pvec: Where the resulting entries are placed. + * @indices: The cache indices of the entries in @pvec. + * + * find_lock_entries() will return a batch of entries from @mapping. + * Swap, shadow and DAX entries are included. Pages are returned + * locked and with an incremented refcount. Pages which are locked by + * somebody else or under writeback are skipped. Only the head page of + * a THP is returned. Pages which are partially outside the range are + * not returned. + * + * The entries have ascending indexes. The indices may not be consecutive + * due to not-present entries, THP pages, pages which could not be locked + * or pages under writeback. + * + * Return: The number of entries which were found. + */ +unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, + pgoff_t end, struct pagevec *pvec, pgoff_t *indices) +{ + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; + + rcu_read_lock(); + while ((page = find_get_entry(&xas, end, XA_PRESENT))) { + if (!xa_is_value(page)) { + if (page->index < start) + goto put; + VM_BUG_ON_PAGE(page->index != xas.xa_index, page); + if (page->index + thp_nr_pages(page) - 1 > end) + goto put; + if (!trylock_page(page)) + goto put; + if (page->mapping != mapping || PageWriteback(page)) + goto unlock; + VM_BUG_ON_PAGE(!thp_contains(page, xas.xa_index), + page); + } + indices[pvec->nr] = xas.xa_index; + if (!pagevec_add(pvec, page)) + break; + goto next; +unlock: + unlock_page(page); +put: + put_page(page); +next: + if (!xa_is_value(page) && PageTransHuge(page)) + xas_set(&xas, page->index + thp_nr_pages(page)); + } + rcu_read_unlock(); + + return pagevec_count(pvec); +} + +/** * find_get_pages_range - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page index @@ -1984,30 +2010,16 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, return 0; rcu_read_lock(); - xas_for_each(&xas, page, end) { - if (xas_retry(&xas, page)) - continue; + while ((page = find_get_entry(&xas, end, XA_PRESENT))) { /* Skip over shadow, swap and DAX entries */ if (xa_is_value(page)) continue; - if (!page_cache_get_speculative(page)) - goto retry; - - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) - goto put_page; - pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) { *start = xas.xa_index + 1; goto out; } - continue; -put_page: - put_page(page); -retry: - xas_reset(&xas); } /* @@ -2081,7 +2093,7 @@ retry: EXPORT_SYMBOL(find_get_pages_contig); /** - * find_get_pages_range_tag - find and return pages in given range matching @tag + * find_get_pages_range_tag - Find and return head pages matching @tag. * @mapping: the address_space to search * @index: the starting page index * @end: The final page index (inclusive) @@ -2089,8 +2101,9 @@ EXPORT_SYMBOL(find_get_pages_contig); * @nr_pages: the maximum number of pages * @pages: where the resulting pages are placed * - * Like find_get_pages, except we only return pages which are tagged with - * @tag. We update @index to index the next page for the traversal. + * Like find_get_pages(), except we only return head pages which are tagged + * with @tag. @index is updated to the index immediately after the last + * page we return, ready for the next iteration. * * Return: the number of pages which were found. */ @@ -2106,9 +2119,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, return 0; rcu_read_lock(); - xas_for_each_marked(&xas, page, end, tag) { - if (xas_retry(&xas, page)) - continue; + while ((page = find_get_entry(&xas, end, tag))) { /* * Shadow entries should never be tagged, but this iteration * is lockless so there is a window for page reclaim to evict @@ -2117,23 +2128,11 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, if (xa_is_value(page)) continue; - if (!page_cache_get_speculative(page)) - goto retry; - - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) - goto put_page; - - pages[ret] = find_subpage(page, xas.xa_index); + pages[ret] = page; if (++ret == nr_pages) { - *index = xas.xa_index + 1; + *index = page->index + thp_nr_pages(page); goto out; } - continue; -put_page: - put_page(page); -retry: - xas_reset(&xas); } /* @@ -2173,287 +2172,267 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) ra->ra_pages /= 4; } -static int lock_page_for_iocb(struct kiocb *iocb, struct page *page) +/* + * filemap_get_read_batch - Get a batch of pages for read + * + * Get a batch of pages which represent a contiguous range of bytes + * in the file. No tail pages will be returned. If @index is in the + * middle of a THP, the entire THP will be returned. The last page in + * the batch may have Readahead set or be not Uptodate so that the + * caller can take the appropriate action. + */ +static void filemap_get_read_batch(struct address_space *mapping, + pgoff_t index, pgoff_t max, struct pagevec *pvec) { - if (iocb->ki_flags & IOCB_WAITQ) - return lock_page_async(page, iocb->ki_waitq); - else if (iocb->ki_flags & IOCB_NOWAIT) - return trylock_page(page) ? 0 : -EAGAIN; - else - return lock_page_killable(page); + XA_STATE(xas, &mapping->i_pages, index); + struct page *head; + + rcu_read_lock(); + for (head = xas_load(&xas); head; head = xas_next(&xas)) { + if (xas_retry(&xas, head)) + continue; + if (xas.xa_index > max || xa_is_value(head)) + break; + if (!page_cache_get_speculative(head)) + goto retry; + + /* Has the page moved or been split? */ + if (unlikely(head != xas_reload(&xas))) + goto put_page; + + if (!pagevec_add(pvec, head)) + break; + if (!PageUptodate(head)) + break; + if (PageReadahead(head)) + break; + xas.xa_index = head->index + thp_nr_pages(head) - 1; + xas.xa_offset = (xas.xa_index >> xas.xa_shift) & XA_CHUNK_MASK; + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); + } + rcu_read_unlock(); } -static struct page * -generic_file_buffered_read_readpage(struct kiocb *iocb, - struct file *filp, - struct address_space *mapping, - struct page *page) +static int filemap_read_page(struct file *file, struct address_space *mapping, + struct page *page) { - struct file_ra_state *ra = &filp->f_ra; int error; - if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { - unlock_page(page); - put_page(page); - return ERR_PTR(-EAGAIN); - } - /* - * A previous I/O error may have been due to temporary - * failures, eg. multipath errors. - * PG_error will be set again if readpage fails. + * A previous I/O error may have been due to temporary failures, + * eg. multipath errors. PG_error will be set again if readpage + * fails. */ ClearPageError(page); /* Start the actual read. The read will unlock the page. */ - error = mapping->a_ops->readpage(filp, page); + error = mapping->a_ops->readpage(file, page); + if (error) + return error; - if (unlikely(error)) { - put_page(page); - return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL; - } + error = wait_on_page_locked_killable(page); + if (error) + return error; + if (PageUptodate(page)) + return 0; + if (!page->mapping) /* page truncated */ + return AOP_TRUNCATED_PAGE; + shrink_readahead_size_eio(&file->f_ra); + return -EIO; +} - if (!PageUptodate(page)) { - error = lock_page_for_iocb(iocb, page); - if (unlikely(error)) { - put_page(page); - return ERR_PTR(error); - } - if (!PageUptodate(page)) { - if (page->mapping == NULL) { - /* - * invalidate_mapping_pages got it - */ - unlock_page(page); - put_page(page); - return NULL; - } - unlock_page(page); - shrink_readahead_size_eio(ra); - put_page(page); - return ERR_PTR(-EIO); - } - unlock_page(page); +static bool filemap_range_uptodate(struct address_space *mapping, + loff_t pos, struct iov_iter *iter, struct page *page) +{ + int count; + + if (PageUptodate(page)) + return true; + /* pipes can't handle partially uptodate pages */ + if (iov_iter_is_pipe(iter)) + return false; + if (!mapping->a_ops->is_partially_uptodate) + return false; + if (mapping->host->i_blkbits >= (PAGE_SHIFT + thp_order(page))) + return false; + + count = iter->count; + if (page_offset(page) > pos) { + count -= page_offset(page) - pos; + pos = 0; + } else { + pos -= page_offset(page); } - return page; + return mapping->a_ops->is_partially_uptodate(page, pos, count); } -static struct page * -generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb, - struct file *filp, - struct iov_iter *iter, - struct page *page, - loff_t pos, loff_t count) +static int filemap_update_page(struct kiocb *iocb, + struct address_space *mapping, struct iov_iter *iter, + struct page *page) { - struct address_space *mapping = filp->f_mapping; - struct inode *inode = mapping->host; int error; - /* - * See comment in do_read_cache_page on why - * wait_on_page_locked is used to avoid unnecessarily - * serialisations and why it's safe. - */ - if (iocb->ki_flags & IOCB_WAITQ) { - error = wait_on_page_locked_async(page, - iocb->ki_waitq); - } else { - error = wait_on_page_locked_killable(page); - } - if (unlikely(error)) { - put_page(page); - return ERR_PTR(error); + if (!trylock_page(page)) { + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) + return -EAGAIN; + if (!(iocb->ki_flags & IOCB_WAITQ)) { + put_and_wait_on_page_locked(page, TASK_KILLABLE); + return AOP_TRUNCATED_PAGE; + } + error = __lock_page_async(page, iocb->ki_waitq); + if (error) + return error; } - if (PageUptodate(page)) - return page; - if (inode->i_blkbits == PAGE_SHIFT || - !mapping->a_ops->is_partially_uptodate) - goto page_not_up_to_date; - /* pipes can't handle partially uptodate pages */ - if (unlikely(iov_iter_is_pipe(iter))) - goto page_not_up_to_date; - if (!trylock_page(page)) - goto page_not_up_to_date; - /* Did it get truncated before we got the lock? */ if (!page->mapping) - goto page_not_up_to_date_locked; - if (!mapping->a_ops->is_partially_uptodate(page, - pos & ~PAGE_MASK, count)) - goto page_not_up_to_date_locked; - unlock_page(page); - return page; - -page_not_up_to_date: - /* Get exclusive access to the page ... */ - error = lock_page_for_iocb(iocb, page); - if (unlikely(error)) { - put_page(page); - return ERR_PTR(error); - } + goto truncated; -page_not_up_to_date_locked: - /* Did it get truncated before we got the lock? */ - if (!page->mapping) { - unlock_page(page); - put_page(page); - return NULL; - } + error = 0; + if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page)) + goto unlock; - /* Did somebody else fill it already? */ - if (PageUptodate(page)) { - unlock_page(page); - return page; - } + error = -EAGAIN; + if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) + goto unlock; - return generic_file_buffered_read_readpage(iocb, filp, mapping, page); + error = filemap_read_page(iocb->ki_filp, mapping, page); + if (error == AOP_TRUNCATED_PAGE) + put_page(page); + return error; +truncated: + unlock_page(page); + put_page(page); + return AOP_TRUNCATED_PAGE; +unlock: + unlock_page(page); + return error; } -static struct page * -generic_file_buffered_read_no_cached_page(struct kiocb *iocb, - struct iov_iter *iter) +static int filemap_create_page(struct file *file, + struct address_space *mapping, pgoff_t index, + struct pagevec *pvec) { - struct file *filp = iocb->ki_filp; - struct address_space *mapping = filp->f_mapping; - pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; struct page *page; int error; - if (iocb->ki_flags & IOCB_NOIO) - return ERR_PTR(-EAGAIN); - - /* - * Ok, it wasn't cached, so we need to create a new - * page.. - */ page = page_cache_alloc(mapping); if (!page) - return ERR_PTR(-ENOMEM); + return -ENOMEM; error = add_to_page_cache_lru(page, mapping, index, - mapping_gfp_constraint(mapping, GFP_KERNEL)); - if (error) { - put_page(page); - return error != -EEXIST ? ERR_PTR(error) : NULL; - } + mapping_gfp_constraint(mapping, GFP_KERNEL)); + if (error == -EEXIST) + error = AOP_TRUNCATED_PAGE; + if (error) + goto error; + + error = filemap_read_page(file, mapping, page); + if (error) + goto error; - return generic_file_buffered_read_readpage(iocb, filp, mapping, page); + pagevec_add(pvec, page); + return 0; +error: + put_page(page); + return error; +} + +static int filemap_readahead(struct kiocb *iocb, struct file *file, + struct address_space *mapping, struct page *page, + pgoff_t last_index) +{ + if (iocb->ki_flags & IOCB_NOIO) + return -EAGAIN; + page_cache_async_readahead(mapping, &file->f_ra, file, page, + page->index, last_index - page->index); + return 0; } -static int generic_file_buffered_read_get_pages(struct kiocb *iocb, - struct iov_iter *iter, - struct page **pages, - unsigned int nr) +static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, + struct pagevec *pvec) { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; - pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; - int i, j, nr_got, err = 0; + pgoff_t last_index; + struct page *page; + int err = 0; - nr = min_t(unsigned long, last_index - index, nr); -find_page: + last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE); +retry: if (fatal_signal_pending(current)) return -EINTR; - nr_got = find_get_pages_contig(mapping, index, nr, pages); - if (nr_got) - goto got_pages; - - if (iocb->ki_flags & IOCB_NOIO) - return -EAGAIN; - - page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); - - nr_got = find_get_pages_contig(mapping, index, nr, pages); - if (nr_got) - goto got_pages; - - pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter); - err = PTR_ERR_OR_ZERO(pages[0]); - if (!IS_ERR_OR_NULL(pages[0])) - nr_got = 1; -got_pages: - for (i = 0; i < nr_got; i++) { - struct page *page = pages[i]; - pgoff_t pg_index = index + i; - loff_t pg_pos = max(iocb->ki_pos, - (loff_t) pg_index << PAGE_SHIFT); - loff_t pg_count = iocb->ki_pos + iter->count - pg_pos; - - if (PageReadahead(page)) { - if (iocb->ki_flags & IOCB_NOIO) { - for (j = i; j < nr_got; j++) - put_page(pages[j]); - nr_got = i; - err = -EAGAIN; - break; - } - page_cache_async_readahead(mapping, ra, filp, page, - pg_index, last_index - pg_index); - } - - if (!PageUptodate(page)) { - if ((iocb->ki_flags & IOCB_NOWAIT) || - ((iocb->ki_flags & IOCB_WAITQ) && i)) { - for (j = i; j < nr_got; j++) - put_page(pages[j]); - nr_got = i; - err = -EAGAIN; - break; - } + filemap_get_read_batch(mapping, index, last_index, pvec); + if (!pagevec_count(pvec)) { + if (iocb->ki_flags & IOCB_NOIO) + return -EAGAIN; + page_cache_sync_readahead(mapping, ra, filp, index, + last_index - index); + filemap_get_read_batch(mapping, index, last_index, pvec); + } + if (!pagevec_count(pvec)) { + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) + return -EAGAIN; + err = filemap_create_page(filp, mapping, + iocb->ki_pos >> PAGE_SHIFT, pvec); + if (err == AOP_TRUNCATED_PAGE) + goto retry; + return err; + } - page = generic_file_buffered_read_pagenotuptodate(iocb, - filp, iter, page, pg_pos, pg_count); - if (IS_ERR_OR_NULL(page)) { - for (j = i + 1; j < nr_got; j++) - put_page(pages[j]); - nr_got = i; - err = PTR_ERR_OR_ZERO(page); - break; - } - } + page = pvec->pages[pagevec_count(pvec) - 1]; + if (PageReadahead(page)) { + err = filemap_readahead(iocb, filp, mapping, page, last_index); + if (err) + goto err; + } + if (!PageUptodate(page)) { + if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1) + iocb->ki_flags |= IOCB_NOWAIT; + err = filemap_update_page(iocb, mapping, iter, page); + if (err) + goto err; } - if (likely(nr_got)) - return nr_got; - if (err) - return err; - /* - * No pages and no error means we raced and should retry: - */ - goto find_page; + return 0; +err: + if (err < 0) + put_page(page); + if (likely(--pvec->nr)) + return 0; + if (err == AOP_TRUNCATED_PAGE) + goto retry; + return err; } /** - * generic_file_buffered_read - generic file read routine - * @iocb: the iocb to read - * @iter: data destination - * @written: already copied - * - * This is a generic file read routine, and uses the - * mapping->a_ops->readpage() function for the actual low-level stuff. + * filemap_read - Read data from the page cache. + * @iocb: The iocb to read. + * @iter: Destination for the data. + * @already_read: Number of bytes already read by the caller. * - * This is really ugly. But the goto's actually try to clarify some - * of the logic when it comes to error handling etc. + * Copies data from the page cache. If the data is not currently present, + * uses the readahead and readpage address_space operations to fetch it. * - * Return: - * * total number of bytes copied, including those the were already @written - * * negative error code if nothing was copied + * Return: Total number of bytes copied, including those already read by + * the caller. If an error happens before any bytes are copied, returns + * a negative error number. */ -ssize_t generic_file_buffered_read(struct kiocb *iocb, - struct iov_iter *iter, ssize_t written) +ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, + ssize_t already_read) { struct file *filp = iocb->ki_filp; struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL; - unsigned int nr_pages = min_t(unsigned int, 512, - ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) - - (iocb->ki_pos >> PAGE_SHIFT)); - int i, pg_nr, error = 0; + struct pagevec pvec; + int i, error = 0; bool writably_mapped; loff_t isize, end_offset; @@ -2463,14 +2442,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes); - - if (nr_pages > ARRAY_SIZE(pages_onstack)) - pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL); - - if (!pages) { - pages = pages_onstack; - nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack)); - } + pagevec_init(&pvec); do { cond_resched(); @@ -2480,16 +2452,12 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, * can no longer safely return -EIOCBQUEUED. Hence mark * an async read NOWAIT at that point. */ - if ((iocb->ki_flags & IOCB_WAITQ) && written) + if ((iocb->ki_flags & IOCB_WAITQ) && already_read) iocb->ki_flags |= IOCB_NOWAIT; - i = 0; - pg_nr = generic_file_buffered_read_get_pages(iocb, iter, - pages, nr_pages); - if (pg_nr < 0) { - error = pg_nr; + error = filemap_get_pages(iocb, iter, &pvec); + if (error < 0) break; - } /* * i_size must be checked after we know the pages are Uptodate. @@ -2502,13 +2470,8 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, isize = i_size_read(inode); if (unlikely(iocb->ki_pos >= isize)) goto put_pages; - end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); - while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr > - (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT) - put_page(pages[--pg_nr]); - /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: @@ -2521,27 +2484,35 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, */ if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT) - mark_page_accessed(pages[0]); - for (i = 1; i < pg_nr; i++) - mark_page_accessed(pages[i]); + mark_page_accessed(pvec.pages[0]); - for (i = 0; i < pg_nr; i++) { - unsigned int offset = iocb->ki_pos & ~PAGE_MASK; - unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos, - PAGE_SIZE - offset); - unsigned int copied; + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + size_t page_size = thp_size(page); + size_t offset = iocb->ki_pos & (page_size - 1); + size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, + page_size - offset); + size_t copied; + if (end_offset < page_offset(page)) + break; + if (i > 0) + mark_page_accessed(page); /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ - if (writably_mapped) - flush_dcache_page(pages[i]); + if (writably_mapped) { + int j; + + for (j = 0; j < thp_nr_pages(page); j++) + flush_dcache_page(page + j); + } - copied = copy_page_to_iter(pages[i], offset, bytes, iter); + copied = copy_page_to_iter(page, offset, bytes, iter); - written += copied; + already_read += copied; iocb->ki_pos += copied; ra->prev_pos = iocb->ki_pos; @@ -2551,18 +2522,16 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, } } put_pages: - for (i = 0; i < pg_nr; i++) - put_page(pages[i]); + for (i = 0; i < pagevec_count(&pvec); i++) + put_page(pvec.pages[i]); + pagevec_reinit(&pvec); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); - if (pages != pages_onstack) - kfree(pages); - - return written ? written : error; + return already_read ? already_read : error; } -EXPORT_SYMBOL_GPL(generic_file_buffered_read); +EXPORT_SYMBOL_GPL(filemap_read); /** * generic_file_read_iter - generic filesystem read routine @@ -2592,7 +2561,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t retval = 0; if (!count) - goto out; /* skip atime */ + return 0; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { struct file *file = iocb->ki_filp; @@ -2610,7 +2579,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos, iocb->ki_pos + count - 1); if (retval < 0) - goto out; + return retval; } file_accessed(file); @@ -2620,7 +2589,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos += retval; count -= retval; } - iov_iter_revert(iter, count - iov_iter_count(iter)); + if (retval != -EIOCBQUEUED) + iov_iter_revert(iter, count - iov_iter_count(iter)); /* * Btrfs can have a short DIO read if we encounter @@ -2633,15 +2603,116 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) */ if (retval < 0 || !count || iocb->ki_pos >= size || IS_DAX(inode)) - goto out; + return retval; } - retval = generic_file_buffered_read(iocb, iter, retval); -out: - return retval; + return filemap_read(iocb, iter, retval); } EXPORT_SYMBOL(generic_file_read_iter); +static inline loff_t page_seek_hole_data(struct xa_state *xas, + struct address_space *mapping, struct page *page, + loff_t start, loff_t end, bool seek_data) +{ + const struct address_space_operations *ops = mapping->a_ops; + size_t offset, bsz = i_blocksize(mapping->host); + + if (xa_is_value(page) || PageUptodate(page)) + return seek_data ? start : end; + if (!ops->is_partially_uptodate) + return seek_data ? end : start; + + xas_pause(xas); + rcu_read_unlock(); + lock_page(page); + if (unlikely(page->mapping != mapping)) + goto unlock; + + offset = offset_in_thp(page, start) & ~(bsz - 1); + + do { + if (ops->is_partially_uptodate(page, offset, bsz) == seek_data) + break; + start = (start + bsz) & ~(bsz - 1); + offset += bsz; + } while (offset < thp_size(page)); +unlock: + unlock_page(page); + rcu_read_lock(); + return start; +} + +static inline +unsigned int seek_page_size(struct xa_state *xas, struct page *page) +{ + if (xa_is_value(page)) + return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index); + return thp_size(page); +} + +/** + * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache. + * @mapping: Address space to search. + * @start: First byte to consider. + * @end: Limit of search (exclusive). + * @whence: Either SEEK_HOLE or SEEK_DATA. + * + * If the page cache knows which blocks contain holes and which blocks + * contain data, your filesystem can use this function to implement + * SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are + * entirely memory-based such as tmpfs, and filesystems which support + * unwritten extents. + * + * Return: The requested offset on successs, or -ENXIO if @whence specifies + * SEEK_DATA and there is no data after @start. There is an implicit hole + * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start + * and @end contain data. + */ +loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, + loff_t end, int whence) +{ + XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT); + pgoff_t max = (end - 1) / PAGE_SIZE; + bool seek_data = (whence == SEEK_DATA); + struct page *page; + + if (end <= start) + return -ENXIO; + + rcu_read_lock(); + while ((page = find_get_entry(&xas, max, XA_PRESENT))) { + loff_t pos = xas.xa_index * PAGE_SIZE; + + if (start < pos) { + if (!seek_data) + goto unlock; + start = pos; + } + + pos += seek_page_size(&xas, page); + start = page_seek_hole_data(&xas, mapping, page, start, pos, + seek_data); + if (start < pos) + goto unlock; + if (!xa_is_value(page)) + put_page(page); + } + rcu_read_unlock(); + + if (seek_data) + return -ENXIO; + goto out; + +unlock: + rcu_read_unlock(); + if (!xa_is_value(page)) + put_page(page); +out: + if (start > end) + return end; + return start; +} + #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) /* @@ -3431,7 +3502,8 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) } iocb->ki_pos = pos; } - iov_iter_revert(from, write_len - iov_iter_count(from)); + if (written != -EIOCBQUEUED) + iov_iter_revert(from, write_len - iov_iter_count(from)); out: return written; } |