diff options
Diffstat (limited to 'mm/filemap.c')
-rw-r--r-- | mm/filemap.c | 1004 |
1 files changed, 481 insertions, 523 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 3baf03c0f608..2fd9b2f24025 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -121,99 +121,97 @@ */ static void page_cache_delete(struct address_space *mapping, - struct page *page, void *shadow) + struct folio *folio, void *shadow) { - XA_STATE(xas, &mapping->i_pages, page->index); - unsigned int nr = 1; + XA_STATE(xas, &mapping->i_pages, folio->index); + long nr = 1; mapping_set_update(&xas, mapping); /* hugetlb pages are represented by a single entry in the xarray */ - if (!PageHuge(page)) { - xas_set_order(&xas, page->index, compound_order(page)); - nr = compound_nr(page); + if (!folio_test_hugetlb(folio)) { + xas_set_order(&xas, folio->index, folio_order(folio)); + nr = folio_nr_pages(folio); } - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(nr != 1 && shadow, page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); xas_store(&xas, shadow); xas_init_marks(&xas); - page->mapping = NULL; + folio->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ mapping->nrpages -= nr; } -static void unaccount_page_cache_page(struct address_space *mapping, - struct page *page) +static void filemap_unaccount_folio(struct address_space *mapping, + struct folio *folio) { - int nr; + long nr; /* * if we're uptodate, flush out into the cleancache, otherwise * invalidate any existing cleancache entries. We can't leave * stale data around in the cleancache once our page is gone */ - if (PageUptodate(page) && PageMappedToDisk(page)) - cleancache_put_page(page); + if (folio_test_uptodate(folio) && folio_test_mappedtodisk(folio)) + cleancache_put_page(&folio->page); else - cleancache_invalidate_page(mapping, page); + cleancache_invalidate_page(mapping, &folio->page); - VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(page_mapped(page), page); - if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) { + VM_BUG_ON_FOLIO(folio_mapped(folio), folio); + if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) { int mapcount; pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", - current->comm, page_to_pfn(page)); - dump_page(page, "still mapped when deleted"); + current->comm, folio_pfn(folio)); + dump_page(&folio->page, "still mapped when deleted"); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); - mapcount = page_mapcount(page); + mapcount = page_mapcount(&folio->page); if (mapping_exiting(mapping) && - page_count(page) >= mapcount + 2) { + folio_ref_count(folio) >= mapcount + 2) { /* * All vmas have already been torn down, so it's - * a good bet that actually the page is unmapped, + * a good bet that actually the folio is unmapped, * and we'd prefer not to leak it: if we're wrong, * some other bad page check should catch it later. */ - page_mapcount_reset(page); - page_ref_sub(page, mapcount); + page_mapcount_reset(&folio->page); + folio_ref_sub(folio, mapcount); } } - /* hugetlb pages do not participate in page cache accounting. */ - if (PageHuge(page)) + /* hugetlb folios do not participate in page cache accounting. */ + if (folio_test_hugetlb(folio)) return; - nr = thp_nr_pages(page); + nr = folio_nr_pages(folio); - __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr); - if (PageSwapBacked(page)) { - __mod_lruvec_page_state(page, NR_SHMEM, -nr); - if (PageTransHuge(page)) - __mod_lruvec_page_state(page, NR_SHMEM_THPS, -nr); - } else if (PageTransHuge(page)) { - __mod_lruvec_page_state(page, NR_FILE_THPS, -nr); + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); + if (folio_test_swapbacked(folio)) { + __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); + if (folio_test_pmd_mappable(folio)) + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); + } else if (folio_test_pmd_mappable(folio)) { + __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } /* - * At this point page must be either written or cleaned by - * truncate. Dirty page here signals a bug and loss of + * At this point folio must be either written or cleaned by + * truncate. Dirty folio here signals a bug and loss of * unwritten data. * - * This fixes dirty accounting after removing the page entirely - * but leaves PageDirty set: it has no effect for truncated - * page and anyway will be cleared before returning page into + * This fixes dirty accounting after removing the folio entirely + * but leaves the dirty flag set: it has no effect for truncated + * folio and anyway will be cleared before returning folio to * buddy allocator. */ - if (WARN_ON_ONCE(PageDirty(page))) - account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); + if (WARN_ON_ONCE(folio_test_dirty(folio))) + folio_account_cleaned(folio, mapping, + inode_to_wb(mapping->host)); } /* @@ -221,87 +219,83 @@ static void unaccount_page_cache_page(struct address_space *mapping, * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the i_pages lock. */ -void __delete_from_page_cache(struct page *page, void *shadow) +void __filemap_remove_folio(struct folio *folio, void *shadow) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; - trace_mm_filemap_delete_from_page_cache(page); - - unaccount_page_cache_page(mapping, page); - page_cache_delete(mapping, page, shadow); + trace_mm_filemap_delete_from_page_cache(folio); + filemap_unaccount_folio(mapping, folio); + page_cache_delete(mapping, folio, shadow); } -static void page_cache_free_page(struct address_space *mapping, - struct page *page) +void filemap_free_folio(struct address_space *mapping, struct folio *folio) { void (*freepage)(struct page *); freepage = mapping->a_ops->freepage; if (freepage) - freepage(page); + freepage(&folio->page); - if (PageTransHuge(page) && !PageHuge(page)) { - page_ref_sub(page, thp_nr_pages(page)); - VM_BUG_ON_PAGE(page_count(page) <= 0, page); + if (folio_test_large(folio) && !folio_test_hugetlb(folio)) { + folio_ref_sub(folio, folio_nr_pages(folio)); + VM_BUG_ON_FOLIO(folio_ref_count(folio) <= 0, folio); } else { - put_page(page); + folio_put(folio); } } /** - * delete_from_page_cache - delete page from page cache - * @page: the page which the kernel is trying to remove from page cache + * filemap_remove_folio - Remove folio from page cache. + * @folio: The folio. * - * This must be called only on pages that have been verified to be in the page - * cache and locked. It will never put the page into the free list, the caller - * has a reference on the page. + * This must be called only on folios that are locked and have been + * verified to be in the page cache. It will never put the folio into + * the free list because the caller has a reference on the page. */ -void delete_from_page_cache(struct page *page) +void filemap_remove_folio(struct folio *folio) { - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = folio->mapping; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); - __delete_from_page_cache(page, NULL); + __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); - page_cache_free_page(mapping, page); + filemap_free_folio(mapping, folio); } -EXPORT_SYMBOL(delete_from_page_cache); /* - * page_cache_delete_batch - delete several pages from page cache - * @mapping: the mapping to which pages belong - * @pvec: pagevec with pages to delete + * page_cache_delete_batch - delete several folios from page cache + * @mapping: the mapping to which folios belong + * @fbatch: batch of folios to delete * - * The function walks over mapping->i_pages and removes pages passed in @pvec - * from the mapping. The function expects @pvec to be sorted by page index - * and is optimised for it to be dense. - * It tolerates holes in @pvec (mapping entries at those indices are not - * modified). The function expects only THP head pages to be present in the - * @pvec. + * The function walks over mapping->i_pages and removes folios passed in + * @fbatch from the mapping. The function expects @fbatch to be sorted + * by page index and is optimised for it to be dense. + * It tolerates holes in @fbatch (mapping entries at those indices are not + * modified). * * The function expects the i_pages lock to be held. */ static void page_cache_delete_batch(struct address_space *mapping, - struct pagevec *pvec) + struct folio_batch *fbatch) { - XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); - int total_pages = 0; + XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index); + long total_pages = 0; int i = 0; - struct page *page; + struct folio *folio; mapping_set_update(&xas, mapping); - xas_for_each(&xas, page, ULONG_MAX) { - if (i >= pagevec_count(pvec)) + xas_for_each(&xas, folio, ULONG_MAX) { + if (i >= folio_batch_count(fbatch)) break; /* A swap/dax/shadow entry got inserted? Skip it. */ - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; /* * A page got inserted in our range? Skip it. We have our @@ -310,54 +304,48 @@ static void page_cache_delete_batch(struct address_space *mapping, * means our page has been removed, which shouldn't be * possible because we're holding the PageLock. */ - if (page != pvec->pages[i]) { - VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index, - page); + if (folio != fbatch->folios[i]) { + VM_BUG_ON_FOLIO(folio->index > + fbatch->folios[i]->index, folio); continue; } - WARN_ON_ONCE(!PageLocked(page)); + WARN_ON_ONCE(!folio_test_locked(folio)); - if (page->index == xas.xa_index) - page->mapping = NULL; - /* Leave page->index set: truncation lookup relies on it */ + folio->mapping = NULL; + /* Leave folio->index set: truncation lookup relies on it */ - /* - * Move to the next page in the vector if this is a regular - * page or the index is of the last sub-page of this compound - * page. - */ - if (page->index + compound_nr(page) - 1 == xas.xa_index) - i++; + i++; xas_store(&xas, NULL); - total_pages++; + total_pages += folio_nr_pages(folio); } mapping->nrpages -= total_pages; } void delete_from_page_cache_batch(struct address_space *mapping, - struct pagevec *pvec) + struct folio_batch *fbatch) { int i; - if (!pagevec_count(pvec)) + if (!folio_batch_count(fbatch)) return; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); - for (i = 0; i < pagevec_count(pvec); i++) { - trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); + for (i = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; - unaccount_page_cache_page(mapping, pvec->pages[i]); + trace_mm_filemap_delete_from_page_cache(folio); + filemap_unaccount_folio(mapping, folio); } - page_cache_delete_batch(mapping, pvec); + page_cache_delete_batch(mapping, fbatch); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); - for (i = 0; i < pagevec_count(pvec); i++) - page_cache_free_page(mapping, pvec->pages[i]); + for (i = 0; i < folio_batch_count(fbatch); i++) + filemap_free_folio(mapping, fbatch->folios[i]); } int filemap_check_errors(struct address_space *mapping) @@ -933,7 +921,7 @@ unlock: goto error; } - trace_mm_filemap_add_to_page_cache(&folio->page); + trace_mm_filemap_add_to_page_cache(folio); return 0; error: folio->mapping = NULL; @@ -1233,10 +1221,10 @@ enum behavior { * __folio_lock() waiting on then setting PG_locked. */ SHARED, /* Hold ref to page and check the bit when woken, like - * wait_on_page_writeback() waiting on PG_writeback. + * folio_wait_writeback() waiting on PG_writeback. */ DROP, /* Drop ref to page before wait, no check when woken, - * like put_and_wait_on_page_locked() on PG_locked. + * like folio_put_wait_locked() on PG_locked. */ }; @@ -1413,22 +1401,21 @@ int folio_wait_bit_killable(struct folio *folio, int bit_nr) EXPORT_SYMBOL(folio_wait_bit_killable); /** - * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked - * @page: The page to wait for. + * folio_put_wait_locked - Drop a reference and wait for it to be unlocked + * @folio: The folio to wait for. * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc). * - * The caller should hold a reference on @page. They expect the page to + * The caller should hold a reference on @folio. They expect the page to * become unlocked relatively soon, but do not wish to hold up migration - * (for example) by holding the reference while waiting for the page to + * (for example) by holding the reference while waiting for the folio to * come unlocked. After this function returns, the caller should not - * dereference @page. + * dereference @folio. * - * Return: 0 if the page was unlocked or -EINTR if interrupted by a signal. + * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal. */ -int put_and_wait_on_page_locked(struct page *page, int state) +int folio_put_wait_locked(struct folio *folio, int state) { - return folio_wait_bit_common(page_folio(page), PG_locked, state, - DROP); + return folio_wait_bit_common(folio, PG_locked, state, DROP); } /** @@ -1953,37 +1940,36 @@ no_page: } EXPORT_SYMBOL(__filemap_get_folio); -static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max, +static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max, xa_mark_t mark) { - struct page *page; + struct folio *folio; retry: if (mark == XA_PRESENT) - page = xas_find(xas, max); + folio = xas_find(xas, max); else - page = xas_find_marked(xas, max, mark); + folio = xas_find_marked(xas, max, mark); - if (xas_retry(xas, page)) + if (xas_retry(xas, folio)) goto retry; /* * A shadow entry of a recently evicted page, a swap * entry from shmem/tmpfs or a DAX entry. Return it * without attempting to raise page count. */ - if (!page || xa_is_value(page)) - return page; + if (!folio || xa_is_value(folio)) + return folio; - if (!page_cache_get_speculative(page)) + if (!folio_try_get_rcu(folio)) goto reset; - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(xas))) { - put_page(page); + if (unlikely(folio != xas_reload(xas))) { + folio_put(folio); goto reset; } - return page; + return folio; reset: xas_reset(xas); goto retry; @@ -1994,56 +1980,36 @@ reset: * @mapping: The address_space to search * @start: The starting page cache index * @end: The final page index (inclusive). - * @pvec: Where the resulting entries are placed. + * @fbatch: Where the resulting entries are placed. * @indices: The cache indices corresponding to the entries in @entries * * find_get_entries() will search for and return a batch of entries in - * the mapping. The entries are placed in @pvec. find_get_entries() - * takes a reference on any actual pages it returns. + * the mapping. The entries are placed in @fbatch. find_get_entries() + * takes a reference on any actual folios it returns. * - * The search returns a group of mapping-contiguous page cache entries - * with ascending indexes. There may be holes in the indices due to - * not-present pages. + * The entries have ascending indexes. The indices may not be consecutive + * due to not-present entries or large folios. * - * Any shadow entries of evicted pages, or swap entries from + * Any shadow entries of evicted folios, or swap entries from * shmem/tmpfs, are included in the returned array. * - * If it finds a Transparent Huge Page, head or tail, find_get_entries() - * stops at that page: the caller is likely to have a better way to handle - * the compound page as a whole, and then skip its extent, than repeatedly - * calling find_get_entries() to return all its tails. - * - * Return: the number of pages and shadow entries which were found. + * Return: The number of entries which were found. */ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, - pgoff_t end, struct pagevec *pvec, pgoff_t *indices) + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, start); - struct page *page; - unsigned int ret = 0; - unsigned nr_entries = PAGEVEC_SIZE; + struct folio *folio; rcu_read_lock(); - while ((page = find_get_entry(&xas, end, XA_PRESENT))) { - /* - * Terminate early on finding a THP, to allow the caller to - * handle it all at once; but continue if this is hugetlbfs. - */ - if (!xa_is_value(page) && PageTransHuge(page) && - !PageHuge(page)) { - page = find_subpage(page, xas.xa_index); - nr_entries = ret + 1; - } - - indices[ret] = xas.xa_index; - pvec->pages[ret] = page; - if (++ret == nr_entries) + while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { + indices[fbatch->nr] = xas.xa_index; + if (!folio_batch_add(fbatch, folio)) break; } rcu_read_unlock(); - pvec->nr = ret; - return ret; + return folio_batch_count(fbatch); } /** @@ -2051,63 +2017,64 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, * @mapping: The address_space to search. * @start: The starting page cache index. * @end: The final page index (inclusive). - * @pvec: Where the resulting entries are placed. - * @indices: The cache indices of the entries in @pvec. + * @fbatch: Where the resulting entries are placed. + * @indices: The cache indices of the entries in @fbatch. * * find_lock_entries() will return a batch of entries from @mapping. - * Swap, shadow and DAX entries are included. Pages are returned - * locked and with an incremented refcount. Pages which are locked by - * somebody else or under writeback are skipped. Only the head page of - * a THP is returned. Pages which are partially outside the range are - * not returned. + * Swap, shadow and DAX entries are included. Folios are returned + * locked and with an incremented refcount. Folios which are locked + * by somebody else or under writeback are skipped. Folios which are + * partially outside the range are not returned. * * The entries have ascending indexes. The indices may not be consecutive - * due to not-present entries, THP pages, pages which could not be locked - * or pages under writeback. + * due to not-present entries, large folios, folios which could not be + * locked or folios under writeback. * * Return: The number of entries which were found. */ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, - pgoff_t end, struct pagevec *pvec, pgoff_t *indices) + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, start); - struct page *page; + struct folio *folio; rcu_read_lock(); - while ((page = find_get_entry(&xas, end, XA_PRESENT))) { - if (!xa_is_value(page)) { - if (page->index < start) + while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { + if (!xa_is_value(folio)) { + if (folio->index < start) goto put; - if (page->index + thp_nr_pages(page) - 1 > end) + if (folio->index + folio_nr_pages(folio) - 1 > end) goto put; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto put; - if (page->mapping != mapping || PageWriteback(page)) + if (folio->mapping != mapping || + folio_test_writeback(folio)) goto unlock; - VM_BUG_ON_PAGE(!thp_contains(page, xas.xa_index), - page); + VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index), + folio); } - indices[pvec->nr] = xas.xa_index; - if (!pagevec_add(pvec, page)) + indices[fbatch->nr] = xas.xa_index; + if (!folio_batch_add(fbatch, folio)) break; - goto next; + continue; unlock: - unlock_page(page); + folio_unlock(folio); put: - put_page(page); -next: - if (!xa_is_value(page) && PageTransHuge(page)) { - unsigned int nr_pages = thp_nr_pages(page); - - /* Final THP may cross MAX_LFS_FILESIZE on 32-bit */ - xas_set(&xas, page->index + nr_pages); - if (xas.xa_index < nr_pages) - break; - } + folio_put(folio); } rcu_read_unlock(); - return pagevec_count(pvec); + return folio_batch_count(fbatch); +} + +static inline +bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max) +{ + if (!folio_test_large(folio) || folio_test_hugetlb(folio)) + return false; + if (index >= max) + return false; + return index < folio->index + folio_nr_pages(folio) - 1; } /** @@ -2136,23 +2103,29 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, struct page **pages) { XA_STATE(xas, &mapping->i_pages, *start); - struct page *page; + struct folio *folio; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - while ((page = find_get_entry(&xas, end, XA_PRESENT))) { + while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { /* Skip over shadow, swap and DAX entries */ - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; - pages[ret] = find_subpage(page, xas.xa_index); +again: + pages[ret] = folio_file_page(folio, xas.xa_index); if (++ret == nr_pages) { *start = xas.xa_index + 1; goto out; } + if (folio_more_pages(folio, xas.xa_index, end)) { + xas.xa_index++; + folio_ref_inc(folio); + goto again; + } } /* @@ -2187,36 +2160,41 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) { XA_STATE(xas, &mapping->i_pages, index); - struct page *page; + struct folio *folio; unsigned int ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - for (page = xas_load(&xas); page; page = xas_next(&xas)) { - if (xas_retry(&xas, page)) + for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { + if (xas_retry(&xas, folio)) continue; /* * If the entry has been swapped out, we can stop looking. * No current caller is looking for DAX entries. */ - if (xa_is_value(page)) + if (xa_is_value(folio)) break; - if (!page_cache_get_speculative(page)) + if (!folio_try_get_rcu(folio)) goto retry; - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) + if (unlikely(folio != xas_reload(&xas))) goto put_page; - pages[ret] = find_subpage(page, xas.xa_index); +again: + pages[ret] = folio_file_page(folio, xas.xa_index); if (++ret == nr_pages) break; + if (folio_more_pages(folio, xas.xa_index, ULONG_MAX)) { + xas.xa_index++; + folio_ref_inc(folio); + goto again; + } continue; put_page: - put_page(page); + folio_put(folio); retry: xas_reset(&xas); } @@ -2245,25 +2223,25 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, struct page **pages) { XA_STATE(xas, &mapping->i_pages, *index); - struct page *page; + struct folio *folio; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - while ((page = find_get_entry(&xas, end, tag))) { + while ((folio = find_get_entry(&xas, end, tag))) { /* * Shadow entries should never be tagged, but this iteration * is lockless so there is a window for page reclaim to evict * a page we saw tagged. Skip over it. */ - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; - pages[ret] = page; + pages[ret] = &folio->page; if (++ret == nr_pages) { - *index = page->index + thp_nr_pages(page); + *index = folio->index + folio_nr_pages(folio); goto out; } } @@ -2306,52 +2284,50 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) } /* - * filemap_get_read_batch - Get a batch of pages for read + * filemap_get_read_batch - Get a batch of folios for read * - * Get a batch of pages which represent a contiguous range of bytes - * in the file. No tail pages will be returned. If @index is in the - * middle of a THP, the entire THP will be returned. The last page in - * the batch may have Readahead set or be not Uptodate so that the - * caller can take the appropriate action. + * Get a batch of folios which represent a contiguous range of bytes in + * the file. No exceptional entries will be returned. If @index is in + * the middle of a folio, the entire folio will be returned. The last + * folio in the batch may have the readahead flag set or the uptodate flag + * clear so that the caller can take the appropriate action. */ static void filemap_get_read_batch(struct address_space *mapping, - pgoff_t index, pgoff_t max, struct pagevec *pvec) + pgoff_t index, pgoff_t max, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, index); - struct page *head; + struct folio *folio; rcu_read_lock(); - for (head = xas_load(&xas); head; head = xas_next(&xas)) { - if (xas_retry(&xas, head)) + for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { + if (xas_retry(&xas, folio)) continue; - if (xas.xa_index > max || xa_is_value(head)) + if (xas.xa_index > max || xa_is_value(folio)) break; - if (!page_cache_get_speculative(head)) + if (!folio_try_get_rcu(folio)) goto retry; - /* Has the page moved or been split? */ - if (unlikely(head != xas_reload(&xas))) - goto put_page; + if (unlikely(folio != xas_reload(&xas))) + goto put_folio; - if (!pagevec_add(pvec, head)) + if (!folio_batch_add(fbatch, folio)) break; - if (!PageUptodate(head)) + if (!folio_test_uptodate(folio)) break; - if (PageReadahead(head)) + if (folio_test_readahead(folio)) break; - xas.xa_index = head->index + thp_nr_pages(head) - 1; - xas.xa_offset = (xas.xa_index >> xas.xa_shift) & XA_CHUNK_MASK; + xas_advance(&xas, folio->index + folio_nr_pages(folio) - 1); continue; -put_page: - put_page(head); +put_folio: + folio_put(folio); retry: xas_reset(&xas); } rcu_read_unlock(); } -static int filemap_read_page(struct file *file, struct address_space *mapping, - struct page *page) +static int filemap_read_folio(struct file *file, struct address_space *mapping, + struct folio *folio) { int error; @@ -2360,52 +2336,51 @@ static int filemap_read_page(struct file *file, struct address_space *mapping, * eg. multipath errors. PG_error will be set again if readpage * fails. */ - ClearPageError(page); + folio_clear_error(folio); /* Start the actual read. The read will unlock the page. */ - error = mapping->a_ops->readpage(file, page); + error = mapping->a_ops->readpage(file, &folio->page); if (error) return error; - error = wait_on_page_locked_killable(page); + error = folio_wait_locked_killable(folio); if (error) return error; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) return 0; shrink_readahead_size_eio(&file->f_ra); return -EIO; } static bool filemap_range_uptodate(struct address_space *mapping, - loff_t pos, struct iov_iter *iter, struct page *page) + loff_t pos, struct iov_iter *iter, struct folio *folio) { int count; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) return true; /* pipes can't handle partially uptodate pages */ if (iov_iter_is_pipe(iter)) return false; if (!mapping->a_ops->is_partially_uptodate) return false; - if (mapping->host->i_blkbits >= (PAGE_SHIFT + thp_order(page))) + if (mapping->host->i_blkbits >= folio_shift(folio)) return false; count = iter->count; - if (page_offset(page) > pos) { - count -= page_offset(page) - pos; + if (folio_pos(folio) > pos) { + count -= folio_pos(folio) - pos; pos = 0; } else { - pos -= page_offset(page); + pos -= folio_pos(folio); } - return mapping->a_ops->is_partially_uptodate(page, pos, count); + return mapping->a_ops->is_partially_uptodate(&folio->page, pos, count); } static int filemap_update_page(struct kiocb *iocb, struct address_space *mapping, struct iov_iter *iter, - struct page *page) + struct folio *folio) { - struct folio *folio = page_folio(page); int error; if (iocb->ki_flags & IOCB_NOWAIT) { @@ -2421,7 +2396,11 @@ static int filemap_update_page(struct kiocb *iocb, goto unlock_mapping; if (!(iocb->ki_flags & IOCB_WAITQ)) { filemap_invalidate_unlock_shared(mapping); - put_and_wait_on_page_locked(&folio->page, TASK_KILLABLE); + /* + * This is where we usually end up waiting for a + * previously submitted readahead to finish. + */ + folio_put_wait_locked(folio, TASK_KILLABLE); return AOP_TRUNCATED_PAGE; } error = __folio_lock_async(folio, iocb->ki_waitq); @@ -2434,14 +2413,14 @@ static int filemap_update_page(struct kiocb *iocb, goto unlock; error = 0; - if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, &folio->page)) + if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, folio)) goto unlock; error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) goto unlock; - error = filemap_read_page(iocb->ki_filp, mapping, &folio->page); + error = filemap_read_folio(iocb->ki_filp, mapping, folio); goto unlock_mapping; unlock: folio_unlock(folio); @@ -2452,70 +2431,72 @@ unlock_mapping: return error; } -static int filemap_create_page(struct file *file, +static int filemap_create_folio(struct file *file, struct address_space *mapping, pgoff_t index, - struct pagevec *pvec) + struct folio_batch *fbatch) { - struct page *page; + struct folio *folio; int error; - page = page_cache_alloc(mapping); - if (!page) + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0); + if (!folio) return -ENOMEM; /* - * Protect against truncate / hole punch. Grabbing invalidate_lock here - * assures we cannot instantiate and bring uptodate new pagecache pages - * after evicting page cache during truncate and before actually - * freeing blocks. Note that we could release invalidate_lock after - * inserting the page into page cache as the locked page would then be - * enough to synchronize with hole punching. But there are code paths - * such as filemap_update_page() filling in partially uptodate pages or - * ->readpages() that need to hold invalidate_lock while mapping blocks - * for IO so let's hold the lock here as well to keep locking rules - * simple. + * Protect against truncate / hole punch. Grabbing invalidate_lock + * here assures we cannot instantiate and bring uptodate new + * pagecache folios after evicting page cache during truncate + * and before actually freeing blocks. Note that we could + * release invalidate_lock after inserting the folio into + * the page cache as the locked folio would then be enough to + * synchronize with hole punching. But there are code paths + * such as filemap_update_page() filling in partially uptodate + * pages or ->readpages() that need to hold invalidate_lock + * while mapping blocks for IO so let's hold the lock here as + * well to keep locking rules simple. */ filemap_invalidate_lock_shared(mapping); - error = add_to_page_cache_lru(page, mapping, index, + error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) error = AOP_TRUNCATED_PAGE; if (error) goto error; - error = filemap_read_page(file, mapping, page); + error = filemap_read_folio(file, mapping, folio); if (error) goto error; filemap_invalidate_unlock_shared(mapping); - pagevec_add(pvec, page); + folio_batch_add(fbatch, folio); return 0; error: filemap_invalidate_unlock_shared(mapping); - put_page(page); + folio_put(folio); return error; } static int filemap_readahead(struct kiocb *iocb, struct file *file, - struct address_space *mapping, struct page *page, + struct address_space *mapping, struct folio *folio, pgoff_t last_index) { + DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index); + if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; - page_cache_async_readahead(mapping, &file->f_ra, file, page, - page->index, last_index - page->index); + page_cache_async_ra(&ractl, folio, last_index - folio->index); return 0; } static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, - struct pagevec *pvec) + struct folio_batch *fbatch) { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; - struct page *page; + struct folio *folio; int err = 0; last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE); @@ -2523,34 +2504,35 @@ retry: if (fatal_signal_pending(current)) return -EINTR; - filemap_get_read_batch(mapping, index, last_index, pvec); - if (!pagevec_count(pvec)) { + filemap_get_read_batch(mapping, index, last_index, fbatch); + if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); - filemap_get_read_batch(mapping, index, last_index, pvec); + filemap_get_read_batch(mapping, index, last_index, fbatch); } - if (!pagevec_count(pvec)) { + if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) return -EAGAIN; - err = filemap_create_page(filp, mapping, - iocb->ki_pos >> PAGE_SHIFT, pvec); + err = filemap_create_folio(filp, mapping, + iocb->ki_pos >> PAGE_SHIFT, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; } - page = pvec->pages[pagevec_count(pvec) - 1]; - if (PageReadahead(page)) { - err = filemap_readahead(iocb, filp, mapping, page, last_index); + folio = fbatch->folios[folio_batch_count(fbatch) - 1]; + if (folio_test_readahead(folio)) { + err = filemap_readahead(iocb, filp, mapping, folio, last_index); if (err) goto err; } - if (!PageUptodate(page)) { - if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1) + if (!folio_test_uptodate(folio)) { + if ((iocb->ki_flags & IOCB_WAITQ) && + folio_batch_count(fbatch) > 1) iocb->ki_flags |= IOCB_NOWAIT; - err = filemap_update_page(iocb, mapping, iter, page); + err = filemap_update_page(iocb, mapping, iter, folio); if (err) goto err; } @@ -2558,8 +2540,8 @@ retry: return 0; err: if (err < 0) - put_page(page); - if (likely(--pvec->nr)) + folio_put(folio); + if (likely(--fbatch->nr)) return 0; if (err == AOP_TRUNCATED_PAGE) goto retry; @@ -2586,7 +2568,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - struct pagevec pvec; + struct folio_batch fbatch; int i, error = 0; bool writably_mapped; loff_t isize, end_offset; @@ -2597,7 +2579,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes); - pagevec_init(&pvec); + folio_batch_init(&fbatch); do { cond_resched(); @@ -2613,7 +2595,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; - error = filemap_get_pages(iocb, iter, &pvec); + error = filemap_get_pages(iocb, iter, &fbatch); if (error < 0) break; @@ -2627,7 +2609,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, */ isize = i_size_read(inode); if (unlikely(iocb->ki_pos >= isize)) - goto put_pages; + goto put_folios; end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); /* @@ -2642,33 +2624,29 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, */ if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT) - mark_page_accessed(pvec.pages[0]); + folio_mark_accessed(fbatch.folios[0]); - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - size_t page_size = thp_size(page); - size_t offset = iocb->ki_pos & (page_size - 1); + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + size_t fsize = folio_size(folio); + size_t offset = iocb->ki_pos & (fsize - 1); size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, - page_size - offset); + fsize - offset); size_t copied; - if (end_offset < page_offset(page)) + if (end_offset < folio_pos(folio)) break; if (i > 0) - mark_page_accessed(page); + folio_mark_accessed(folio); /* - * If users can be writing to this page using arbitrary - * virtual addresses, take care about potential aliasing - * before reading the page on the kernel side. + * If users can be writing to this folio using arbitrary + * virtual addresses, take care of potential aliasing + * before reading the folio on the kernel side. */ - if (writably_mapped) { - int j; - - for (j = 0; j < thp_nr_pages(page); j++) - flush_dcache_page(page + j); - } + if (writably_mapped) + flush_dcache_folio(folio); - copied = copy_page_to_iter(page, offset, bytes, iter); + copied = copy_folio_to_iter(folio, offset, bytes, iter); already_read += copied; iocb->ki_pos += copied; @@ -2679,10 +2657,10 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, break; } } -put_pages: - for (i = 0; i < pagevec_count(&pvec); i++) - put_page(pvec.pages[i]); - pagevec_reinit(&pvec); +put_folios: + for (i = 0; i < folio_batch_count(&fbatch); i++) + folio_put(fbatch.folios[i]); + folio_batch_init(&fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); @@ -2767,44 +2745,44 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) } EXPORT_SYMBOL(generic_file_read_iter); -static inline loff_t page_seek_hole_data(struct xa_state *xas, - struct address_space *mapping, struct page *page, +static inline loff_t folio_seek_hole_data(struct xa_state *xas, + struct address_space *mapping, struct folio *folio, loff_t start, loff_t end, bool seek_data) { const struct address_space_operations *ops = mapping->a_ops; size_t offset, bsz = i_blocksize(mapping->host); - if (xa_is_value(page) || PageUptodate(page)) + if (xa_is_value(folio) || folio_test_uptodate(folio)) return seek_data ? start : end; if (!ops->is_partially_uptodate) return seek_data ? end : start; xas_pause(xas); rcu_read_unlock(); - lock_page(page); - if (unlikely(page->mapping != mapping)) + folio_lock(folio); + if (unlikely(folio->mapping != mapping)) goto unlock; - offset = offset_in_thp(page, start) & ~(bsz - 1); + offset = offset_in_folio(folio, start) & ~(bsz - 1); do { - if (ops->is_partially_uptodate(page, offset, bsz) == seek_data) + if (ops->is_partially_uptodate(&folio->page, offset, bsz) == + seek_data) break; start = (start + bsz) & ~(bsz - 1); offset += bsz; - } while (offset < thp_size(page)); + } while (offset < folio_size(folio)); unlock: - unlock_page(page); + folio_unlock(folio); rcu_read_lock(); return start; } -static inline -unsigned int seek_page_size(struct xa_state *xas, struct page *page) +static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio) { - if (xa_is_value(page)) + if (xa_is_value(folio)) return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index); - return thp_size(page); + return folio_size(folio); } /** @@ -2831,15 +2809,15 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT); pgoff_t max = (end - 1) >> PAGE_SHIFT; bool seek_data = (whence == SEEK_DATA); - struct page *page; + struct folio *folio; if (end <= start) return -ENXIO; rcu_read_lock(); - while ((page = find_get_entry(&xas, max, XA_PRESENT))) { + while ((folio = find_get_entry(&xas, max, XA_PRESENT))) { loff_t pos = (u64)xas.xa_index << PAGE_SHIFT; - unsigned int seek_size; + size_t seek_size; if (start < pos) { if (!seek_data) @@ -2847,9 +2825,9 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, start = pos; } - seek_size = seek_page_size(&xas, page); - pos = round_up(pos + 1, seek_size); - start = page_seek_hole_data(&xas, mapping, page, start, pos, + seek_size = seek_folio_size(&xas, folio); + pos = round_up((u64)pos + 1, seek_size); + start = folio_seek_hole_data(&xas, mapping, folio, start, pos, seek_data); if (start < pos) goto unlock; @@ -2857,15 +2835,15 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, break; if (seek_size > PAGE_SIZE) xas_set(&xas, pos >> PAGE_SHIFT); - if (!xa_is_value(page)) - put_page(page); + if (!xa_is_value(folio)) + folio_put(folio); } if (seek_data) start = -ENXIO; unlock: rcu_read_unlock(); - if (page && !xa_is_value(page)) - put_page(page); + if (folio && !xa_is_value(folio)) + folio_put(folio); if (start > end) return end; return start; @@ -2874,21 +2852,20 @@ unlock: #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) /* - * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock + * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock * @vmf - the vm_fault for this fault. - * @page - the page to lock. + * @folio - the folio to lock. * @fpin - the pointer to the file we may pin (or is already pinned). * - * This works similar to lock_page_or_retry in that it can drop the mmap_lock. - * It differs in that it actually returns the page locked if it returns 1 and 0 - * if it couldn't lock the page. If we did have to drop the mmap_lock then fpin - * will point to the pinned file and needs to be fput()'ed at a later point. + * This works similar to lock_folio_or_retry in that it can drop the + * mmap_lock. It differs in that it actually returns the folio locked + * if it returns 1 and 0 if it couldn't lock the folio. If we did have + * to drop the mmap_lock then fpin will point to the pinned file and + * needs to be fput()'ed at a later point. */ -static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, +static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, struct file **fpin) { - struct folio *folio = page_folio(page); - if (folio_trylock(folio)) return 1; @@ -2977,25 +2954,25 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) * was pinned if we have to drop the mmap_lock in order to do IO. */ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, - struct page *page) + struct folio *folio) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; - struct address_space *mapping = file->f_mapping; + DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff); struct file *fpin = NULL; unsigned int mmap_miss; - pgoff_t offset = vmf->pgoff; /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) return fpin; + mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss) WRITE_ONCE(ra->mmap_miss, --mmap_miss); - if (PageReadahead(page)) { + + if (folio_test_readahead(folio)) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); - page_cache_async_readahead(mapping, ra, file, - page, offset, ra->ra_pages); + page_cache_async_ra(&ractl, folio, ra->ra_pages); } return fpin; } @@ -3014,7 +2991,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, * vma->vm_mm->mmap_lock must be held on entry. * * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock - * may be dropped before doing I/O or by lock_page_maybe_drop_mmap(). + * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap(). * * If our return value does not have VM_FAULT_RETRY set, the mmap_lock * has not been released. @@ -3030,28 +3007,27 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) struct file *fpin = NULL; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - pgoff_t offset = vmf->pgoff; - pgoff_t max_off; - struct page *page; + pgoff_t max_idx, index = vmf->pgoff; + struct folio *folio; vm_fault_t ret = 0; bool mapping_locked = false; - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= max_off)) + max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(index >= max_idx)) return VM_FAULT_SIGBUS; /* * Do we have something in the page cache already? */ - page = find_get_page(mapping, offset); - if (likely(page)) { + folio = filemap_get_folio(mapping, index); + if (likely(folio)) { /* * We found the page, so try async readahead before waiting for * the lock. */ if (!(vmf->flags & FAULT_FLAG_TRIED)) - fpin = do_async_mmap_readahead(vmf, page); - if (unlikely(!PageUptodate(page))) { + fpin = do_async_mmap_readahead(vmf, folio); + if (unlikely(!folio_test_uptodate(folio))) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } @@ -3063,17 +3039,17 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) fpin = do_sync_mmap_readahead(vmf); retry_find: /* - * See comment in filemap_create_page() why we need + * See comment in filemap_create_folio() why we need * invalidate_lock */ if (!mapping_locked) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } - page = pagecache_get_page(mapping, offset, + folio = __filemap_get_folio(mapping, index, FGP_CREAT|FGP_FOR_MMAP, vmf->gfp_mask); - if (!page) { + if (!folio) { if (fpin) goto out_retry; filemap_invalidate_unlock_shared(mapping); @@ -3081,22 +3057,22 @@ retry_find: } } - if (!lock_page_maybe_drop_mmap(vmf, page, &fpin)) + if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin)) goto out_retry; /* Did it get truncated? */ - if (unlikely(compound_head(page)->mapping != mapping)) { - unlock_page(page); - put_page(page); + if (unlikely(folio->mapping != mapping)) { + folio_unlock(folio); + folio_put(folio); goto retry_find; } - VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); + VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. */ - if (unlikely(!PageUptodate(page))) { + if (unlikely(!folio_test_uptodate(folio))) { /* * The page was in cache and uptodate and now it is not. * Strange but possible since we didn't hold the page lock all @@ -3104,8 +3080,8 @@ retry_find: * try again. */ if (!mapping_locked) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto retry_find; } goto page_not_uptodate; @@ -3117,7 +3093,7 @@ retry_find: * redo the fault. */ if (fpin) { - unlock_page(page); + folio_unlock(folio); goto out_retry; } if (mapping_locked) @@ -3127,14 +3103,14 @@ retry_find: * Found the page and have a reference on it. * We must recheck i_size under page lock. */ - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= max_off)) { - unlock_page(page); - put_page(page); + max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(index >= max_idx)) { + folio_unlock(folio); + folio_put(folio); return VM_FAULT_SIGBUS; } - vmf->page = page; + vmf->page = folio_file_page(folio, index); return ret | VM_FAULT_LOCKED; page_not_uptodate: @@ -3145,10 +3121,10 @@ page_not_uptodate: * and we need to check for errors. */ fpin = maybe_unlock_mmap_for_io(vmf, fpin); - error = filemap_read_page(file, mapping, page); + error = filemap_read_folio(file, mapping, folio); if (fpin) goto out_retry; - put_page(page); + folio_put(folio); if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; @@ -3162,8 +3138,8 @@ out_retry: * re-find the vma and come back and find our hopefully still populated * page. */ - if (page) - put_page(page); + if (folio) + folio_put(folio); if (mapping_locked) filemap_invalidate_unlock_shared(mapping); if (fpin) @@ -3205,48 +3181,48 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) return false; } -static struct page *next_uptodate_page(struct page *page, +static struct folio *next_uptodate_page(struct folio *folio, struct address_space *mapping, struct xa_state *xas, pgoff_t end_pgoff) { unsigned long max_idx; do { - if (!page) + if (!folio) return NULL; - if (xas_retry(xas, page)) + if (xas_retry(xas, folio)) continue; - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; - if (PageLocked(page)) + if (folio_test_locked(folio)) continue; - if (!page_cache_get_speculative(page)) + if (!folio_try_get_rcu(folio)) continue; /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(xas))) + if (unlikely(folio != xas_reload(xas))) goto skip; - if (!PageUptodate(page) || PageReadahead(page)) + if (!folio_test_uptodate(folio) || folio_test_readahead(folio)) goto skip; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto skip; - if (page->mapping != mapping) + if (folio->mapping != mapping) goto unlock; - if (!PageUptodate(page)) + if (!folio_test_uptodate(folio)) goto unlock; max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (xas->xa_index >= max_idx) goto unlock; - return page; + return folio; unlock: - unlock_page(page); + folio_unlock(folio); skip: - put_page(page); - } while ((page = xas_next_entry(xas, end_pgoff)) != NULL); + folio_put(folio); + } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL); return NULL; } -static inline struct page *first_map_page(struct address_space *mapping, +static inline struct folio *first_map_page(struct address_space *mapping, struct xa_state *xas, pgoff_t end_pgoff) { @@ -3254,7 +3230,7 @@ static inline struct page *first_map_page(struct address_space *mapping, mapping, xas, end_pgoff); } -static inline struct page *next_map_page(struct address_space *mapping, +static inline struct folio *next_map_page(struct address_space *mapping, struct xa_state *xas, pgoff_t end_pgoff) { @@ -3271,16 +3247,17 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t last_pgoff = start_pgoff; unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); - struct page *head, *page; + struct folio *folio; + struct page *page; unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); vm_fault_t ret = 0; rcu_read_lock(); - head = first_map_page(mapping, &xas, end_pgoff); - if (!head) + folio = first_map_page(mapping, &xas, end_pgoff); + if (!folio) goto out; - if (filemap_map_pmd(vmf, head)) { + if (filemap_map_pmd(vmf, &folio->page)) { ret = VM_FAULT_NOPAGE; goto out; } @@ -3288,7 +3265,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); do { - page = find_subpage(head, xas.xa_index); +again: + page = folio_file_page(folio, xas.xa_index); if (PageHWPoison(page)) goto unlock; @@ -3309,12 +3287,21 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, do_set_pte(vmf, page, addr); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, addr, vmf->pte); - unlock_page(head); + if (folio_more_pages(folio, xas.xa_index, end_pgoff)) { + xas.xa_index++; + folio_ref_inc(folio); + goto again; + } + folio_unlock(folio); continue; unlock: - unlock_page(head); - put_page(head); - } while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL); + if (folio_more_pages(folio, xas.xa_index, end_pgoff)) { + xas.xa_index++; + goto again; + } + folio_unlock(folio); + folio_put(folio); + } while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL); pte_unmap_unlock(vmf->pte, vmf->ptl); out: rcu_read_unlock(); @@ -3326,24 +3313,24 @@ EXPORT_SYMBOL(filemap_map_pages); vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); vm_fault_t ret = VM_FAULT_LOCKED; sb_start_pagefault(mapping->host->i_sb); file_update_time(vmf->vma->vm_file); - lock_page(page); - if (page->mapping != mapping) { - unlock_page(page); + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } /* - * We mark the page dirty already here so that when freeze is in + * We mark the folio dirty already here so that when freeze is in * progress, we are guaranteed that writeback during freezing will - * see the dirty page and writeprotect it again. + * see the dirty folio and writeprotect it again. */ - set_page_dirty(page); - wait_for_stable_page(page); + folio_mark_dirty(folio); + folio_wait_stable(folio); out: sb_end_pagefault(mapping->host->i_sb); return ret; @@ -3396,35 +3383,20 @@ EXPORT_SYMBOL(filemap_page_mkwrite); EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); -static struct page *wait_on_page_read(struct page *page) +static struct folio *do_read_cache_folio(struct address_space *mapping, + pgoff_t index, filler_t filler, void *data, gfp_t gfp) { - if (!IS_ERR(page)) { - wait_on_page_locked(page); - if (!PageUptodate(page)) { - put_page(page); - page = ERR_PTR(-EIO); - } - } - return page; -} - -static struct page *do_read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data, - gfp_t gfp) -{ - struct page *page; + struct folio *folio; int err; repeat: - page = find_get_page(mapping, index); - if (!page) { - page = __page_cache_alloc(gfp); - if (!page) + folio = filemap_get_folio(mapping, index); + if (!folio) { + folio = filemap_alloc_folio(gfp, 0); + if (!folio) return ERR_PTR(-ENOMEM); - err = add_to_page_cache_lru(page, mapping, index, gfp); + err = filemap_add_folio(mapping, folio, index, gfp); if (unlikely(err)) { - put_page(page); + folio_put(folio); if (err == -EEXIST) goto repeat; /* Presumably ENOMEM for xarray node */ @@ -3433,71 +3405,41 @@ repeat: filler: if (filler) - err = filler(data, page); + err = filler(data, &folio->page); else - err = mapping->a_ops->readpage(data, page); + err = mapping->a_ops->readpage(data, &folio->page); if (err < 0) { - put_page(page); + folio_put(folio); return ERR_PTR(err); } - page = wait_on_page_read(page); - if (IS_ERR(page)) - return page; + folio_wait_locked(folio); + if (!folio_test_uptodate(folio)) { + folio_put(folio); + return ERR_PTR(-EIO); + } + goto out; } - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) goto out; - /* - * Page is not up to date and may be locked due to one of the following - * case a: Page is being filled and the page lock is held - * case b: Read/write error clearing the page uptodate status - * case c: Truncation in progress (page locked) - * case d: Reclaim in progress - * - * Case a, the page will be up to date when the page is unlocked. - * There is no need to serialise on the page lock here as the page - * is pinned so the lock gives no additional protection. Even if the - * page is truncated, the data is still valid if PageUptodate as - * it's a race vs truncate race. - * Case b, the page will not be up to date - * Case c, the page may be truncated but in itself, the data may still - * be valid after IO completes as it's a read vs truncate race. The - * operation must restart if the page is not uptodate on unlock but - * otherwise serialising on page lock to stabilise the mapping gives - * no additional guarantees to the caller as the page lock is - * released before return. - * Case d, similar to truncation. If reclaim holds the page lock, it - * will be a race with remove_mapping that determines if the mapping - * is valid on unlock but otherwise the data is valid and there is - * no need to serialise with page lock. - * - * As the page lock gives no additional guarantee, we optimistically - * wait on the page to be unlocked and check if it's up to date and - * use the page if it is. Otherwise, the page lock is required to - * distinguish between the different cases. The motivation is that we - * avoid spurious serialisations and wakeups when multiple processes - * wait on the same page for IO to complete. - */ - wait_on_page_locked(page); - if (PageUptodate(page)) - goto out; - - /* Distinguish between all the cases under the safety of the lock */ - lock_page(page); + if (!folio_trylock(folio)) { + folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); + goto repeat; + } - /* Case c or d, restart the operation */ - if (!page->mapping) { - unlock_page(page); - put_page(page); + /* Folio was truncated from mapping */ + if (!folio->mapping) { + folio_unlock(folio); + folio_put(folio); goto repeat; } /* Someone else locked and filled the page in a very small window */ - if (PageUptodate(page)) { - unlock_page(page); + if (folio_test_uptodate(folio)) { + folio_unlock(folio); goto out; } @@ -3507,16 +3449,16 @@ filler: * Clear page error before actual read, PG_error will be * set again if read page fails. */ - ClearPageError(page); + folio_clear_error(folio); goto filler; out: - mark_page_accessed(page); - return page; + folio_mark_accessed(folio); + return folio; } /** - * read_cache_page - read into page cache, fill it if needed + * read_cache_folio - read into page cache, fill it if needed * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read @@ -3531,10 +3473,27 @@ out: * * Return: up to date page on success, ERR_PTR() on failure. */ +struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index, + filler_t filler, void *data) +{ + return do_read_cache_folio(mapping, index, filler, data, + mapping_gfp_mask(mapping)); +} +EXPORT_SYMBOL(read_cache_folio); + +static struct page *do_read_cache_page(struct address_space *mapping, + pgoff_t index, filler_t *filler, void *data, gfp_t gfp) +{ + struct folio *folio; + + folio = do_read_cache_folio(mapping, index, filler, data, gfp); + if (IS_ERR(folio)) + return &folio->page; + return folio_file_page(folio, index); +} + struct page *read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data) + pgoff_t index, filler_t *filler, void *data) { return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); @@ -3894,33 +3853,32 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) EXPORT_SYMBOL(generic_file_write_iter); /** - * try_to_release_page() - release old fs-specific metadata on a page - * - * @page: the page which the kernel is trying to free - * @gfp_mask: memory allocation flags (and I/O mode) + * filemap_release_folio() - Release fs-specific metadata on a folio. + * @folio: The folio which the kernel is trying to free. + * @gfp: Memory allocation flags (and I/O mode). * - * The address_space is to try to release any data against the page - * (presumably at page->private). + * The address_space is trying to release any data attached to a folio + * (presumably at folio->private). * - * This may also be called if PG_fscache is set on a page, indicating that the - * page is known to the local caching routines. + * This will also be called if the private_2 flag is set on a page, + * indicating that the folio has other metadata associated with it. * - * The @gfp_mask argument specifies whether I/O may be performed to release - * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). + * The @gfp argument specifies whether I/O may be performed to release + * this page (__GFP_IO), and whether the call may block + * (__GFP_RECLAIM & __GFP_FS). * - * Return: %1 if the release was successful, otherwise return zero. + * Return: %true if the release was successful, otherwise %false. */ -int try_to_release_page(struct page *page, gfp_t gfp_mask) +bool filemap_release_folio(struct folio *folio, gfp_t gfp) { - struct address_space * const mapping = page->mapping; + struct address_space * const mapping = folio->mapping; - BUG_ON(!PageLocked(page)); - if (PageWriteback(page)) - return 0; + BUG_ON(!folio_test_locked(folio)); + if (folio_test_writeback(folio)) + return false; if (mapping && mapping->a_ops->releasepage) - return mapping->a_ops->releasepage(page, gfp_mask); - return try_to_free_buffers(page); + return mapping->a_ops->releasepage(&folio->page, gfp); + return try_to_free_buffers(&folio->page); } - -EXPORT_SYMBOL(try_to_release_page); +EXPORT_SYMBOL(filemap_release_folio); |