diff options
Diffstat (limited to 'mm/internal.h')
-rw-r--r-- | mm/internal.h | 205 |
1 files changed, 156 insertions, 49 deletions
diff --git a/mm/internal.h b/mm/internal.h index 07ad2675a88b..2adabe369403 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -11,6 +11,8 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/swapops.h> #include <linux/tracepoint-defs.h> struct folio_batch; @@ -70,13 +72,30 @@ void page_writeback_init(void); /* * How many individual pages have an elevated _mapcount. Excludes * the folio's entire_mapcount. + * + * Don't use this function outside of debugging code. */ -static inline int folio_nr_pages_mapped(struct folio *folio) +static inline int folio_nr_pages_mapped(const struct folio *folio) { return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; } -static inline void *folio_raw_mapping(struct folio *folio) +/* + * Retrieve the first entry of a folio based on a provided entry within the + * folio. We cannot rely on folio->swap as there is no guarantee that it has + * been initialized. Used for calling arch_swap_restore() + */ +static inline swp_entry_t folio_swap(swp_entry_t entry, + const struct folio *folio) +{ + swp_entry_t swap = { + .val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)), + }; + + return swap; +} + +static inline void *folio_raw_mapping(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; @@ -113,6 +132,10 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) * @flags: Flags to modify the PTE batch semantics. * @any_writable: Optional pointer to indicate whether any entry except the * first one is writable. + * @any_young: Optional pointer to indicate whether any entry except the + * first one is young. + * @any_dirty: Optional pointer to indicate whether any entry except the + * first one is dirty. * * Detect a PTE batch: consecutive (present) PTEs that map consecutive * pages of the same large folio. @@ -128,16 +151,20 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) */ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, - bool *any_writable) + bool *any_writable, bool *any_young, bool *any_dirty) { unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); const pte_t *end_ptep = start_ptep + max_nr; pte_t expected_pte, *ptep; - bool writable; + bool writable, young, dirty; int nr; if (any_writable) *any_writable = false; + if (any_young) + *any_young = false; + if (any_dirty) + *any_dirty = false; VM_WARN_ON_FOLIO(!pte_present(pte), folio); VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); @@ -151,6 +178,10 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, pte = ptep_get(ptep); if (any_writable) writable = !!pte_write(pte); + if (any_young) + young = !!pte_young(pte); + if (any_dirty) + dirty = !!pte_dirty(pte); pte = __pte_batch_clear_ignored(pte, flags); if (!pte_same(pte, expected_pte)) @@ -166,6 +197,10 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, if (any_writable) *any_writable |= writable; + if (any_young) + *any_young |= young; + if (any_dirty) + *any_dirty |= dirty; nr = pte_batch_hint(ptep, pte); expected_pte = pte_advance_pfn(expected_pte, nr); @@ -174,6 +209,68 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, return min(ptep - start_ptep, max_nr); } + +/** + * pte_next_swp_offset - Increment the swap entry offset field of a swap pte. + * @pte: The initial pte state; is_swap_pte(pte) must be true and + * non_swap_entry() must be false. + * + * Increments the swap offset, while maintaining all other fields, including + * swap type, and any swp pte bits. The resulting pte is returned. + */ +static inline pte_t pte_next_swp_offset(pte_t pte) +{ + swp_entry_t entry = pte_to_swp_entry(pte); + pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry), + (swp_offset(entry) + 1))); + + if (pte_swp_soft_dirty(pte)) + new = pte_swp_mksoft_dirty(new); + if (pte_swp_exclusive(pte)) + new = pte_swp_mkexclusive(new); + if (pte_swp_uffd_wp(pte)) + new = pte_swp_mkuffd_wp(new); + + return new; +} + +/** + * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries + * @start_ptep: Page table pointer for the first entry. + * @max_nr: The maximum number of table entries to consider. + * @pte: Page table entry for the first entry. + * + * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs + * containing swap entries all with consecutive offsets and targeting the same + * swap type, all with matching swp pte bits. + * + * max_nr must be at least one and must be limited by the caller so scanning + * cannot exceed a single page table. + * + * Return: the number of table entries in the batch. + */ +static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) +{ + pte_t expected_pte = pte_next_swp_offset(pte); + const pte_t *end_ptep = start_ptep + max_nr; + pte_t *ptep = start_ptep + 1; + + VM_WARN_ON(max_nr < 1); + VM_WARN_ON(!is_swap_pte(pte)); + VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte))); + + while (ptep < end_ptep) { + pte = ptep_get(ptep); + + if (!pte_same(pte, expected_pte)) + break; + + expected_pte = pte_next_swp_offset(expected_pte); + ptep++; + } + + return ptep - start_ptep; +} #endif /* CONFIG_MMU */ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, @@ -491,6 +588,7 @@ extern void __putback_isolated_page(struct page *page, unsigned int order, extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order); +extern void kernel_init_pages(struct page *page, int numpages); /* * This will have no effect, other than possibly generating a warning, if the @@ -513,7 +611,8 @@ static inline struct folio *page_rmappable_folio(struct page *page) { struct folio *folio = (struct folio *)page; - folio_prep_large_rmappable(folio); + if (folio && folio_test_large(folio)) + folio_set_large_rmappable(folio); return folio; } @@ -522,9 +621,12 @@ static inline void prep_compound_head(struct page *page, unsigned int order) struct folio *folio = (struct folio *)page; folio_set_order(folio, order); + atomic_set(&folio->_large_mapcount, -1); atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); + if (order > 1) + INIT_LIST_HEAD(&folio->_deferred_list); } static inline void prep_compound_tail(struct page *head, int tail_idx) @@ -559,10 +661,6 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int); - -int split_free_page(struct page *free_page, - unsigned int order, unsigned long split_pfn_offset); - #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* @@ -789,13 +887,17 @@ void mlock_drain_remote(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); -/* - * Return the start of user virtual address at the specific offset within - * a vma. +/** + * vma_address - Find the virtual address a page range is mapped at + * @vma: The vma which maps this object. + * @pgoff: The page offset within its object. + * @nr_pages: The number of pages to consider. + * + * If any page in this range is mapped by this VMA, return the first address + * where any of these pages appear. Otherwise, return -EFAULT. */ -static inline unsigned long -vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages, - struct vm_area_struct *vma) +static inline unsigned long vma_address(struct vm_area_struct *vma, + pgoff_t pgoff, unsigned long nr_pages) { unsigned long address; @@ -815,18 +917,6 @@ vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages, } /* - * Return the start of user virtual address of a page within a vma. - * Returns -EFAULT if all of the page is outside the range of vma. - * If page is a compound head, the entire compound page is considered. - */ -static inline unsigned long -vma_address(struct page *page, struct vm_area_struct *vma) -{ - VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ - return vma_pgoff_address(page_to_pgoff(page), compound_nr(page), vma); -} - -/* * Then at what user virtual address will none of the range be found in vma? * Assumes that vma_address() already returned a good starting address. */ @@ -947,6 +1037,7 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask) /* * mm/memory-failure.c */ +void shake_folio(struct folio *folio); extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; @@ -961,7 +1052,7 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); -unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references); +unsigned long reclaim_pages(struct list_head *folio_list); unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list); /* The ALLOC_WMARK bits are used as an index to zone->watermark */ @@ -1040,17 +1131,13 @@ static inline bool is_migrate_highatomic(enum migratetype migratetype) return migratetype == MIGRATE_HIGHATOMIC; } -static inline bool is_migrate_highatomic_page(struct page *page) -{ - return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; -} - void setup_zone_pageset(struct zone *zone); struct migration_target_control { int nid; /* preferred node id */ nodemask_t *nmask; gfp_t gfp_mask; + enum migrate_reason reason; }; /* @@ -1087,10 +1174,10 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); void __vunmap_range_noflush(unsigned long start, unsigned long end); -int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, +int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf, unsigned long addr, int page_nid, int *flags); -void free_zone_device_page(struct page *page); +void free_zone_device_folio(struct folio *folio); int migrate_device_coherent_page(struct page *page); /* @@ -1102,9 +1189,10 @@ int __must_check try_grab_page(struct page *page, unsigned int flags); /* * mm/huge_memory.c */ -struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmd, - unsigned int flags); +void touch_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, bool write); +void touch_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, bool write); /* * mm/mmap.c @@ -1189,20 +1277,10 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma, } /* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */ - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_rmb(); /* - * During GUP-fast we might not get called on the head page for a - * hugetlb page that is mapped using cont-PTE, because GUP-fast does - * not work with the abstracted hugetlb PTEs that always point at the - * head page. For hugetlb, PageAnonExclusive only applies on the head - * page (as it cannot be partially COW-shared), so lookup the head page. - */ - if (unlikely(!PageHead(page) && PageHuge(page))) - page = compound_head(page); - - /* * Note that PageKsm() pages cannot be exclusive, and consequently, * cannot get pinned. */ @@ -1245,6 +1323,35 @@ static inline void vma_iter_config(struct vma_iterator *vmi, __mas_set_range(&vmi->mas, index, last - 1); } +static inline void vma_iter_reset(struct vma_iterator *vmi) +{ + mas_reset(&vmi->mas); +} + +static inline +struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min) +{ + return mas_prev_range(&vmi->mas, min); +} + +static inline +struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max) +{ + return mas_next_range(&vmi->mas, max); +} + +static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min, + unsigned long max, unsigned long size) +{ + return mas_empty_area(&vmi->mas, min, max - 1, size); +} + +static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min, + unsigned long max, unsigned long size) +{ + return mas_empty_area_rev(&vmi->mas, min, max - 1, size); +} + /* * VMA Iterator functions shared between nommu and mmap */ |