summaryrefslogtreecommitdiffstats
path: root/mm/madvise.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/madvise.c')
-rw-r--r--mm/madvise.c228
1 files changed, 128 insertions, 100 deletions
diff --git a/mm/madvise.c b/mm/madvise.c
index 1a073fcc4c0c..c8ba3f3eb54d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -321,6 +321,18 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma)
file_permission(vma->vm_file, MAY_WRITE) == 0;
}
+static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
+ struct folio *folio, pte_t *ptep,
+ pte_t pte, bool *any_young,
+ bool *any_dirty)
+{
+ const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+ int max_nr = (end - addr) / PAGE_SIZE;
+
+ return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
+ any_young, any_dirty);
+}
+
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -336,6 +348,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
LIST_HEAD(folio_list);
bool pageout_anon_only_filter;
unsigned int batch_count = 0;
+ int nr;
if (fatal_signal_pending(current))
return -EINTR;
@@ -363,10 +376,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
goto huge_unlock;
}
- folio = pfn_folio(pmd_pfn(orig_pmd));
+ folio = pmd_folio(orig_pmd);
/* Do not interfere with other mappings of this folio */
- if (folio_estimated_sharers(folio) != 1)
+ if (folio_likely_mapped_shared(folio))
goto huge_unlock;
if (pageout_anon_only_filter && !folio_test_anon(folio))
@@ -410,7 +423,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
huge_unlock:
spin_unlock(ptl);
if (pageout)
- reclaim_pages(&folio_list, true);
+ reclaim_pages(&folio_list);
return 0;
}
@@ -423,7 +436,8 @@ restart:
return 0;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
- for (; addr < end; pte++, addr += PAGE_SIZE) {
+ for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
+ nr = 1;
ptent = ptep_get(pte);
if (++batch_count == SWAP_CLUSTER_MAX) {
@@ -447,55 +461,64 @@ restart:
continue;
/*
- * Creating a THP page is expensive so split it only if we
- * are sure it's worth. Split it if we are only owner.
+ * If we encounter a large folio, only split it if it is not
+ * fully mapped within the range we are operating on. Otherwise
+ * leave it as is so that it can be swapped out whole. If we
+ * fail to split a folio, leave it in place and advance to the
+ * next pte in the range.
*/
if (folio_test_large(folio)) {
- int err;
-
- if (folio_estimated_sharers(folio) > 1)
- break;
- if (pageout_anon_only_filter && !folio_test_anon(folio))
- break;
- if (!folio_trylock(folio))
- break;
- folio_get(folio);
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(start_pte, ptl);
- start_pte = NULL;
- err = split_folio(folio);
- folio_unlock(folio);
- folio_put(folio);
- if (err)
- break;
- start_pte = pte =
- pte_offset_map_lock(mm, pmd, addr, &ptl);
- if (!start_pte)
- break;
- arch_enter_lazy_mmu_mode();
- pte--;
- addr -= PAGE_SIZE;
- continue;
+ bool any_young;
+
+ nr = madvise_folio_pte_batch(addr, end, folio, pte,
+ ptent, &any_young, NULL);
+ if (any_young)
+ ptent = pte_mkyoung(ptent);
+
+ if (nr < folio_nr_pages(folio)) {
+ int err;
+
+ if (folio_likely_mapped_shared(folio))
+ continue;
+ if (pageout_anon_only_filter && !folio_test_anon(folio))
+ continue;
+ if (!folio_trylock(folio))
+ continue;
+ folio_get(folio);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ start_pte = NULL;
+ err = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ start_pte = pte =
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!start_pte)
+ break;
+ arch_enter_lazy_mmu_mode();
+ if (!err)
+ nr = 0;
+ continue;
+ }
}
/*
* Do not interfere with other mappings of this folio and
- * non-LRU folio.
+ * non-LRU folio. If we have a large folio at this point, we
+ * know it is fully mapped so if its mapcount is the same as its
+ * number of pages, it must be exclusive.
*/
- if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
+ if (!folio_test_lru(folio) ||
+ folio_mapcount(folio) != folio_nr_pages(folio))
continue;
if (pageout_anon_only_filter && !folio_test_anon(folio))
continue;
- VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-
if (!pageout && pte_young(ptent)) {
- ptent = ptep_get_and_clear_full(mm, addr, pte,
- tlb->fullmm);
- ptent = pte_mkold(ptent);
- set_pte_at(mm, addr, pte, ptent);
- tlb_remove_tlb_entry(tlb, pte, addr);
+ clear_young_dirty_ptes(vma, addr, pte, nr,
+ CYDP_CLEAR_YOUNG);
+ tlb_remove_tlb_entries(tlb, pte, nr, addr);
}
/*
@@ -524,7 +547,7 @@ restart:
pte_unmap_unlock(start_pte, ptl);
}
if (pageout)
- reclaim_pages(&folio_list, true);
+ reclaim_pages(&folio_list);
cond_resched();
return 0;
@@ -620,6 +643,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
+ const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
struct mmu_gather *tlb = walk->private;
struct mm_struct *mm = tlb->mm;
struct vm_area_struct *vma = walk->vma;
@@ -628,6 +652,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
struct folio *folio;
int nr_swap = 0;
unsigned long next;
+ int nr, max_nr;
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd))
@@ -640,7 +665,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
- for (; addr != end; pte++, addr += PAGE_SIZE) {
+ for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
+ nr = 1;
ptent = ptep_get(pte);
if (pte_none(ptent))
@@ -655,9 +681,11 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
entry = pte_to_swp_entry(ptent);
if (!non_swap_entry(entry)) {
- nr_swap--;
- free_swap_and_cache(entry);
- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ max_nr = (end - addr) / PAGE_SIZE;
+ nr = swap_pte_batch(pte, max_nr, ptent);
+ nr_swap -= nr;
+ free_swap_and_cache_nr(entry, nr);
+ clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
} else if (is_hwpoison_entry(entry) ||
is_poisoned_swp_entry(entry)) {
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
@@ -670,44 +698,57 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
continue;
/*
- * If pmd isn't transhuge but the folio is large and
- * is owned by only this process, split it and
- * deactivate all pages.
+ * If we encounter a large folio, only split it if it is not
+ * fully mapped within the range we are operating on. Otherwise
+ * leave it as is so that it can be marked as lazyfree. If we
+ * fail to split a folio, leave it in place and advance to the
+ * next pte in the range.
*/
if (folio_test_large(folio)) {
- int err;
+ bool any_young, any_dirty;
- if (folio_estimated_sharers(folio) != 1)
- break;
- if (!folio_trylock(folio))
- break;
- folio_get(folio);
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(start_pte, ptl);
- start_pte = NULL;
- err = split_folio(folio);
- folio_unlock(folio);
- folio_put(folio);
- if (err)
- break;
- start_pte = pte =
- pte_offset_map_lock(mm, pmd, addr, &ptl);
- if (!start_pte)
- break;
- arch_enter_lazy_mmu_mode();
- pte--;
- addr -= PAGE_SIZE;
- continue;
+ nr = madvise_folio_pte_batch(addr, end, folio, pte,
+ ptent, &any_young, &any_dirty);
+
+ if (nr < folio_nr_pages(folio)) {
+ int err;
+
+ if (folio_likely_mapped_shared(folio))
+ continue;
+ if (!folio_trylock(folio))
+ continue;
+ folio_get(folio);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ start_pte = NULL;
+ err = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ start_pte = pte;
+ if (!start_pte)
+ break;
+ arch_enter_lazy_mmu_mode();
+ if (!err)
+ nr = 0;
+ continue;
+ }
+
+ if (any_young)
+ ptent = pte_mkyoung(ptent);
+ if (any_dirty)
+ ptent = pte_mkdirty(ptent);
}
if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
if (!folio_trylock(folio))
continue;
/*
- * If folio is shared with others, we mustn't clear
- * the folio's dirty flag.
+ * If we have a large folio at this point, we know it is
+ * fully mapped so if its mapcount is the same as its
+ * number of pages, it must be exclusive.
*/
- if (folio_mapcount(folio) != 1) {
+ if (folio_mapcount(folio) != folio_nr_pages(folio)) {
folio_unlock(folio);
continue;
}
@@ -723,19 +764,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
}
if (pte_young(ptent) || pte_dirty(ptent)) {
- /*
- * Some of architecture(ex, PPC) don't update TLB
- * with set_pte_at and tlb_remove_tlb_entry so for
- * the portability, remap the pte with old|clean
- * after pte clearing.
- */
- ptent = ptep_get_and_clear_full(mm, addr, pte,
- tlb->fullmm);
-
- ptent = pte_mkold(ptent);
- ptent = pte_mkclean(ptent);
- set_pte_at(mm, addr, pte, ptent);
- tlb_remove_tlb_entry(tlb, pte, addr);
+ clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags);
+ tlb_remove_tlb_entries(tlb, pte, nr, addr);
}
folio_mark_lazyfree(folio);
}
@@ -901,26 +931,19 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
return -EINVAL;
}
-static long madvise_populate(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end,
- int behavior)
+static long madvise_populate(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int behavior)
{
const bool write = behavior == MADV_POPULATE_WRITE;
- struct mm_struct *mm = vma->vm_mm;
int locked = 1;
long pages;
- *prev = vma;
-
while (start < end) {
/* Populate (prefault) page tables readable/writable. */
pages = faultin_page_range(mm, start, end, write, &locked);
if (!locked) {
mmap_read_lock(mm);
locked = 1;
- *prev = NULL;
- vma = NULL;
}
if (pages < 0) {
switch (pages) {
@@ -1021,9 +1044,6 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
case MADV_DONTNEED:
case MADV_DONTNEED_LOCKED:
return madvise_dontneed_free(vma, prev, start, end, behavior);
- case MADV_POPULATE_READ:
- case MADV_POPULATE_WRITE:
- return madvise_populate(vma, prev, start, end, behavior);
case MADV_NORMAL:
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
break;
@@ -1425,8 +1445,16 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
end = start + len;
blk_start_plug(&plug);
- error = madvise_walk_vmas(mm, start, end, behavior,
- madvise_vma_behavior);
+ switch (behavior) {
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
+ error = madvise_populate(mm, start, end, behavior);
+ break;
+ default:
+ error = madvise_walk_vmas(mm, start, end, behavior,
+ madvise_vma_behavior);
+ break;
+ }
blk_finish_plug(&plug);
if (write)
mmap_write_unlock(mm);