diff options
-rw-r--r-- | fs/userfaultfd.c | 11 | ||||
-rw-r--r-- | mm/filemap.c | 5 | ||||
-rw-r--r-- | mm/hmm.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 8 | ||||
-rw-r--r-- | mm/memory.c | 23 | ||||
-rw-r--r-- | mm/mincore.c | 3 | ||||
-rw-r--r-- | mm/mprotect.c | 3 |
7 files changed, 47 insertions, 8 deletions
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index aa0c47cb0d16..78b68e0f9774 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -29,6 +29,7 @@ #include <linux/ioctl.h> #include <linux/security.h> #include <linux/hugetlb.h> +#include <linux/swapops.h> int sysctl_unprivileged_userfaultfd __read_mostly; @@ -249,9 +250,10 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, /* * Lockless access: we're in a wait_event so it's ok if it - * changes under us. + * changes under us. PTE markers should be handled the same as none + * ptes here. */ - if (huge_pte_none(pte)) + if (huge_pte_none_mostly(pte)) ret = true; if (!huge_pte_write(pte) && (reason & VM_UFFD_WP)) ret = true; @@ -330,9 +332,10 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pte = pte_offset_map(pmd, address); /* * Lockless access: we're in a wait_event so it's ok if it - * changes under us. + * changes under us. PTE markers should be handled the same as none + * ptes here. */ - if (pte_none(*pte)) + if (pte_none_mostly(*pte)) ret = true; if (!pte_write(*pte) && (reason & VM_UFFD_WP)) ret = true; diff --git a/mm/filemap.c b/mm/filemap.c index 9a1eef6c5d35..de34c6c10384 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3376,6 +3376,11 @@ again: vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; + /* + * NOTE: If there're PTE markers, we'll leave them to be + * handled in the specific fault path, and it'll prohibit the + * fault-around logic. + */ if (!pte_none(*vmf->pte)) goto unlock; @@ -239,7 +239,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, pte_t pte = *ptep; uint64_t pfn_req_flags = *hmm_pfn; - if (pte_none(pte)) { + if (pte_none_mostly(pte)) { required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (required_fault) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 235c85e17c2d..6e74ca98c862 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5640,10 +5640,14 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, if (pte_present(ptent)) page = mc_handle_present_pte(vma, addr, ptent); + else if (pte_none_mostly(ptent)) + /* + * PTE markers should be treated as a none pte here, separated + * from other swap handling below. + */ + page = mc_handle_file_pte(vma, addr, ptent); else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, ptent, &ent); - else if (pte_none(ptent)) - page = mc_handle_file_pte(vma, addr, ptent); if (!page && !ent.val) return ret; diff --git a/mm/memory.c b/mm/memory.c index 90212057a546..9743c8b74bf2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -100,6 +100,8 @@ struct page *mem_map; EXPORT_SYMBOL(mem_map); #endif +static vm_fault_t do_fault(struct vm_fault *vmf); + /* * A number of key systems in x86 including ioremap() rely on the assumption * that high_memory defines the upper bound on direct map memory, then end @@ -1415,6 +1417,8 @@ again: if (!should_zap_page(details, page)) continue; rss[mm_counter(page)]--; + } else if (is_pte_marker_entry(entry)) { + /* By default, simply drop all pte markers when zap */ } else if (is_hwpoison_entry(entry)) { if (!should_zap_cows(details)) continue; @@ -3555,6 +3559,23 @@ static inline bool should_try_to_free_swap(struct page *page, page_count(page) == 2; } +static vm_fault_t handle_pte_marker(struct vm_fault *vmf) +{ + swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte); + unsigned long marker = pte_marker_get(entry); + + /* + * PTE markers should always be with file-backed memories, and the + * marker should never be empty. If anything weird happened, the best + * thing to do is to kill the process along with its mm. + */ + if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker)) + return VM_FAULT_SIGBUS; + + /* TODO: handle pte markers */ + return 0; +} + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -3592,6 +3613,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; + } else if (is_pte_marker_entry(entry)) { + ret = handle_pte_marker(vmf); } else { print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); ret = VM_FAULT_SIGBUS; diff --git a/mm/mincore.c b/mm/mincore.c index f4f627325e12..fa200c14185f 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -122,7 +122,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, for (; addr != end; ptep++, addr += PAGE_SIZE) { pte_t pte = *ptep; - if (pte_none(pte)) + /* We need to do cache lookup too for pte markers */ + if (pte_none_mostly(pte)) __mincore_unmapped_range(addr, addr + PAGE_SIZE, vma, vec); else if (pte_present(pte)) diff --git a/mm/mprotect.c b/mm/mprotect.c index 20a46f21cca8..e84694267b0f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -193,6 +193,9 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, newpte = pte_swp_mksoft_dirty(newpte); if (pte_swp_uffd_wp(oldpte)) newpte = pte_swp_mkuffd_wp(newpte); + } else if (is_pte_marker_entry(entry)) { + /* Skip it, the same as none pte */ + continue; } else { newpte = oldpte; } |