summaryrefslogtreecommitdiffstats
path: root/mm/hugetlb.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--mm/hugetlb.c428
1 files changed, 313 insertions, 115 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 410bbb0aee32..7c468ac1d069 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -370,7 +370,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
}
static inline long
-hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
+hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from,
long to, struct hstate *h, struct hugetlb_cgroup *cg,
long *regions_needed)
{
@@ -379,7 +379,7 @@ hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
if (!regions_needed) {
nrg = get_file_region_entry_from_cache(map, from, to);
record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
- list_add(&nrg->link, rg->link.prev);
+ list_add(&nrg->link, rg);
coalesce_file_region(map, nrg);
} else
*regions_needed += 1;
@@ -402,47 +402,52 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
long add = 0;
struct list_head *head = &resv->regions;
long last_accounted_offset = f;
- struct file_region *rg = NULL, *trg = NULL;
+ struct file_region *iter, *trg = NULL;
+ struct list_head *rg = NULL;
if (regions_needed)
*regions_needed = 0;
/* In this loop, we essentially handle an entry for the range
- * [last_accounted_offset, rg->from), at every iteration, with some
+ * [last_accounted_offset, iter->from), at every iteration, with some
* bounds checking.
*/
- list_for_each_entry_safe(rg, trg, head, link) {
+ list_for_each_entry_safe(iter, trg, head, link) {
/* Skip irrelevant regions that start before our range. */
- if (rg->from < f) {
+ if (iter->from < f) {
/* If this region ends after the last accounted offset,
* then we need to update last_accounted_offset.
*/
- if (rg->to > last_accounted_offset)
- last_accounted_offset = rg->to;
+ if (iter->to > last_accounted_offset)
+ last_accounted_offset = iter->to;
continue;
}
/* When we find a region that starts beyond our range, we've
* finished.
*/
- if (rg->from >= t)
+ if (iter->from >= t) {
+ rg = iter->link.prev;
break;
+ }
- /* Add an entry for last_accounted_offset -> rg->from, and
+ /* Add an entry for last_accounted_offset -> iter->from, and
* update last_accounted_offset.
*/
- if (rg->from > last_accounted_offset)
- add += hugetlb_resv_map_add(resv, rg,
+ if (iter->from > last_accounted_offset)
+ add += hugetlb_resv_map_add(resv, iter->link.prev,
last_accounted_offset,
- rg->from, h, h_cg,
+ iter->from, h, h_cg,
regions_needed);
- last_accounted_offset = rg->to;
+ last_accounted_offset = iter->to;
}
/* Handle the case where our range extends beyond
* last_accounted_offset.
*/
+ if (!rg)
+ rg = head->prev;
if (last_accounted_offset < t)
add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
t, h, h_cg, regions_needed);
@@ -1535,7 +1540,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
- if (alloc_huge_page_vmemmap(h, page)) {
+ if (hugetlb_vmemmap_alloc(h, page)) {
spin_lock_irq(&hugetlb_lock);
/*
* If we cannot allocate vmemmap pages, just refuse to free the
@@ -1612,7 +1617,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
static inline void flush_free_hpage_work(struct hstate *h)
{
- if (free_vmemmap_pages_per_hpage(h))
+ if (hugetlb_optimize_vmemmap_pages(h))
flush_work(&free_hpage_work);
}
@@ -1672,6 +1677,8 @@ void free_huge_page(struct page *page)
VM_BUG_ON_PAGE(page_mapcount(page), page);
hugetlb_set_page_subpool(page, NULL);
+ if (PageAnon(page))
+ __ClearPageAnonExclusive(page);
page->mapping = NULL;
restore_reserve = HPageRestoreReserve(page);
ClearHPageRestoreReserve(page);
@@ -1732,7 +1739,7 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
static void __prep_new_huge_page(struct hstate *h, struct page *page)
{
- free_huge_page_vmemmap(h, page);
+ hugetlb_vmemmap_free(h, page);
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
hugetlb_set_page_subpool(page, NULL);
@@ -2105,7 +2112,7 @@ retry:
* Attempt to allocate vmemmmap here so that we can take
* appropriate action on failure.
*/
- rc = alloc_huge_page_vmemmap(h, head);
+ rc = hugetlb_vmemmap_alloc(h, head);
if (!rc) {
/*
* Move PageHWPoison flag from head page to the raw
@@ -2979,8 +2986,6 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
struct huge_bootmem_page *m = NULL; /* initialize for clang */
int nr_nodes, node;
- if (nid != NUMA_NO_NODE && nid >= nr_online_nodes)
- return 0;
/* do node specific alloc */
if (nid != NUMA_NO_NODE) {
m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
@@ -3088,7 +3093,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
}
/* do node specific alloc */
- for (i = 0; i < nr_online_nodes; i++) {
+ for_each_online_node(i) {
if (h->max_huge_pages_node[i] > 0) {
hugetlb_hstate_alloc_pages_onenode(h, i);
node_specific_alloc = true;
@@ -3420,7 +3425,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
remove_hugetlb_page_for_demote(h, page, false);
spin_unlock_irq(&hugetlb_lock);
- rc = alloc_huge_page_vmemmap(h, page);
+ rc = hugetlb_vmemmap_alloc(h, page);
if (rc) {
/* Allocation of vmemmmap failed, we can not demote page */
spin_lock_irq(&hugetlb_lock);
@@ -4052,7 +4057,7 @@ static int __init hugetlb_init(void)
default_hstate.max_huge_pages =
default_hstate_max_huge_pages;
- for (i = 0; i < nr_online_nodes; i++)
+ for_each_online_node(i)
default_hstate.max_huge_pages_node[i] =
default_hugepages_in_node[i];
}
@@ -4119,6 +4124,20 @@ bool __init __weak hugetlb_node_alloc_supported(void)
{
return true;
}
+
+static void __init hugepages_clear_pages_in_node(void)
+{
+ if (!hugetlb_max_hstate) {
+ default_hstate_max_huge_pages = 0;
+ memset(default_hugepages_in_node, 0,
+ MAX_NUMNODES * sizeof(unsigned int));
+ } else {
+ parsed_hstate->max_huge_pages = 0;
+ memset(parsed_hstate->max_huge_pages_node, 0,
+ MAX_NUMNODES * sizeof(unsigned int));
+ }
+}
+
/*
* hugepages command line processing
* hugepages normally follows a valid hugepagsz or default_hugepagsz
@@ -4138,7 +4157,7 @@ static int __init hugepages_setup(char *s)
if (!parsed_valid_hugepagesz) {
pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
parsed_valid_hugepagesz = true;
- return 0;
+ return 1;
}
/*
@@ -4154,7 +4173,7 @@ static int __init hugepages_setup(char *s)
if (mhp == last_mhp) {
pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
- return 0;
+ return 1;
}
while (*p) {
@@ -4165,11 +4184,11 @@ static int __init hugepages_setup(char *s)
if (p[count] == ':') {
if (!hugetlb_node_alloc_supported()) {
pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
- return 0;
+ return 1;
}
- if (tmp >= nr_online_nodes)
+ if (tmp >= MAX_NUMNODES || !node_online(tmp))
goto invalid;
- node = array_index_nospec(tmp, nr_online_nodes);
+ node = array_index_nospec(tmp, MAX_NUMNODES);
p += count + 1;
/* Parse hugepages */
if (sscanf(p, "%lu%n", &tmp, &count) != 1)
@@ -4206,7 +4225,8 @@ static int __init hugepages_setup(char *s)
invalid:
pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
- return 0;
+ hugepages_clear_pages_in_node();
+ return 1;
}
__setup("hugepages=", hugepages_setup);
@@ -4227,7 +4247,7 @@ static int __init hugepagesz_setup(char *s)
if (!arch_hugetlb_valid_size(size)) {
pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
- return 0;
+ return 1;
}
h = size_to_hstate(size);
@@ -4242,7 +4262,7 @@ static int __init hugepagesz_setup(char *s)
if (!parsed_default_hugepagesz || h != &default_hstate ||
default_hstate.max_huge_pages) {
pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
- return 0;
+ return 1;
}
/*
@@ -4273,14 +4293,14 @@ static int __init default_hugepagesz_setup(char *s)
parsed_valid_hugepagesz = false;
if (parsed_default_hugepagesz) {
pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
- return 0;
+ return 1;
}
size = (unsigned long)memparse(s, NULL);
if (!arch_hugetlb_valid_size(size)) {
pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
- return 0;
+ return 1;
}
hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
@@ -4297,7 +4317,7 @@ static int __init default_hugepagesz_setup(char *s)
*/
if (default_hstate_max_huge_pages) {
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
- for (i = 0; i < nr_online_nodes; i++)
+ for_each_online_node(i)
default_hstate.max_huge_pages_node[i] =
default_hugepages_in_node[i];
if (hstate_is_gigantic(&default_hstate))
@@ -4699,24 +4719,27 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr
}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
- struct vm_area_struct *vma)
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma)
{
pte_t *src_pte, *dst_pte, entry, dst_entry;
struct page *ptepage;
unsigned long addr;
- bool cow = is_cow_mapping(vma->vm_flags);
- struct hstate *h = hstate_vma(vma);
+ bool cow = is_cow_mapping(src_vma->vm_flags);
+ struct hstate *h = hstate_vma(src_vma);
unsigned long sz = huge_page_size(h);
unsigned long npages = pages_per_huge_page(h);
- struct address_space *mapping = vma->vm_file->f_mapping;
+ struct address_space *mapping = src_vma->vm_file->f_mapping;
struct mmu_notifier_range range;
int ret = 0;
if (cow) {
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
- vma->vm_start,
- vma->vm_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src,
+ src_vma->vm_start,
+ src_vma->vm_end);
mmu_notifier_invalidate_range_start(&range);
+ mmap_assert_write_locked(src);
+ raw_write_seqcount_begin(&src->write_protect_seq);
} else {
/*
* For shared mappings i_mmap_rwsem must be held to call
@@ -4727,12 +4750,12 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
i_mmap_lock_read(mapping);
}
- for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
+ for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
src_pte = huge_pte_offset(src, addr, sz);
if (!src_pte)
continue;
- dst_pte = huge_pte_alloc(dst, vma, addr, sz);
+ dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
if (!dst_pte) {
ret = -ENOMEM;
break;
@@ -4767,8 +4790,9 @@ again:
} else if (unlikely(is_hugetlb_entry_migration(entry) ||
is_hugetlb_entry_hwpoisoned(entry))) {
swp_entry_t swp_entry = pte_to_swp_entry(entry);
+ bool uffd_wp = huge_pte_uffd_wp(entry);
- if (is_writable_migration_entry(swp_entry) && cow) {
+ if (!is_readable_migration_entry(swp_entry) && cow) {
/*
* COW mappings require pages in both
* parent and child to be set to read.
@@ -4776,38 +4800,53 @@ again:
swp_entry = make_readable_migration_entry(
swp_offset(swp_entry));
entry = swp_entry_to_pte(swp_entry);
+ if (userfaultfd_wp(src_vma) && uffd_wp)
+ entry = huge_pte_mkuffd_wp(entry);
set_huge_swap_pte_at(src, addr, src_pte,
entry, sz);
}
+ if (!userfaultfd_wp(dst_vma) && uffd_wp)
+ entry = huge_pte_clear_uffd_wp(entry);
set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
+ } else if (unlikely(is_pte_marker(entry))) {
+ /*
+ * We copy the pte marker only if the dst vma has
+ * uffd-wp enabled.
+ */
+ if (userfaultfd_wp(dst_vma))
+ set_huge_pte_at(dst, addr, dst_pte, entry);
} else {
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
/*
- * This is a rare case where we see pinned hugetlb
- * pages while they're prone to COW. We need to do the
- * COW earlier during fork.
+ * Failing to duplicate the anon rmap is a rare case
+ * where we see pinned hugetlb pages while they're
+ * prone to COW. We need to do the COW earlier during
+ * fork.
*
* When pre-allocating the page or copying data, we
* need to be without the pgtable locks since we could
* sleep during the process.
*/
- if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
+ if (!PageAnon(ptepage)) {
+ page_dup_file_rmap(ptepage, true);
+ } else if (page_try_dup_anon_rmap(ptepage, true,
+ src_vma)) {
pte_t src_pte_old = entry;
struct page *new;
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
/* Do not use reserve as it's private owned */
- new = alloc_huge_page(vma, addr, 1);
+ new = alloc_huge_page(dst_vma, addr, 1);
if (IS_ERR(new)) {
put_page(ptepage);
ret = PTR_ERR(new);
break;
}
- copy_user_huge_page(new, ptepage, addr, vma,
+ copy_user_huge_page(new, ptepage, addr, dst_vma,
npages);
put_page(ptepage);
@@ -4817,13 +4856,13 @@ again:
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
if (!pte_same(src_pte_old, entry)) {
- restore_reserve_on_error(h, vma, addr,
+ restore_reserve_on_error(h, dst_vma, addr,
new);
put_page(new);
/* dst_entry won't change as in child */
goto again;
}
- hugetlb_install_page(vma, dst_pte, addr, new);
+ hugetlb_install_page(dst_vma, dst_pte, addr, new);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
continue;
@@ -4841,7 +4880,6 @@ again:
entry = huge_pte_wrprotect(entry);
}
- page_dup_rmap(ptepage, true);
set_huge_pte_at(dst, addr, dst_pte, entry);
hugetlb_count_add(npages, dst);
}
@@ -4849,10 +4887,12 @@ again:
spin_unlock(dst_ptl);
}
- if (cow)
+ if (cow) {
+ raw_write_seqcount_end(&src->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
- else
+ } else {
i_mmap_unlock_read(mapping);
+ }
return ret;
}
@@ -4896,10 +4936,17 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
unsigned long old_addr_copy;
pte_t *src_pte, *dst_pte;
struct mmu_notifier_range range;
+ bool shared_pmd = false;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
old_end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+ /*
+ * In case of shared PMDs, we should cover the maximum possible
+ * range.
+ */
+ flush_cache_range(vma, range.start, range.end);
+
mmu_notifier_invalidate_range_start(&range);
/* Prevent race with file truncation */
i_mmap_lock_write(mapping);
@@ -4916,8 +4963,10 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
*/
old_addr_copy = old_addr;
- if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte))
+ if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) {
+ shared_pmd = true;
continue;
+ }
dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
if (!dst_pte)
@@ -4925,7 +4974,11 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte);
}
- flush_tlb_range(vma, old_end - len, old_end);
+
+ if (shared_pmd)
+ flush_tlb_range(vma, range.start, range.end);
+ else
+ flush_tlb_range(vma, old_end - len, old_end);
mmu_notifier_invalidate_range_end(&range);
i_mmap_unlock_write(mapping);
@@ -4934,7 +4987,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- struct page *ref_page)
+ struct page *ref_page, zap_flags_t zap_flags)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
@@ -4990,7 +5043,18 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
* unmapped and its refcount is dropped, so just clear pte here.
*/
if (unlikely(!pte_present(pte))) {
- huge_pte_clear(mm, address, ptep, sz);
+ /*
+ * If the pte was wr-protected by uffd-wp in any of the
+ * swap forms, meanwhile the caller does not want to
+ * drop the uffd-wp bit in this zap, then replace the
+ * pte with a marker.
+ */
+ if (pte_swp_uffd_wp_any(pte) &&
+ !(zap_flags & ZAP_FLAG_DROP_MARKER))
+ set_huge_pte_at(mm, address, ptep,
+ make_pte_marker(PTE_MARKER_UFFD_WP));
+ else
+ huge_pte_clear(mm, address, ptep, sz);
spin_unlock(ptl);
continue;
}
@@ -5018,7 +5082,11 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
-
+ /* Leave a uffd-wp pte marker if needed */
+ if (huge_pte_uffd_wp(pte) &&
+ !(zap_flags & ZAP_FLAG_DROP_MARKER))
+ set_huge_pte_at(mm, address, ptep,
+ make_pte_marker(PTE_MARKER_UFFD_WP));
hugetlb_count_sub(pages_per_huge_page(h), mm);
page_remove_rmap(page, vma, true);
@@ -5052,9 +5120,10 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
void __unmap_hugepage_range_final(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page)
+ unsigned long end, struct page *ref_page,
+ zap_flags_t zap_flags)
{
- __unmap_hugepage_range(tlb, vma, start, end, ref_page);
+ __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
/*
* Clear this flag so that x86's huge_pmd_share page_table_shareable
@@ -5070,12 +5139,13 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page)
+ unsigned long end, struct page *ref_page,
+ zap_flags_t zap_flags)
{
struct mmu_gather tlb;
tlb_gather_mmu(&tlb, vma->vm_mm);
- __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+ __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
tlb_finish_mmu(&tlb);
}
@@ -5130,21 +5200,22 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
unmap_hugepage_range(iter_vma, address,
- address + huge_page_size(h), page);
+ address + huge_page_size(h), page, 0);
}
i_mmap_unlock_write(mapping);
}
/*
- * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ * hugetlb_wp() should be called with page lock of the original hugepage held.
* Called with hugetlb_fault_mutex_table held and pte_page locked so we
* cannot race with other handlers or page migration.
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
-static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep,
+static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep, unsigned int flags,
struct page *pagecache_page, spinlock_t *ptl)
{
+ const bool unshare = flags & FAULT_FLAG_UNSHARE;
pte_t pte;
struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
@@ -5153,17 +5224,26 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long haddr = address & huge_page_mask(h);
struct mmu_notifier_range range;
+ VM_BUG_ON(unshare && (flags & FOLL_WRITE));
+ VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
+
pte = huge_ptep_get(ptep);
old_page = pte_page(pte);
retry_avoidcopy:
- /* If no-one else is actually using this page, avoid the copy
- * and just make the page writable */
+ /*
+ * If no-one else is actually using this page, we're the exclusive
+ * owner and can reuse this page.
+ */
if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
- page_move_anon_rmap(old_page, vma);
- set_huge_ptep_writable(vma, haddr, ptep);
+ if (!PageAnonExclusive(old_page))
+ page_move_anon_rmap(old_page, vma);
+ if (likely(!unshare))
+ set_huge_ptep_writable(vma, haddr, ptep);
return 0;
}
+ VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page),
+ old_page);
/*
* If the process that created a MAP_PRIVATE mapping is about to
@@ -5262,13 +5342,13 @@ retry_avoidcopy:
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
ClearHPageRestoreReserve(new_page);
- /* Break COW */
+ /* Break COW or unshare */
huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, range.start, range.end);
page_remove_rmap(old_page, vma, true);
hugepage_add_new_anon_rmap(new_page, vma, haddr);
set_huge_pte_at(mm, haddr, ptep,
- make_huge_pte(vma, new_page, 1));
+ make_huge_pte(vma, new_page, !unshare));
SetHPageMigratable(new_page);
/* Make the old page be freed below */
new_page = old_page;
@@ -5276,7 +5356,10 @@ retry_avoidcopy:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
out_release_all:
- /* No restore in case of successful pagetable update (Break COW) */
+ /*
+ * No restore in case of successful pagetable update (Break COW or
+ * unshare)
+ */
if (new_page != old_page)
restore_reserve_on_error(h, vma, haddr, new_page);
put_page(new_page);
@@ -5386,7 +5469,8 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping, pgoff_t idx,
- unsigned long address, pte_t *ptep, unsigned int flags)
+ unsigned long address, pte_t *ptep,
+ pte_t old_pte, unsigned int flags)
{
struct hstate *h = hstate_vma(vma);
vm_fault_t ret = VM_FAULT_SIGBUS;
@@ -5401,7 +5485,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
/*
* Currently, we are forced to kill the process in the event the
* original mapper has unmapped pages from the child due to a failed
- * COW. Warn that such a situation has occurred as it may not be obvious
+ * COW/unsharing. Warn that such a situation has occurred as it may not
+ * be obvious.
*/
if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
@@ -5512,22 +5597,29 @@ retry:
ptl = huge_pte_lock(h, mm, ptep);
ret = 0;
- if (!huge_pte_none(huge_ptep_get(ptep)))
+ /* If pte changed from under us, retry */
+ if (!pte_same(huge_ptep_get(ptep), old_pte))
goto backout;
if (anon_rmap) {
ClearHPageRestoreReserve(page);
hugepage_add_new_anon_rmap(page, vma, haddr);
} else
- page_dup_rmap(page, true);
+ page_dup_file_rmap(page, true);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
+ /*
+ * If this pte was previously wr-protected, keep it wr-protected even
+ * if populated.
+ */
+ if (unlikely(pte_marker_uffd_wp(old_pte)))
+ new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
set_huge_pte_at(mm, haddr, ptep, new_pte);
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
+ ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl);
}
spin_unlock(ptl);
@@ -5639,8 +5731,10 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
- if (huge_pte_none(entry)) {
- ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
+ /* PTE markers should be handled the same way as none pte */
+ if (huge_pte_none_mostly(entry)) {
+ ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
+ entry, flags);
goto out_mutex;
}
@@ -5657,14 +5751,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_mutex;
/*
- * If we are going to COW the mapping later, we examine the pending
- * reservations for this page now. This will ensure that any
+ * If we are going to COW/unshare the mapping later, we examine the
+ * pending reservations for this page now. This will ensure that any
* allocations necessary to record that reservation occur outside the
* spinlock. For private mappings, we also lookup the pagecache
* page now as it is used to determine if a reservation has been
* consumed.
*/
- if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
+ !huge_pte_write(entry)) {
if (vma_needs_reservation(h, vma, haddr) < 0) {
ret = VM_FAULT_OOM;
goto out_mutex;
@@ -5679,12 +5774,32 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ptl = huge_pte_lock(h, mm, ptep);
- /* Check for a racing update before calling hugetlb_cow */
+ /* Check for a racing update before calling hugetlb_wp() */
if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
goto out_ptl;
+ /* Handle userfault-wp first, before trying to lock more pages */
+ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
+ (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = haddr,
+ .real_address = address,
+ .flags = flags,
+ };
+
+ spin_unlock(ptl);
+ if (pagecache_page) {
+ unlock_page(pagecache_page);
+ put_page(pagecache_page);
+ }
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
+ return handle_userfault(&vmf, VM_UFFD_WP);
+ }
+
/*
- * hugetlb_cow() requires page locks of pte_page(entry) and
+ * hugetlb_wp() requires page locks of pte_page(entry) and
* pagecache_page, so here we need take the former one
* when page != pagecache_page or !pagecache_page.
*/
@@ -5697,13 +5812,14 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
get_page(page);
- if (flags & FAULT_FLAG_WRITE) {
+ if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
if (!huge_pte_write(entry)) {
- ret = hugetlb_cow(mm, vma, address, ptep,
- pagecache_page, ptl);
+ ret = hugetlb_wp(mm, vma, address, ptep, flags,
+ pagecache_page, ptl);
goto out_put_page;
+ } else if (likely(flags & FAULT_FLAG_WRITE)) {
+ entry = huge_pte_mkdirty(entry);
}
- entry = huge_pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
@@ -5746,7 +5862,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
unsigned long dst_addr,
unsigned long src_addr,
enum mcopy_atomic_mode mode,
- struct page **pagep)
+ struct page **pagep,
+ bool wp_copy)
{
bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
struct hstate *h = hstate_vma(dst_vma);
@@ -5876,27 +5993,43 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
goto out_release_unlock;
ret = -EEXIST;
- if (!huge_pte_none(huge_ptep_get(dst_pte)))
+ /*
+ * We allow to overwrite a pte marker: consider when both MISSING|WP
+ * registered, we firstly wr-protect a none pte which has no page cache
+ * page backing it, then access the page.
+ */
+ if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
goto out_release_unlock;
if (vm_shared) {
- page_dup_rmap(page, true);
+ page_dup_file_rmap(page, true);
} else {
ClearHPageRestoreReserve(page);
hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
}
- /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
- if (is_continue && !vm_shared)
+ /*
+ * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
+ * with wp flag set, don't set pte write bit.
+ */
+ if (wp_copy || (is_continue && !vm_shared))
writable = 0;
else
writable = dst_vma->vm_flags & VM_WRITE;
_dst_pte = make_huge_pte(dst_vma, page, writable);
- if (writable)
- _dst_pte = huge_pte_mkdirty(_dst_pte);
+ /*
+ * Always mark UFFDIO_COPY page dirty; note that this may not be
+ * extremely important for hugetlbfs for now since swapping is not
+ * supported, but we should still be clear in that this page cannot be
+ * thrown away at will, even if write bit not set.
+ */
+ _dst_pte = huge_pte_mkdirty(_dst_pte);
_dst_pte = pte_mkyoung(_dst_pte);
+ if (wp_copy)
+ _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
+
set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
(void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
@@ -5940,6 +6073,25 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
}
}
+static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
+ bool *unshare)
+{
+ pte_t pteval = huge_ptep_get(pte);
+
+ *unshare = false;
+ if (is_swap_pte(pteval))
+ return true;
+ if (huge_pte_write(pteval))
+ return false;
+ if (flags & FOLL_WRITE)
+ return true;
+ if (gup_must_unshare(flags, pte_page(pteval))) {
+ *unshare = true;
+ return true;
+ }
+ return false;
+}
+
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
@@ -5954,6 +6106,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
spinlock_t *ptl = NULL;
+ bool unshare = false;
int absent;
struct page *page;
@@ -6004,9 +6157,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* both cases, and because we can't follow correct pages
* directly from any kind of swap entries.
*/
- if (absent || is_swap_pte(huge_ptep_get(pte)) ||
- ((flags & FOLL_WRITE) &&
- !huge_pte_write(huge_ptep_get(pte)))) {
+ if (absent ||
+ __follow_hugetlb_must_fault(flags, pte, &unshare)) {
vm_fault_t ret;
unsigned int fault_flags = 0;
@@ -6014,6 +6166,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
spin_unlock(ptl);
if (flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
+ else if (unshare)
+ fault_flags |= FAULT_FLAG_UNSHARE;
if (locked)
fault_flags |= FAULT_FLAG_ALLOW_RETRY |
FAULT_FLAG_KILLABLE;
@@ -6055,6 +6209,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
page = pte_page(huge_ptep_get(pte));
+ VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+ !PageAnonExclusive(page), page);
+
/*
* If subpage information not requested, update counters
* and skip the same_page loop below.
@@ -6117,16 +6274,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
- unsigned long address, unsigned long end, pgprot_t newprot)
+ unsigned long address, unsigned long end,
+ pgprot_t newprot, unsigned long cp_flags)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long start = address;
pte_t *ptep;
pte_t pte;
struct hstate *h = hstate_vma(vma);
- unsigned long pages = 0;
+ unsigned long pages = 0, psize = huge_page_size(h);
bool shared_pmd = false;
struct mmu_notifier_range range;
+ bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+ bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
/*
* In the case of shared PMDs, the area to flush could be beyond
@@ -6142,13 +6302,19 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
i_mmap_lock_write(vma->vm_file->f_mapping);
- for (; address < end; address += huge_page_size(h)) {
+ for (; address < end; address += psize) {
spinlock_t *ptl;
- ptep = huge_pte_offset(mm, address, huge_page_size(h));
+ ptep = huge_pte_offset(mm, address, psize);
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+ /*
+ * When uffd-wp is enabled on the vma, unshare
+ * shouldn't happen at all. Warn about it if it
+ * happened due to some reason.
+ */
+ WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
pages++;
spin_unlock(ptl);
shared_pmd = true;
@@ -6161,20 +6327,37 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
}
if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
+ struct page *page = pfn_swap_entry_to_page(entry);
- if (is_writable_migration_entry(entry)) {
+ if (!is_readable_migration_entry(entry)) {
pte_t newpte;
- entry = make_readable_migration_entry(
- swp_offset(entry));
+ if (PageAnon(page))
+ entry = make_readable_exclusive_migration_entry(
+ swp_offset(entry));
+ else
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
newpte = swp_entry_to_pte(entry);
+ if (uffd_wp)
+ newpte = pte_swp_mkuffd_wp(newpte);
+ else if (uffd_wp_resolve)
+ newpte = pte_swp_clear_uffd_wp(newpte);
set_huge_swap_pte_at(mm, address, ptep,
- newpte, huge_page_size(h));
+ newpte, psize);
pages++;
}
spin_unlock(ptl);
continue;
}
+ if (unlikely(pte_marker_uffd_wp(pte))) {
+ /*
+ * This is changing a non-present pte into a none pte,
+ * no need for huge_ptep_modify_prot_start/commit().
+ */
+ if (uffd_wp_resolve)
+ huge_pte_clear(mm, address, ptep, psize);
+ }
if (!huge_pte_none(pte)) {
pte_t old_pte;
unsigned int shift = huge_page_shift(hstate_vma(vma));
@@ -6182,8 +6365,18 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
pte = huge_pte_modify(old_pte, newprot);
pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
+ if (uffd_wp)
+ pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte));
+ else if (uffd_wp_resolve)
+ pte = huge_pte_clear_uffd_wp(pte);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
+ } else {
+ /* None pte */
+ if (unlikely(uffd_wp))
+ /* Safe to modify directly (none->non-present). */
+ set_huge_pte_at(mm, address, ptep,
+ make_pte_marker(PTE_MARKER_UFFD_WP));
}
spin_unlock(ptl);
}
@@ -6693,9 +6886,11 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
spinlock_t *ptl;
pte_t pte;
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
- (FOLL_PIN | FOLL_GET)))
+ /*
+ * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
+ * follow_hugetlb_page().
+ */
+ if (WARN_ON_ONCE(flags & FOLL_PIN))
return NULL;
retry:
@@ -6783,7 +6978,9 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
spin_lock_irq(&hugetlb_lock);
if (PageHeadHuge(page)) {
*hugetlb = true;
- if (HPageFreed(page) || HPageMigratable(page))
+ if (HPageFreed(page))
+ ret = 0;
+ else if (HPageMigratable(page))
ret = get_page_unless_zero(page);
else
ret = -EBUSY;
@@ -6873,6 +7070,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
if (start >= end)
return;
+ flush_cache_range(vma, start, end);
/*
* No need to call adjust_range_if_pmd_sharing_possible(), because
* we have already done the PUD_SIZE alignment.
@@ -6958,7 +7156,7 @@ void __init hugetlb_cma_reserve(int order)
if (hugetlb_cma_size_in_node[nid] == 0)
continue;
- if (!node_state(nid, N_ONLINE)) {
+ if (!node_online(nid)) {
pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
hugetlb_cma_size_in_node[nid] = 0;
@@ -6997,7 +7195,7 @@ void __init hugetlb_cma_reserve(int order)
}
reserved = 0;
- for_each_node_state(nid, N_ONLINE) {
+ for_each_online_node(nid) {
int res;
char name[CMA_MAX_NAME];