summaryrefslogtreecommitdiffstats
path: root/mm/memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c299
1 files changed, 257 insertions, 42 deletions
diff --git a/mm/memory.c b/mm/memory.c
index fe2fba27ded2..ec4e15494901 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -49,6 +49,7 @@
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/memremap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
@@ -817,8 +818,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
#else
# define HAVE_PTE_SPECIAL 0
#endif
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte)
+struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte, bool with_public_device)
{
unsigned long pfn = pte_pfn(pte);
@@ -829,8 +830,31 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return vma->vm_ops->find_special_page(vma, addr);
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
- if (!is_zero_pfn(pfn))
- print_bad_pte(vma, addr, pte, NULL);
+ if (is_zero_pfn(pfn))
+ return NULL;
+
+ /*
+ * Device public pages are special pages (they are ZONE_DEVICE
+ * pages but different from persistent memory). They behave
+ * allmost like normal pages. The difference is that they are
+ * not on the lru and thus should never be involve with any-
+ * thing that involve lru manipulation (mlock, numa balancing,
+ * ...).
+ *
+ * This is why we still want to return NULL for such page from
+ * vm_normal_page() so that we do not have to special case all
+ * call site of vm_normal_page().
+ */
+ if (likely(pfn < highest_memmap_pfn)) {
+ struct page *page = pfn_to_page(pfn);
+
+ if (is_device_public_page(page)) {
+ if (with_public_device)
+ return page;
+ return NULL;
+ }
+ }
+ print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
@@ -956,6 +980,35 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte = pte_swp_mksoft_dirty(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
+ } else if (is_device_private_entry(entry)) {
+ page = device_private_entry_to_page(entry);
+
+ /*
+ * Update rss count even for unaddressable pages, as
+ * they should treated just like normal pages in this
+ * respect.
+ *
+ * We will likely want to have some new rss counters
+ * for unaddressable pages, at some point. But for now
+ * keep things as they are.
+ */
+ get_page(page);
+ rss[mm_counter(page)]++;
+ page_dup_rmap(page, false);
+
+ /*
+ * We do not preserve soft-dirty information, because so
+ * far, checkpoint/restore is the only feature that
+ * requires that. And checkpoint/restore does not work
+ * when a device driver is involved (you cannot easily
+ * save and restore device driver state).
+ */
+ if (is_write_device_private_entry(entry) &&
+ is_cow_mapping(vm_flags)) {
+ make_device_private_entry_read(&entry);
+ pte = swp_entry_to_pte(entry);
+ set_pte_at(src_mm, addr, src_pte, pte);
+ }
}
goto out_set_pte;
}
@@ -982,6 +1035,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
get_page(page);
page_dup_rmap(page, false);
rss[mm_counter(page)]++;
+ } else if (pte_devmap(pte)) {
+ page = pte_page(pte);
+
+ /*
+ * Cache coherent device memory behave like regular page and
+ * not like persistent memory page. For more informations see
+ * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h
+ */
+ if (is_device_public_page(page)) {
+ get_page(page);
+ page_dup_rmap(page, false);
+ rss[mm_counter(page)]++;
+ }
}
out_set_pte:
@@ -1065,7 +1131,8 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
+ if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
+ || pmd_devmap(*src_pmd)) {
int err;
VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
err = copy_huge_pmd(dst_mm, src_mm,
@@ -1236,7 +1303,7 @@ again:
if (pte_present(ptent)) {
struct page *page;
- page = vm_normal_page(vma, addr, ptent);
+ page = _vm_normal_page(vma, addr, ptent, true);
if (unlikely(details) && page) {
/*
* unmap_shared_mapping_pages() wants to
@@ -1273,6 +1340,29 @@ again:
}
continue;
}
+
+ entry = pte_to_swp_entry(ptent);
+ if (non_swap_entry(entry) && is_device_private_entry(entry)) {
+ struct page *page = device_private_entry_to_page(entry);
+
+ if (unlikely(details && details->check_mapping)) {
+ /*
+ * unmap_shared_mapping_pages() wants to
+ * invalidate cache without truncating:
+ * unmap shared but keep private pages.
+ */
+ if (details->check_mapping !=
+ page_rmapping(page))
+ continue;
+ }
+
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ rss[mm_counter(page)]--;
+ page_remove_rmap(page, false);
+ put_page(page);
+ continue;
+ }
+
/* If details->check_mapping, we leave swap entries. */
if (unlikely(details))
continue;
@@ -1326,7 +1416,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
+ if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE) {
VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
@@ -1513,8 +1603,20 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(mm, start, end);
- for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
+ for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
unmap_single_vma(&tlb, vma, start, end, NULL);
+
+ /*
+ * zap_page_range does not specify whether mmap_sem should be
+ * held for read or write. That allows parallel zap_page_range
+ * operations to unmap a PTE and defer a flush meaning that
+ * this call observes pte_none and fails to flush the TLB.
+ * Rather than adding a complex API, ensure that no stale
+ * TLB entries exist when this call returns.
+ */
+ flush_tlb_range(vma, start, end);
+ }
+
mmu_notifier_invalidate_range_end(mm, start, end);
tlb_finish_mmu(&tlb, start, end);
}
@@ -1676,7 +1778,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
EXPORT_SYMBOL(vm_insert_page);
static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, pgprot_t prot)
+ pfn_t pfn, pgprot_t prot, bool mkwrite)
{
struct mm_struct *mm = vma->vm_mm;
int retval;
@@ -1688,14 +1790,35 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
if (!pte)
goto out;
retval = -EBUSY;
- if (!pte_none(*pte))
- goto out_unlock;
+ if (!pte_none(*pte)) {
+ if (mkwrite) {
+ /*
+ * For read faults on private mappings the PFN passed
+ * in may not match the PFN we have mapped if the
+ * mapped PFN is a writeable COW page. In the mkwrite
+ * case we are creating a writable PTE for a shared
+ * mapping and we expect the PFNs to match.
+ */
+ if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
+ goto out_unlock;
+ entry = *pte;
+ goto out_mkwrite;
+ } else
+ goto out_unlock;
+ }
/* Ok, finally just insert the thing.. */
if (pfn_t_devmap(pfn))
entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
else
entry = pte_mkspecial(pfn_t_pte(pfn, prot));
+
+out_mkwrite:
+ if (mkwrite) {
+ entry = pte_mkyoung(entry);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ }
+
set_pte_at(mm, addr, pte, entry);
update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
@@ -1766,14 +1889,15 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
- ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
+ ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
+ false);
return ret;
}
EXPORT_SYMBOL(vm_insert_pfn_prot);
-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn)
+static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn, bool mkwrite)
{
pgprot_t pgprot = vma->vm_page_prot;
@@ -1802,10 +1926,24 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
page = pfn_to_page(pfn_t_to_pfn(pfn));
return insert_page(vma, addr, page, pgprot);
}
- return insert_pfn(vma, addr, pfn, pgprot);
+ return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
+}
+
+int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn)
+{
+ return __vm_insert_mixed(vma, addr, pfn, false);
+
}
EXPORT_SYMBOL(vm_insert_mixed);
+int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn)
+{
+ return __vm_insert_mixed(vma, addr, pfn, true);
+}
+EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
+
/*
* maps a range of physical memory into the requested pages. the old
* mappings are removed. any references to nonexistent pages results
@@ -2571,7 +2709,7 @@ static int do_wp_page(struct vm_fault *vmf)
* not dirty accountable.
*/
if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
- int total_mapcount;
+ int total_map_swapcount;
if (!trylock_page(vmf->page)) {
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2586,8 +2724,8 @@ static int do_wp_page(struct vm_fault *vmf)
}
put_page(vmf->page);
}
- if (reuse_swap_page(vmf->page, &total_mapcount)) {
- if (total_mapcount == 1) {
+ if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
+ if (total_map_swapcount == 1) {
/*
* The page is all ours. Move it to
* our anon_vma so the rmap code will
@@ -2623,7 +2761,7 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}
-static inline void unmap_mapping_range_tree(struct rb_root *root,
+static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
struct zap_details *details)
{
struct vm_area_struct *vma;
@@ -2687,7 +2825,7 @@ void unmap_mapping_range(struct address_space *mapping,
details.last_index = ULONG_MAX;
i_mmap_lock_write(mapping);
- if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
+ if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
i_mmap_unlock_write(mapping);
}
@@ -2704,22 +2842,37 @@ EXPORT_SYMBOL(unmap_mapping_range);
int do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page, *swapcache;
+ struct page *page = NULL, *swapcache;
struct mem_cgroup *memcg;
+ struct vma_swap_readahead swap_ra;
swp_entry_t entry;
pte_t pte;
int locked;
int exclusive = 0;
int ret = 0;
+ bool vma_readahead = swap_use_vma_readahead();
- if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
+ if (vma_readahead)
+ page = swap_readahead_detect(vmf, &swap_ra);
+ if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) {
+ if (page)
+ put_page(page);
goto out;
+ }
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
migration_entry_wait(vma->vm_mm, vmf->pmd,
vmf->address);
+ } else if (is_device_private_entry(entry)) {
+ /*
+ * For un-addressable device memory we call the pgmap
+ * fault handler callback. The callback must migrate
+ * the page back to some CPU accessible page.
+ */
+ ret = device_private_entry_fault(vma, vmf->address, entry,
+ vmf->flags, vmf->pmd);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else {
@@ -2729,10 +2882,16 @@ int do_swap_page(struct vm_fault *vmf)
goto out;
}
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
- page = lookup_swap_cache(entry);
+ if (!page)
+ page = lookup_swap_cache(entry, vma_readahead ? vma : NULL,
+ vmf->address);
if (!page) {
- page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma,
- vmf->address);
+ if (vma_readahead)
+ page = do_swap_page_readahead(entry,
+ GFP_HIGHUSER_MOVABLE, vmf, &swap_ra);
+ else
+ page = swapin_readahead(entry,
+ GFP_HIGHUSER_MOVABLE, vma, vmf->address);
if (!page) {
/*
* Back out if somebody else faulted in this pte
@@ -3802,6 +3961,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
.pgoff = linear_page_index(vma, address),
.gfp_mask = __get_fault_gfp_mask(vma),
};
+ unsigned int dirty = flags & FAULT_FLAG_WRITE;
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
p4d_t *p4d;
@@ -3824,7 +3984,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
barrier();
if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
- unsigned int dirty = flags & FAULT_FLAG_WRITE;
/* NUMA case for anonymous PUDs would go here */
@@ -3850,12 +4009,18 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
pmd_t orig_pmd = *vmf.pmd;
barrier();
+ if (unlikely(is_swap_pmd(orig_pmd))) {
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(orig_pmd));
+ if (is_pmd_migration_entry(orig_pmd))
+ pmd_migration_entry_wait(mm, vmf.pmd);
+ return 0;
+ }
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
return do_huge_pmd_numa_page(&vmf, orig_pmd);
- if ((vmf.flags & FAULT_FLAG_WRITE) &&
- !pmd_write(orig_pmd)) {
+ if (dirty && !pmd_write(orig_pmd)) {
ret = wp_huge_pmd(&vmf, orig_pmd);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -3888,6 +4053,11 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
+ if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+ flags & FAULT_FLAG_INSTRUCTION,
+ flags & FAULT_FLAG_REMOTE))
+ return VM_FAULT_SIGSEGV;
+
/*
* Enable the memcg OOM handling for faults triggered in user
* space. Kernel faults are handled more gracefully.
@@ -3895,11 +4065,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
if (flags & FAULT_FLAG_USER)
mem_cgroup_oom_enable();
- if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
- flags & FAULT_FLAG_INSTRUCTION,
- flags & FAULT_FLAG_REMOTE))
- return VM_FAULT_SIGSEGV;
-
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
@@ -4008,7 +4173,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
#endif /* __PAGETABLE_PMD_FOLDED */
static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+ unsigned long *start, unsigned long *end,
+ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -4035,17 +4201,29 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
if (!pmdpp)
goto out;
+ if (start && end) {
+ *start = address & PMD_MASK;
+ *end = *start + PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, *start, *end);
+ }
*ptlp = pmd_lock(mm, pmd);
if (pmd_huge(*pmd)) {
*pmdpp = pmd;
return 0;
}
spin_unlock(*ptlp);
+ if (start && end)
+ mmu_notifier_invalidate_range_end(mm, *start, *end);
}
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
+ if (start && end) {
+ *start = address & PAGE_MASK;
+ *end = *start + PAGE_SIZE;
+ mmu_notifier_invalidate_range_start(mm, *start, *end);
+ }
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
goto unlock;
@@ -4053,6 +4231,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
return 0;
unlock:
pte_unmap_unlock(ptep, *ptlp);
+ if (start && end)
+ mmu_notifier_invalidate_range_end(mm, *start, *end);
out:
return -EINVAL;
}
@@ -4064,20 +4244,21 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address,
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, ptepp, NULL,
- ptlp)));
+ !(res = __follow_pte_pmd(mm, address, NULL, NULL,
+ ptepp, NULL, ptlp)));
return res;
}
int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
+ unsigned long *start, unsigned long *end,
pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
int res;
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp,
- ptlp)));
+ !(res = __follow_pte_pmd(mm, address, start, end,
+ ptepp, pmdpp, ptlp)));
return res;
}
EXPORT_SYMBOL(follow_pte_pmd);
@@ -4340,19 +4521,53 @@ static void clear_gigantic_page(struct page *page,
}
}
void clear_huge_page(struct page *page,
- unsigned long addr, unsigned int pages_per_huge_page)
+ unsigned long addr_hint, unsigned int pages_per_huge_page)
{
- int i;
+ int i, n, base, l;
+ unsigned long addr = addr_hint &
+ ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
clear_gigantic_page(page, addr, pages_per_huge_page);
return;
}
+ /* Clear sub-page to access last to keep its cache lines hot */
might_sleep();
- for (i = 0; i < pages_per_huge_page; i++) {
+ n = (addr_hint - addr) / PAGE_SIZE;
+ if (2 * n <= pages_per_huge_page) {
+ /* If sub-page to access in first half of huge page */
+ base = 0;
+ l = n;
+ /* Clear sub-pages at the end of huge page */
+ for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
+ cond_resched();
+ clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ }
+ } else {
+ /* If sub-page to access in second half of huge page */
+ base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
+ l = pages_per_huge_page - n;
+ /* Clear sub-pages at the begin of huge page */
+ for (i = 0; i < base; i++) {
+ cond_resched();
+ clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ }
+ }
+ /*
+ * Clear remaining sub-pages in left-right-left-right pattern
+ * towards the sub-page to access
+ */
+ for (i = 0; i < l; i++) {
+ int left_idx = base + i;
+ int right_idx = base + 2 * l - 1 - i;
+
+ cond_resched();
+ clear_user_highpage(page + left_idx,
+ addr + left_idx * PAGE_SIZE);
cond_resched();
- clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ clear_user_highpage(page + right_idx,
+ addr + right_idx * PAGE_SIZE);
}
}