diff options
Diffstat (limited to 'mm/memory.c')
-rw-r--r-- | mm/memory.c | 299 |
1 files changed, 257 insertions, 42 deletions
diff --git a/mm/memory.c b/mm/memory.c index fe2fba27ded2..ec4e15494901 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -49,6 +49,7 @@ #include <linux/swap.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/memremap.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/export.h> @@ -817,8 +818,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, #else # define HAVE_PTE_SPECIAL 0 #endif -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte) +struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte, bool with_public_device) { unsigned long pfn = pte_pfn(pte); @@ -829,8 +830,31 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return vma->vm_ops->find_special_page(vma, addr); if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; - if (!is_zero_pfn(pfn)) - print_bad_pte(vma, addr, pte, NULL); + if (is_zero_pfn(pfn)) + return NULL; + + /* + * Device public pages are special pages (they are ZONE_DEVICE + * pages but different from persistent memory). They behave + * allmost like normal pages. The difference is that they are + * not on the lru and thus should never be involve with any- + * thing that involve lru manipulation (mlock, numa balancing, + * ...). + * + * This is why we still want to return NULL for such page from + * vm_normal_page() so that we do not have to special case all + * call site of vm_normal_page(). + */ + if (likely(pfn < highest_memmap_pfn)) { + struct page *page = pfn_to_page(pfn); + + if (is_device_public_page(page)) { + if (with_public_device) + return page; + return NULL; + } + } + print_bad_pte(vma, addr, pte, NULL); return NULL; } @@ -956,6 +980,35 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_swp_mksoft_dirty(pte); set_pte_at(src_mm, addr, src_pte, pte); } + } else if (is_device_private_entry(entry)) { + page = device_private_entry_to_page(entry); + + /* + * Update rss count even for unaddressable pages, as + * they should treated just like normal pages in this + * respect. + * + * We will likely want to have some new rss counters + * for unaddressable pages, at some point. But for now + * keep things as they are. + */ + get_page(page); + rss[mm_counter(page)]++; + page_dup_rmap(page, false); + + /* + * We do not preserve soft-dirty information, because so + * far, checkpoint/restore is the only feature that + * requires that. And checkpoint/restore does not work + * when a device driver is involved (you cannot easily + * save and restore device driver state). + */ + if (is_write_device_private_entry(entry) && + is_cow_mapping(vm_flags)) { + make_device_private_entry_read(&entry); + pte = swp_entry_to_pte(entry); + set_pte_at(src_mm, addr, src_pte, pte); + } } goto out_set_pte; } @@ -982,6 +1035,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(page); page_dup_rmap(page, false); rss[mm_counter(page)]++; + } else if (pte_devmap(pte)) { + page = pte_page(pte); + + /* + * Cache coherent device memory behave like regular page and + * not like persistent memory page. For more informations see + * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h + */ + if (is_device_public_page(page)) { + get_page(page); + page_dup_rmap(page, false); + rss[mm_counter(page)]++; + } } out_set_pte: @@ -1065,7 +1131,8 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { + if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) + || pmd_devmap(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma); err = copy_huge_pmd(dst_mm, src_mm, @@ -1236,7 +1303,7 @@ again: if (pte_present(ptent)) { struct page *page; - page = vm_normal_page(vma, addr, ptent); + page = _vm_normal_page(vma, addr, ptent, true); if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to @@ -1273,6 +1340,29 @@ again: } continue; } + + entry = pte_to_swp_entry(ptent); + if (non_swap_entry(entry) && is_device_private_entry(entry)) { + struct page *page = device_private_entry_to_page(entry); + + if (unlikely(details && details->check_mapping)) { + /* + * unmap_shared_mapping_pages() wants to + * invalidate cache without truncating: + * unmap shared but keep private pages. + */ + if (details->check_mapping != + page_rmapping(page)) + continue; + } + + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + rss[mm_counter(page)]--; + page_remove_rmap(page, false); + put_page(page); + continue; + } + /* If details->check_mapping, we leave swap entries. */ if (unlikely(details)) continue; @@ -1326,7 +1416,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { VM_BUG_ON_VMA(vma_is_anonymous(vma) && !rwsem_is_locked(&tlb->mm->mmap_sem), vma); @@ -1513,8 +1603,20 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, tlb_gather_mmu(&tlb, mm, start, end); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, start, end); - for ( ; vma && vma->vm_start < end; vma = vma->vm_next) + for ( ; vma && vma->vm_start < end; vma = vma->vm_next) { unmap_single_vma(&tlb, vma, start, end, NULL); + + /* + * zap_page_range does not specify whether mmap_sem should be + * held for read or write. That allows parallel zap_page_range + * operations to unmap a PTE and defer a flush meaning that + * this call observes pte_none and fails to flush the TLB. + * Rather than adding a complex API, ensure that no stale + * TLB entries exist when this call returns. + */ + flush_tlb_range(vma, start, end); + } + mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); } @@ -1676,7 +1778,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, EXPORT_SYMBOL(vm_insert_page); static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn, pgprot_t prot) + pfn_t pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; int retval; @@ -1688,14 +1790,35 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, if (!pte) goto out; retval = -EBUSY; - if (!pte_none(*pte)) - goto out_unlock; + if (!pte_none(*pte)) { + if (mkwrite) { + /* + * For read faults on private mappings the PFN passed + * in may not match the PFN we have mapped if the + * mapped PFN is a writeable COW page. In the mkwrite + * case we are creating a writable PTE for a shared + * mapping and we expect the PFNs to match. + */ + if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn))) + goto out_unlock; + entry = *pte; + goto out_mkwrite; + } else + goto out_unlock; + } /* Ok, finally just insert the thing.. */ if (pfn_t_devmap(pfn)) entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); else entry = pte_mkspecial(pfn_t_pte(pfn, prot)); + +out_mkwrite: + if (mkwrite) { + entry = pte_mkyoung(entry); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + } + set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ @@ -1766,14 +1889,15 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); - ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot); + ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, + false); return ret; } EXPORT_SYMBOL(vm_insert_pfn_prot); -int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) +static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; @@ -1802,10 +1926,24 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, page = pfn_to_page(pfn_t_to_pfn(pfn)); return insert_page(vma, addr, page, pgprot); } - return insert_pfn(vma, addr, pfn, pgprot); + return insert_pfn(vma, addr, pfn, pgprot, mkwrite); +} + +int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn) +{ + return __vm_insert_mixed(vma, addr, pfn, false); + } EXPORT_SYMBOL(vm_insert_mixed); +int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn) +{ + return __vm_insert_mixed(vma, addr, pfn, true); +} +EXPORT_SYMBOL(vm_insert_mixed_mkwrite); + /* * maps a range of physical memory into the requested pages. the old * mappings are removed. any references to nonexistent pages results @@ -2571,7 +2709,7 @@ static int do_wp_page(struct vm_fault *vmf) * not dirty accountable. */ if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { - int total_mapcount; + int total_map_swapcount; if (!trylock_page(vmf->page)) { get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2586,8 +2724,8 @@ static int do_wp_page(struct vm_fault *vmf) } put_page(vmf->page); } - if (reuse_swap_page(vmf->page, &total_mapcount)) { - if (total_mapcount == 1) { + if (reuse_swap_page(vmf->page, &total_map_swapcount)) { + if (total_map_swapcount == 1) { /* * The page is all ours. Move it to * our anon_vma so the rmap code will @@ -2623,7 +2761,7 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, zap_page_range_single(vma, start_addr, end_addr - start_addr, details); } -static inline void unmap_mapping_range_tree(struct rb_root *root, +static inline void unmap_mapping_range_tree(struct rb_root_cached *root, struct zap_details *details) { struct vm_area_struct *vma; @@ -2687,7 +2825,7 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; i_mmap_lock_write(mapping); - if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, &details); i_mmap_unlock_write(mapping); } @@ -2704,22 +2842,37 @@ EXPORT_SYMBOL(unmap_mapping_range); int do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page, *swapcache; + struct page *page = NULL, *swapcache; struct mem_cgroup *memcg; + struct vma_swap_readahead swap_ra; swp_entry_t entry; pte_t pte; int locked; int exclusive = 0; int ret = 0; + bool vma_readahead = swap_use_vma_readahead(); - if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) + if (vma_readahead) + page = swap_readahead_detect(vmf, &swap_ra); + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) { + if (page) + put_page(page); goto out; + } entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); + } else if (is_device_private_entry(entry)) { + /* + * For un-addressable device memory we call the pgmap + * fault handler callback. The callback must migrate + * the page back to some CPU accessible page. + */ + ret = device_private_entry_fault(vma, vmf->address, entry, + vmf->flags, vmf->pmd); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { @@ -2729,10 +2882,16 @@ int do_swap_page(struct vm_fault *vmf) goto out; } delayacct_set_flag(DELAYACCT_PF_SWAPIN); - page = lookup_swap_cache(entry); + if (!page) + page = lookup_swap_cache(entry, vma_readahead ? vma : NULL, + vmf->address); if (!page) { - page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, - vmf->address); + if (vma_readahead) + page = do_swap_page_readahead(entry, + GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); + else + page = swapin_readahead(entry, + GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!page) { /* * Back out if somebody else faulted in this pte @@ -3802,6 +3961,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, .pgoff = linear_page_index(vma, address), .gfp_mask = __get_fault_gfp_mask(vma), }; + unsigned int dirty = flags & FAULT_FLAG_WRITE; struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; p4d_t *p4d; @@ -3824,7 +3984,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, barrier(); if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { - unsigned int dirty = flags & FAULT_FLAG_WRITE; /* NUMA case for anonymous PUDs would go here */ @@ -3850,12 +4009,18 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, pmd_t orig_pmd = *vmf.pmd; barrier(); + if (unlikely(is_swap_pmd(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + if (is_pmd_migration_entry(orig_pmd)) + pmd_migration_entry_wait(mm, vmf.pmd); + return 0; + } if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf, orig_pmd); - if ((vmf.flags & FAULT_FLAG_WRITE) && - !pmd_write(orig_pmd)) { + if (dirty && !pmd_write(orig_pmd)) { ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -3888,6 +4053,11 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, /* do counter updates before entering really critical section. */ check_sync_rss_stat(current); + if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, + flags & FAULT_FLAG_INSTRUCTION, + flags & FAULT_FLAG_REMOTE)) + return VM_FAULT_SIGSEGV; + /* * Enable the memcg OOM handling for faults triggered in user * space. Kernel faults are handled more gracefully. @@ -3895,11 +4065,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (flags & FAULT_FLAG_USER) mem_cgroup_oom_enable(); - if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, - flags & FAULT_FLAG_INSTRUCTION, - flags & FAULT_FLAG_REMOTE)) - return VM_FAULT_SIGSEGV; - if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); else @@ -4008,7 +4173,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) #endif /* __PAGETABLE_PMD_FOLDED */ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) + unsigned long *start, unsigned long *end, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { pgd_t *pgd; p4d_t *p4d; @@ -4035,17 +4201,29 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, if (!pmdpp) goto out; + if (start && end) { + *start = address & PMD_MASK; + *end = *start + PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, *start, *end); + } *ptlp = pmd_lock(mm, pmd); if (pmd_huge(*pmd)) { *pmdpp = pmd; return 0; } spin_unlock(*ptlp); + if (start && end) + mmu_notifier_invalidate_range_end(mm, *start, *end); } if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; + if (start && end) { + *start = address & PAGE_MASK; + *end = *start + PAGE_SIZE; + mmu_notifier_invalidate_range_start(mm, *start, *end); + } ptep = pte_offset_map_lock(mm, pmd, address, ptlp); if (!pte_present(*ptep)) goto unlock; @@ -4053,6 +4231,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, return 0; unlock: pte_unmap_unlock(ptep, *ptlp); + if (start && end) + mmu_notifier_invalidate_range_end(mm, *start, *end); out: return -EINVAL; } @@ -4064,20 +4244,21 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address, /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, ptepp, NULL, - ptlp))); + !(res = __follow_pte_pmd(mm, address, NULL, NULL, + ptepp, NULL, ptlp))); return res; } int follow_pte_pmd(struct mm_struct *mm, unsigned long address, + unsigned long *start, unsigned long *end, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { int res; /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp, - ptlp))); + !(res = __follow_pte_pmd(mm, address, start, end, + ptepp, pmdpp, ptlp))); return res; } EXPORT_SYMBOL(follow_pte_pmd); @@ -4340,19 +4521,53 @@ static void clear_gigantic_page(struct page *page, } } void clear_huge_page(struct page *page, - unsigned long addr, unsigned int pages_per_huge_page) + unsigned long addr_hint, unsigned int pages_per_huge_page) { - int i; + int i, n, base, l; + unsigned long addr = addr_hint & + ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { clear_gigantic_page(page, addr, pages_per_huge_page); return; } + /* Clear sub-page to access last to keep its cache lines hot */ might_sleep(); - for (i = 0; i < pages_per_huge_page; i++) { + n = (addr_hint - addr) / PAGE_SIZE; + if (2 * n <= pages_per_huge_page) { + /* If sub-page to access in first half of huge page */ + base = 0; + l = n; + /* Clear sub-pages at the end of huge page */ + for (i = pages_per_huge_page - 1; i >= 2 * n; i--) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } + } else { + /* If sub-page to access in second half of huge page */ + base = pages_per_huge_page - 2 * (pages_per_huge_page - n); + l = pages_per_huge_page - n; + /* Clear sub-pages at the begin of huge page */ + for (i = 0; i < base; i++) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } + } + /* + * Clear remaining sub-pages in left-right-left-right pattern + * towards the sub-page to access + */ + for (i = 0; i < l; i++) { + int left_idx = base + i; + int right_idx = base + 2 * l - 1 - i; + + cond_resched(); + clear_user_highpage(page + left_idx, + addr + left_idx * PAGE_SIZE); cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); + clear_user_highpage(page + right_idx, + addr + right_idx * PAGE_SIZE); } } |