diff options
author | Dan Williams <dan.j.williams@intel.com> | 2016-01-16 01:56:55 +0100 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-01-16 02:56:32 +0100 |
commit | 3565fce3a6597e91b8dee3e8e36ebf70f8b7ef9b (patch) | |
tree | 54f05861c87cb2c2710552b61a46cb5831b06296 /mm | |
parent | mm, dax: dax-pmd vs thp-pmd vs hugetlbfs-pmd (diff) | |
download | linux-3565fce3a6597e91b8dee3e8e36ebf70f8b7ef9b.tar.xz linux-3565fce3a6597e91b8dee3e8e36ebf70f8b7ef9b.zip |
mm, x86: get_user_pages() for dax mappings
A dax mapping establishes a pte with _PAGE_DEVMAP set when the driver
has established a devm_memremap_pages() mapping, i.e. when the pfn_t
return from ->direct_access() has PFN_DEV and PFN_MAP set. Later, when
encountering _PAGE_DEVMAP during a page table walk we lookup and pin a
struct dev_pagemap instance to keep the result of pfn_to_page() valid
until put_page().
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Logan Gunthorpe <logang@deltatee.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/gup.c | 30 | ||||
-rw-r--r-- | mm/huge_memory.c | 75 | ||||
-rw-r--r-- | mm/swap.c | 1 |
3 files changed, 89 insertions, 17 deletions
@@ -4,6 +4,7 @@ #include <linux/spinlock.h> #include <linux/mm.h> +#include <linux/memremap.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> @@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { struct mm_struct *mm = vma->vm_mm; + struct dev_pagemap *pgmap = NULL; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; @@ -98,7 +100,17 @@ retry: } page = vm_normal_page(vma, address, pte); - if (unlikely(!page)) { + if (!page && pte_devmap(pte) && (flags & FOLL_GET)) { + /* + * Only return device mapping pages in the FOLL_GET case since + * they are only valid while holding the pgmap reference. + */ + pgmap = get_dev_pagemap(pte_pfn(pte), NULL); + if (pgmap) + page = pte_page(pte); + else + goto no_page; + } else if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); @@ -129,8 +141,15 @@ retry: goto retry; } - if (flags & FOLL_GET) + if (flags & FOLL_GET) { get_page(page); + + /* drop the pgmap reference now that we hold the page */ + if (pgmap) { + put_dev_pagemap(pgmap); + pgmap = NULL; + } + } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) @@ -237,6 +256,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma, } if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) return no_page_table(vma, flags); + if (pmd_devmap(*pmd)) { + ptl = pmd_lock(mm, pmd); + page = follow_devmap_pmd(vma, address, pmd, flags); + spin_unlock(ptl); + if (page) + return page; + } if (likely(!pmd_trans_huge(*pmd))) return follow_page_pte(vma, address, pmd, flags); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 82bed2bec3ed..b2db98136af9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -23,6 +23,7 @@ #include <linux/freezer.h> #include <linux/pfn_t.h> #include <linux/mman.h> +#include <linux/memremap.h> #include <linux/pagemap.h> #include <linux/debugfs.h> #include <linux/migrate.h> @@ -974,6 +975,63 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, return VM_FAULT_NOPAGE; } +static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd) +{ + pmd_t _pmd; + + /* + * We should set the dirty bit only for FOLL_WRITE but for now + * the dirty bit in the pmd is meaningless. And if the dirty + * bit will become meaningful and we'll only set it with + * FOLL_WRITE, an atomic set_bit will be required on the pmd to + * set the young bit, instead of the current set_pmd_at. + */ + _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); + if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, + pmd, _pmd, 1)) + update_mmu_cache_pmd(vma, addr, pmd); +} + +struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, int flags) +{ + unsigned long pfn = pmd_pfn(*pmd); + struct mm_struct *mm = vma->vm_mm; + struct dev_pagemap *pgmap; + struct page *page; + + assert_spin_locked(pmd_lockptr(mm, pmd)); + + if (flags & FOLL_WRITE && !pmd_write(*pmd)) + return NULL; + + if (pmd_present(*pmd) && pmd_devmap(*pmd)) + /* pass */; + else + return NULL; + + if (flags & FOLL_TOUCH) + touch_pmd(vma, addr, pmd); + + /* + * device mapped pages can only be returned if the + * caller will manage the page reference count. + */ + if (!(flags & FOLL_GET)) + return ERR_PTR(-EEXIST); + + pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; + pgmap = get_dev_pagemap(pfn, NULL); + if (!pgmap) + return ERR_PTR(-EFAULT); + page = pfn_to_page(pfn); + get_page(page); + put_dev_pagemap(pgmap); + + return page; +} + int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma) @@ -1331,21 +1389,8 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, page = pmd_page(*pmd); VM_BUG_ON_PAGE(!PageHead(page), page); - if (flags & FOLL_TOUCH) { - pmd_t _pmd; - /* - * We should set the dirty bit only for FOLL_WRITE but - * for now the dirty bit in the pmd is meaningless. - * And if the dirty bit will become meaningful and - * we'll only set it with FOLL_WRITE, an atomic - * set_bit will be required on the pmd to set the - * young bit, instead of the current set_pmd_at. - */ - _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); - if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, - pmd, _pmd, 1)) - update_mmu_cache_pmd(vma, addr, pmd); - } + if (flags & FOLL_TOUCH) + touch_pmd(vma, addr, pmd); if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* * We don't mlock() pte-mapped THPs. This way we can avoid diff --git a/mm/swap.c b/mm/swap.c index 674e2c93da4e..09fe5e97714a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -24,6 +24,7 @@ #include <linux/export.h> #include <linux/mm_inline.h> #include <linux/percpu_counter.h> +#include <linux/memremap.h> #include <linux/percpu.h> #include <linux/cpu.h> #include <linux/notifier.h> |