diff options
Diffstat (limited to 'mm/memory.c')
-rw-r--r-- | mm/memory.c | 178 |
1 files changed, 135 insertions, 43 deletions
diff --git a/mm/memory.c b/mm/memory.c index 4126dd16778c..f46ac18ba231 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1310,8 +1310,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; + ret = handle_mm_fault(mm, vma, start, - foll_flags & FOLL_WRITE); + (foll_flags & FOLL_WRITE) ? + FAULT_FLAG_WRITE : 0); + if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; @@ -1360,6 +1363,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i; } +/** + * get_user_pages() - pin user pages in memory + * @tsk: task_struct of target task + * @mm: mm_struct of target mm + * @start: starting user address + * @len: number of pages from start to pin + * @write: whether pages will be written to by the caller + * @force: whether to force write access even if user mapping is + * readonly. This will result in the page being COWed even + * in MAP_SHARED mappings. You do not want this. + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If len is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If write=0, the page must not be written to. If the page is written to, + * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called + * after the page is finished with, and before put_page is called. + * + * get_user_pages is typically used for fewer-copy IO operations, to get a + * handle on the memory by some means other than accesses via the user virtual + * addresses. The pages may be submitted for DMA to devices or accessed via + * their kernel linear mapping (via the kmap APIs). Care should be taken to + * use the correct cache flushing APIs. + * + * See also get_user_pages_fast, for performance critical applications. + */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) @@ -2446,7 +2499,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) */ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access, pte_t orig_pte) + unsigned int flags, pte_t orig_pte) { spinlock_t *ptl; struct page *page; @@ -2466,7 +2519,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { - grab_swap_token(); /* Contend for token _before_ read-in */ + grab_swap_token(mm); /* Contend for token _before_ read-in */ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, address); if (!page) { @@ -2522,9 +2575,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter(mm, anon_rss); pte = mk_pte(page, vma->vm_page_prot); - if (write_access && reuse_swap_page(page)) { + if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); - write_access = 0; + flags &= ~FAULT_FLAG_WRITE; } flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); @@ -2537,7 +2590,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, try_to_free_swap(page); unlock_page(page); - if (write_access) { + if (flags & FAULT_FLAG_WRITE) { ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; @@ -2566,7 +2619,7 @@ out_page: */ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access) + unsigned int flags) { struct page *page; spinlock_t *ptl; @@ -2726,7 +2779,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, * due to the bad i386 page protection. But it's valid * for other architectures too. * - * Note that if write_access is true, we either now have + * Note that if FAULT_FLAG_WRITE is set, we either now have * an exclusive copy of the page, or this is a shared mapping, * so we can make it writable and dirty to avoid having to * handle that later. @@ -2797,11 +2850,10 @@ unwritable_page: static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access, pte_t orig_pte) + unsigned int flags, pte_t orig_pte) { pgoff_t pgoff = (((address & PAGE_MASK) - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); pte_unmap(page_table); return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); @@ -2818,12 +2870,12 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access, pte_t orig_pte) + unsigned int flags, pte_t orig_pte) { - unsigned int flags = FAULT_FLAG_NONLINEAR | - (write_access ? FAULT_FLAG_WRITE : 0); pgoff_t pgoff; + flags |= FAULT_FLAG_NONLINEAR; + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) return 0; @@ -2854,7 +2906,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ static inline int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, - pte_t *pte, pmd_t *pmd, int write_access) + pte_t *pte, pmd_t *pmd, unsigned int flags) { pte_t entry; spinlock_t *ptl; @@ -2865,30 +2917,30 @@ static inline int handle_pte_fault(struct mm_struct *mm, if (vma->vm_ops) { if (likely(vma->vm_ops->fault)) return do_linear_fault(mm, vma, address, - pte, pmd, write_access, entry); + pte, pmd, flags, entry); } return do_anonymous_page(mm, vma, address, - pte, pmd, write_access); + pte, pmd, flags); } if (pte_file(entry)) return do_nonlinear_fault(mm, vma, address, - pte, pmd, write_access, entry); + pte, pmd, flags, entry); return do_swap_page(mm, vma, address, - pte, pmd, write_access, entry); + pte, pmd, flags, entry); } ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (unlikely(!pte_same(*pte, entry))) goto unlock; - if (write_access) { + if (flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) return do_wp_page(mm, vma, address, pte, pmd, ptl, entry); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { + if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { update_mmu_cache(vma, address, entry); } else { /* @@ -2897,7 +2949,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, * This still avoids useless tlb flushes for .text page faults * with threads. */ - if (write_access) + if (flags & FAULT_FLAG_WRITE) flush_tlb_page(vma, address); } unlock: @@ -2909,7 +2961,7 @@ unlock: * By the time we get here, we already hold the mm semaphore */ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access) + unsigned long address, unsigned int flags) { pgd_t *pgd; pud_t *pud; @@ -2921,7 +2973,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(PGFAULT); if (unlikely(is_vm_hugetlb_page(vma))) - return hugetlb_fault(mm, vma, address, write_access); + return hugetlb_fault(mm, vma, address, flags); pgd = pgd_offset(mm, address); pud = pud_alloc(mm, pgd, address); @@ -2934,7 +2986,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte) return VM_FAULT_OOM; - return handle_pte_fault(mm, vma, address, pte, pmd, write_access); + return handle_pte_fault(mm, vma, address, pte, pmd, flags); } #ifndef __PAGETABLE_PUD_FOLDED @@ -3053,22 +3105,13 @@ int in_gate_area_no_task(unsigned long addr) #endif /* __HAVE_ARCH_GATE_AREA */ -#ifdef CONFIG_HAVE_IOREMAP_PROT -int follow_phys(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned long *prot, resource_size_t *phys) +static int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; - pte_t *ptep, pte; - spinlock_t *ptl; - resource_size_t phys_addr = 0; - struct mm_struct *mm = vma->vm_mm; - int ret = -EINVAL; - - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - goto out; + pte_t *ptep; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) @@ -3086,22 +3129,71 @@ int follow_phys(struct vm_area_struct *vma, if (pmd_huge(*pmd)) goto out; - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + ptep = pte_offset_map_lock(mm, pmd, address, ptlp); if (!ptep) goto out; + if (!pte_present(*ptep)) + goto unlock; + *ptepp = ptep; + return 0; +unlock: + pte_unmap_unlock(ptep, *ptlp); +out: + return -EINVAL; +} +/** + * follow_pfn - look up PFN at a user virtual address + * @vma: memory mapping + * @address: user virtual address + * @pfn: location to store found PFN + * + * Only IO mappings and raw PFN mappings are allowed. + * + * Returns zero and the pfn at @pfn on success, -ve otherwise. + */ +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn) +{ + int ret = -EINVAL; + spinlock_t *ptl; + pte_t *ptep; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return ret; + + ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); + if (ret) + return ret; + *pfn = pte_pfn(*ptep); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(follow_pfn); + +#ifdef CONFIG_HAVE_IOREMAP_PROT +int follow_phys(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned long *prot, resource_size_t *phys) +{ + int ret = -EINVAL; + pte_t *ptep, pte; + spinlock_t *ptl; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; + + if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) + goto out; pte = *ptep; - if (!pte_present(pte)) - goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; - phys_addr = pte_pfn(pte); - phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ *prot = pgprot_val(pte_pgprot(pte)); - *phys = phys_addr; - ret = 0; + *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; + ret = 0; unlock: pte_unmap_unlock(ptep, ptl); out: |