diff options
Diffstat (limited to 'mm/memory.c')
-rw-r--r-- | mm/memory.c | 158 |
1 files changed, 97 insertions, 61 deletions
diff --git a/mm/memory.c b/mm/memory.c index 6953d3926e01..a56e3ba816b2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -305,6 +305,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return 0; + batch = tlb->active; } VM_BUG_ON(batch->nr > batch->max); @@ -1112,11 +1113,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, int force_flush = 0; int rss[NR_MM_COUNTERS]; spinlock_t *ptl; + pte_t *start_pte; pte_t *pte; again: init_rss_vec(rss); - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte = start_pte; arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; @@ -1196,7 +1199,7 @@ again: add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(pte - 1, ptl); + pte_unmap_unlock(start_pte, ptl); /* * mmu_gather ran out of room to batch pages, we break out of @@ -1287,16 +1290,9 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, return addr; } -#ifdef CONFIG_PREEMPT -# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) -#else -/* No preempt: go for improved straight-line efficiency */ -# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) -#endif - /** * unmap_vmas - unmap a range of memory covered by a list of vma's - * @tlbp: address of the caller's struct mmu_gather + * @tlb: address of the caller's struct mmu_gather * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping @@ -1307,10 +1303,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, * * Unmap all pages in the vma list. * - * We aim to not hold locks for too long (for scheduling latency reasons). - * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to - * return the ending mmu_gather to the caller. - * * Only addresses between `start' and `end' will be unmapped. * * The VMA list must be sorted in ascending virtual address order. @@ -1813,7 +1805,63 @@ next_page: } EXPORT_SYMBOL(__get_user_pages); -/** +/* + * fixup_user_fault() - manually resolve a user page fault + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @address: user address + * @fault_flags:flags to pass down to handle_mm_fault() + * + * This is meant to be called in the specific scenario where for locking reasons + * we try to access user memory in atomic context (within a pagefault_disable() + * section), this returns -EFAULT, and we want to resolve the user fault before + * trying again. + * + * Typically this is meant to be used by the futex code. + * + * The main difference with get_user_pages() is that this function will + * unconditionally call handle_mm_fault() which will in turn perform all the + * necessary SW fixup of the dirty and young bits in the PTE, while + * handle_mm_fault() only guarantees to update these in the struct page. + * + * This is important for some architectures where those bits also gate the + * access permission to the page because they are maintained in software. On + * such architectures, gup() will not be enough to make a subsequent access + * succeed. + * + * This should be called with the mm_sem held for read. + */ +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, + unsigned long address, unsigned int fault_flags) +{ + struct vm_area_struct *vma; + int ret; + + vma = find_extend_vma(mm, address); + if (!vma || address < vma->vm_start) + return -EFAULT; + + ret = handle_mm_fault(mm, vma, address, fault_flags); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return -ENOMEM; + if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) + return -EHWPOISON; + if (ret & VM_FAULT_SIGBUS) + return -EFAULT; + BUG(); + } + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + return 0; +} + +/* * get_user_pages() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. @@ -2796,30 +2844,6 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) -{ - struct address_space *mapping = inode->i_mapping; - - /* - * If the underlying filesystem is not going to provide - * a way to truncate a range of blocks (punch a hole) - - * we should return failure right now. - */ - if (!inode->i_op->truncate_range) - return -ENOSYS; - - mutex_lock(&inode->i_mutex); - down_write(&inode->i_alloc_sem); - unmap_mapping_range(mapping, offset, (end - offset), 1); - truncate_inode_pages_range(mapping, offset, end); - unmap_mapping_range(mapping, offset, (end - offset), 1); - inode->i_op->truncate_range(inode, offset, end); - up_write(&inode->i_alloc_sem); - mutex_unlock(&inode->i_mutex); - - return 0; -} - /* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -3125,14 +3149,34 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *page_table; spinlock_t *ptl; struct page *page; + struct page *cow_page; pte_t entry; int anon = 0; - int charged = 0; struct page *dirty_page = NULL; struct vm_fault vmf; int ret; int page_mkwrite = 0; + /* + * If we do COW later, allocate page befor taking lock_page() + * on the file cache page. This will reduce lock holding time. + */ + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + + cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!cow_page) + return VM_FAULT_OOM; + + if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) { + page_cache_release(cow_page); + return VM_FAULT_OOM; + } + } else + cow_page = NULL; + vmf.virtual_address = (void __user *)(address & PAGE_MASK); vmf.pgoff = pgoff; vmf.flags = flags; @@ -3141,12 +3185,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = vma->vm_ops->fault(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - return ret; + goto uncharge_out; if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) unlock_page(vmf.page); - return VM_FAULT_HWPOISON; + ret = VM_FAULT_HWPOISON; + goto uncharge_out; } /* @@ -3164,23 +3209,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, page = vmf.page; if (flags & FAULT_FLAG_WRITE) { if (!(vma->vm_flags & VM_SHARED)) { + page = cow_page; anon = 1; - if (unlikely(anon_vma_prepare(vma))) { - ret = VM_FAULT_OOM; - goto out; - } - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, - vma, address); - if (!page) { - ret = VM_FAULT_OOM; - goto out; - } - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { - ret = VM_FAULT_OOM; - page_cache_release(page); - goto out; - } - charged = 1; copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else { @@ -3249,8 +3279,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, address, page_table); } else { - if (charged) - mem_cgroup_uncharge_page(page); + if (cow_page) + mem_cgroup_uncharge_page(cow_page); if (anon) page_cache_release(page); else @@ -3259,7 +3289,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(page_table, ptl); -out: if (dirty_page) { struct address_space *mapping = page->mapping; @@ -3289,6 +3318,13 @@ out: unwritable_page: page_cache_release(page); return ret; +uncharge_out: + /* fs's fault handler get error */ + if (cow_page) { + mem_cgroup_uncharge_page(cow_page); + page_cache_release(cow_page); + } + return ret; } static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, |