diff options
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 194 |
1 files changed, 141 insertions, 53 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3e52df7c471b..b21d78c941b5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -11,6 +11,9 @@ #include <linux/highmem.h> #include <linux/nodemask.h> #include <linux/pagemap.h> +#include <linux/mempolicy.h> +#include <linux/cpuset.h> + #include <asm/page.h> #include <asm/pgtable.h> @@ -36,18 +39,22 @@ static void enqueue_huge_page(struct page *page) free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page(void) +static struct page *dequeue_huge_page(struct vm_area_struct *vma, + unsigned long address) { int nid = numa_node_id(); struct page *page = NULL; + struct zonelist *zonelist = huge_zonelist(vma, address); + struct zone **z; - if (list_empty(&hugepage_freelists[nid])) { - for (nid = 0; nid < MAX_NUMNODES; ++nid) - if (!list_empty(&hugepage_freelists[nid])) - break; + for (z = zonelist->zones; *z; z++) { + nid = (*z)->zone_pgdat->node_id; + if (cpuset_zone_allowed(*z, GFP_HIGHUSER) && + !list_empty(&hugepage_freelists[nid])) + break; } - if (nid >= 0 && nid < MAX_NUMNODES && - !list_empty(&hugepage_freelists[nid])) { + + if (*z) { page = list_entry(hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); @@ -85,13 +92,13 @@ void free_huge_page(struct page *page) spin_unlock(&hugetlb_lock); } -struct page *alloc_huge_page(void) +struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) { struct page *page; int i; spin_lock(&hugetlb_lock); - page = dequeue_huge_page(); + page = dequeue_huge_page(vma, addr); if (!page) { spin_unlock(&hugetlb_lock); return NULL; @@ -194,7 +201,7 @@ static unsigned long set_max_huge_pages(unsigned long count) spin_lock(&hugetlb_lock); try_to_free_low(count); while (count < nr_huge_pages) { - struct page *page = dequeue_huge_page(); + struct page *page = dequeue_huge_page(NULL, 0); if (!page) break; update_and_free_page(page); @@ -261,11 +268,12 @@ struct vm_operations_struct hugetlb_vm_ops = { .nopage = hugetlb_nopage, }; -static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) +static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, + int writable) { pte_t entry; - if (vma->vm_flags & VM_WRITE) { + if (writable) { entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); } else { @@ -277,12 +285,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) return entry; } +static void set_huge_ptep_writable(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + pte_t entry; + + entry = pte_mkwrite(pte_mkdirty(*ptep)); + ptep_set_access_flags(vma, address, ptep, entry, 1); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); +} + + int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) { pte_t *src_pte, *dst_pte, entry; struct page *ptepage; unsigned long addr; + int cow; + + cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { src_pte = huge_pte_offset(src, addr); @@ -294,6 +317,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, spin_lock(&dst->page_table_lock); spin_lock(&src->page_table_lock); if (!pte_none(*src_pte)) { + if (cow) + ptep_set_wrprotect(src, addr, src_pte); entry = *src_pte; ptepage = pte_page(entry); get_page(ptepage); @@ -345,57 +370,63 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_range(vma, start, end); } -static struct page *find_lock_huge_page(struct address_space *mapping, - unsigned long idx) +static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, pte_t pte) { - struct page *page; - int err; - struct inode *inode = mapping->host; - unsigned long size; + struct page *old_page, *new_page; + int i, avoidcopy; -retry: - page = find_lock_page(mapping, idx); - if (page) - goto out; + old_page = pte_page(pte); - /* Check to make sure the mapping hasn't been truncated */ - size = i_size_read(inode) >> HPAGE_SHIFT; - if (idx >= size) - goto out; + /* If no-one else is actually using this page, avoid the copy + * and just make the page writable */ + avoidcopy = (page_count(old_page) == 1); + if (avoidcopy) { + set_huge_ptep_writable(vma, address, ptep); + return VM_FAULT_MINOR; + } - if (hugetlb_get_quota(mapping)) - goto out; - page = alloc_huge_page(); - if (!page) { - hugetlb_put_quota(mapping); - goto out; + page_cache_get(old_page); + new_page = alloc_huge_page(vma, address); + + if (!new_page) { + page_cache_release(old_page); + + /* Logically this is OOM, not a SIGBUS, but an OOM + * could cause the kernel to go killing other + * processes which won't help the hugepage situation + * at all (?) */ + return VM_FAULT_SIGBUS; } - err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); - if (err) { - put_page(page); - hugetlb_put_quota(mapping); - if (err == -EEXIST) - goto retry; - page = NULL; + spin_unlock(&mm->page_table_lock); + for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) + copy_user_highpage(new_page + i, old_page + i, + address + i*PAGE_SIZE); + spin_lock(&mm->page_table_lock); + + ptep = huge_pte_offset(mm, address & HPAGE_MASK); + if (likely(pte_same(*ptep, pte))) { + /* Break COW */ + set_huge_pte_at(mm, address, ptep, + make_huge_pte(vma, new_page, 1)); + /* Make the old page be freed below */ + new_page = old_page; } -out: - return page; + page_cache_release(new_page); + page_cache_release(old_page); + return VM_FAULT_MINOR; } -int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access) +int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, int write_access) { int ret = VM_FAULT_SIGBUS; unsigned long idx; unsigned long size; - pte_t *pte; struct page *page; struct address_space *mapping; - - pte = huge_pte_alloc(mm, address); - if (!pte) - goto out; + pte_t new_pte; mapping = vma->vm_file->f_mapping; idx = ((address - vma->vm_start) >> HPAGE_SHIFT) @@ -405,9 +436,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * Use page lock to guard against racing truncation * before we get page_table_lock. */ - page = find_lock_huge_page(mapping, idx); - if (!page) - goto out; +retry: + page = find_lock_page(mapping, idx); + if (!page) { + if (hugetlb_get_quota(mapping)) + goto out; + page = alloc_huge_page(vma, address); + if (!page) { + hugetlb_put_quota(mapping); + goto out; + } + + if (vma->vm_flags & VM_SHARED) { + int err; + + err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); + if (err) { + put_page(page); + hugetlb_put_quota(mapping); + if (err == -EEXIST) + goto retry; + goto out; + } + } else + lock_page(page); + } spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> HPAGE_SHIFT; @@ -415,11 +468,19 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto backout; ret = VM_FAULT_MINOR; - if (!pte_none(*pte)) + if (!pte_none(*ptep)) goto backout; add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); - set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page)); + new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) + && (vma->vm_flags & VM_SHARED))); + set_huge_pte_at(mm, address, ptep, new_pte); + + if (write_access && !(vma->vm_flags & VM_SHARED)) { + /* Optimization, do the COW without a second fault */ + ret = hugetlb_cow(mm, vma, address, ptep, new_pte); + } + spin_unlock(&mm->page_table_lock); unlock_page(page); out: @@ -433,6 +494,33 @@ backout: goto out; } +int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, int write_access) +{ + pte_t *ptep; + pte_t entry; + int ret; + + ptep = huge_pte_alloc(mm, address); + if (!ptep) + return VM_FAULT_OOM; + + entry = *ptep; + if (pte_none(entry)) + return hugetlb_no_page(mm, vma, address, ptep, write_access); + + ret = VM_FAULT_MINOR; + + spin_lock(&mm->page_table_lock); + /* Check for a racing update before calling hugetlb_cow */ + if (likely(pte_same(entry, *ptep))) + if (write_access && !pte_write(entry)) + ret = hugetlb_cow(mm, vma, address, ptep, entry); + spin_unlock(&mm->page_table_lock); + + return ret; +} + int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i) |