diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 2 | ||||
-rw-r--r-- | mm/bootmem.c | 60 | ||||
-rw-r--r-- | mm/filemap.c | 78 | ||||
-rw-r--r-- | mm/hugetlb.c | 192 | ||||
-rw-r--r-- | mm/internal.h | 21 | ||||
-rw-r--r-- | mm/madvise.c | 35 | ||||
-rw-r--r-- | mm/memory.c | 104 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 3 | ||||
-rw-r--r-- | mm/mempolicy.c | 106 | ||||
-rw-r--r-- | mm/mmap.c | 2 | ||||
-rw-r--r-- | mm/mremap.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 7 | ||||
-rw-r--r-- | mm/page-writeback.c | 10 | ||||
-rw-r--r-- | mm/page_alloc.c | 345 | ||||
-rw-r--r-- | mm/readahead.c | 15 | ||||
-rw-r--r-- | mm/rmap.c | 57 | ||||
-rw-r--r-- | mm/shmem.c | 36 | ||||
-rw-r--r-- | mm/swap.c | 27 | ||||
-rw-r--r-- | mm/swap_state.c | 4 | ||||
-rw-r--r-- | mm/swapfile.c | 20 | ||||
-rw-r--r-- | mm/tiny-shmem.c | 29 | ||||
-rw-r--r-- | mm/truncate.c | 44 | ||||
-rw-r--r-- | mm/vmscan.c | 125 |
23 files changed, 771 insertions, 553 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 21eb51d4da8f..b3db11f137e0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -11,7 +11,7 @@ choice config FLATMEM_MANUAL bool "Flat Memory" - depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE + depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE help This option allows you to change some of the ways that Linux manages its memory internally. Most users will diff --git a/mm/bootmem.c b/mm/bootmem.c index e8c567177dcf..35c32290f717 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -204,6 +204,8 @@ restart_scan: unsigned long j; i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i); i = ALIGN(i, incr); + if (i >= eidx) + break; if (test_bit(i, bdata->node_bootmem_map)) continue; for (j = i + 1; j < i + areasize; ++j) { @@ -294,20 +296,12 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) unsigned long v = ~map[i / BITS_PER_LONG]; if (gofast && v == ~0UL) { - int j, order; + int order; page = pfn_to_page(pfn); count += BITS_PER_LONG; - __ClearPageReserved(page); order = ffs(BITS_PER_LONG) - 1; - set_page_refs(page, order); - for (j = 1; j < BITS_PER_LONG; j++) { - if (j + 16 < BITS_PER_LONG) - prefetchw(page + j + 16); - __ClearPageReserved(page + j); - set_page_count(page + j, 0); - } - __free_pages(page, order); + __free_pages_bootmem(page, order); i += BITS_PER_LONG; page += BITS_PER_LONG; } else if (v) { @@ -317,9 +311,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) for (m = 1; m && i < idx; m<<=1, page++, i++) { if (v & m) { count++; - __ClearPageReserved(page); - set_page_refs(page, 0); - __free_page(page); + __free_pages_bootmem(page, 0); } } } else { @@ -337,9 +329,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) count = 0; for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { count++; - __ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); + __free_pages_bootmem(page, 0); } total += count; bdata->node_bootmem_map = NULL; @@ -391,15 +381,14 @@ unsigned long __init free_all_bootmem (void) return(free_all_bootmem_core(NODE_DATA(0))); } -void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal, - unsigned long limit) +void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) { pg_data_t *pgdat = pgdat_list; void *ptr; for_each_pgdat(pgdat) if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, - align, goal, limit))) + align, goal, 0))) return(ptr); /* @@ -411,15 +400,40 @@ void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, un } -void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) +void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, + unsigned long goal) { void *ptr; - ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, limit); + ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); if (ptr) return (ptr); - return __alloc_bootmem_limit(size, align, goal, limit); + return __alloc_bootmem(size, align, goal); +} + +#define LOW32LIMIT 0xffffffff + +void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) +{ + pg_data_t *pgdat = pgdat_list; + void *ptr; + + for_each_pgdat(pgdat) + if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, + align, goal, LOW32LIMIT))) + return(ptr); + + /* + * Whoops, we cannot satisfy the allocation request. + */ + printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size); + panic("Out of low memory"); + return NULL; } +void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT); +} diff --git a/mm/filemap.c b/mm/filemap.c index 33a28bfde158..4ef24a397684 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -555,11 +555,12 @@ repeat: page_cache_get(page); if (TestSetPageLocked(page)) { read_unlock_irq(&mapping->tree_lock); - lock_page(page); + __lock_page(page); read_lock_irq(&mapping->tree_lock); /* Has the page been truncated while we slept? */ - if (page->mapping != mapping || page->index != offset) { + if (unlikely(page->mapping != mapping || + page->index != offset)) { unlock_page(page); page_cache_release(page); goto repeat; @@ -831,8 +832,13 @@ readpage: /* Start the actual read. The read will unlock the page. */ error = mapping->a_ops->readpage(filp, page); - if (unlikely(error)) + if (unlikely(error)) { + if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto find_page; + } goto readpage_error; + } if (!PageUptodate(page)) { lock_page(page); @@ -1152,26 +1158,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset) { struct address_space *mapping = file->f_mapping; struct page *page; - int error; + int ret; - page = page_cache_alloc_cold(mapping); - if (!page) - return -ENOMEM; + do { + page = page_cache_alloc_cold(mapping); + if (!page) + return -ENOMEM; + + ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); + if (ret == 0) + ret = mapping->a_ops->readpage(file, page); + else if (ret == -EEXIST) + ret = 0; /* losing race to add is OK */ - error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); - if (!error) { - error = mapping->a_ops->readpage(file, page); page_cache_release(page); - return error; - } - /* - * We arrive here in the unlikely event that someone - * raced with us and added our page to the cache first - * or we are out of memory for radix-tree nodes. - */ - page_cache_release(page); - return error == -EEXIST ? 0 : error; + } while (ret == AOP_TRUNCATED_PAGE); + + return ret; } #define MMAP_LOTSAMISS (100) @@ -1331,10 +1335,14 @@ page_not_uptodate: goto success; } - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1358,10 +1366,14 @@ page_not_uptodate: goto success; } ClearPageError(page); - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1444,10 +1456,14 @@ page_not_uptodate: goto success; } - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1470,10 +1486,14 @@ page_not_uptodate: } ClearPageError(page); - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1934,12 +1954,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, status = a_ops->prepare_write(file, page, offset, offset+bytes); if (unlikely(status)) { loff_t isize = i_size_read(inode); + + if (status != AOP_TRUNCATED_PAGE) + unlock_page(page); + page_cache_release(page); + if (status == AOP_TRUNCATED_PAGE) + continue; /* * prepare_write() may have instantiated a few blocks * outside i_size. Trim these off again. */ - unlock_page(page); - page_cache_release(page); if (pos + bytes > isize) vmtruncate(inode, isize); break; @@ -1952,6 +1976,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, cur_iov, iov_base, bytes); flush_dcache_page(page); status = a_ops->commit_write(file, page, offset, offset+bytes); + if (status == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + continue; + } if (likely(copied > 0)) { if (!status) status = copied; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3e52df7c471b..f4c43d7980ba 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -11,6 +11,8 @@ #include <linux/highmem.h> #include <linux/nodemask.h> #include <linux/pagemap.h> +#include <linux/mempolicy.h> + #include <asm/page.h> #include <asm/pgtable.h> @@ -36,18 +38,21 @@ static void enqueue_huge_page(struct page *page) free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page(void) +static struct page *dequeue_huge_page(struct vm_area_struct *vma, + unsigned long address) { int nid = numa_node_id(); struct page *page = NULL; + struct zonelist *zonelist = huge_zonelist(vma, address); + struct zone **z; - if (list_empty(&hugepage_freelists[nid])) { - for (nid = 0; nid < MAX_NUMNODES; ++nid) - if (!list_empty(&hugepage_freelists[nid])) - break; + for (z = zonelist->zones; *z; z++) { + nid = (*z)->zone_pgdat->node_id; + if (!list_empty(&hugepage_freelists[nid])) + break; } - if (nid >= 0 && nid < MAX_NUMNODES && - !list_empty(&hugepage_freelists[nid])) { + + if (*z) { page = list_entry(hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); @@ -85,13 +90,13 @@ void free_huge_page(struct page *page) spin_unlock(&hugetlb_lock); } -struct page *alloc_huge_page(void) +struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) { struct page *page; int i; spin_lock(&hugetlb_lock); - page = dequeue_huge_page(); + page = dequeue_huge_page(vma, addr); if (!page) { spin_unlock(&hugetlb_lock); return NULL; @@ -194,7 +199,7 @@ static unsigned long set_max_huge_pages(unsigned long count) spin_lock(&hugetlb_lock); try_to_free_low(count); while (count < nr_huge_pages) { - struct page *page = dequeue_huge_page(); + struct page *page = dequeue_huge_page(NULL, 0); if (!page) break; update_and_free_page(page); @@ -261,11 +266,12 @@ struct vm_operations_struct hugetlb_vm_ops = { .nopage = hugetlb_nopage, }; -static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) +static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, + int writable) { pte_t entry; - if (vma->vm_flags & VM_WRITE) { + if (writable) { entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); } else { @@ -277,12 +283,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) return entry; } +static void set_huge_ptep_writable(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + pte_t entry; + + entry = pte_mkwrite(pte_mkdirty(*ptep)); + ptep_set_access_flags(vma, address, ptep, entry, 1); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); +} + + int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) { pte_t *src_pte, *dst_pte, entry; struct page *ptepage; unsigned long addr; + int cow; + + cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { src_pte = huge_pte_offset(src, addr); @@ -294,6 +315,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, spin_lock(&dst->page_table_lock); spin_lock(&src->page_table_lock); if (!pte_none(*src_pte)) { + if (cow) + ptep_set_wrprotect(src, addr, src_pte); entry = *src_pte; ptepage = pte_page(entry); get_page(ptepage); @@ -345,57 +368,63 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_range(vma, start, end); } -static struct page *find_lock_huge_page(struct address_space *mapping, - unsigned long idx) +static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, pte_t pte) { - struct page *page; - int err; - struct inode *inode = mapping->host; - unsigned long size; + struct page *old_page, *new_page; + int i, avoidcopy; -retry: - page = find_lock_page(mapping, idx); - if (page) - goto out; + old_page = pte_page(pte); - /* Check to make sure the mapping hasn't been truncated */ - size = i_size_read(inode) >> HPAGE_SHIFT; - if (idx >= size) - goto out; + /* If no-one else is actually using this page, avoid the copy + * and just make the page writable */ + avoidcopy = (page_count(old_page) == 1); + if (avoidcopy) { + set_huge_ptep_writable(vma, address, ptep); + return VM_FAULT_MINOR; + } - if (hugetlb_get_quota(mapping)) - goto out; - page = alloc_huge_page(); - if (!page) { - hugetlb_put_quota(mapping); - goto out; + page_cache_get(old_page); + new_page = alloc_huge_page(vma, address); + + if (!new_page) { + page_cache_release(old_page); + + /* Logically this is OOM, not a SIGBUS, but an OOM + * could cause the kernel to go killing other + * processes which won't help the hugepage situation + * at all (?) */ + return VM_FAULT_SIGBUS; } - err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); - if (err) { - put_page(page); - hugetlb_put_quota(mapping); - if (err == -EEXIST) - goto retry; - page = NULL; + spin_unlock(&mm->page_table_lock); + for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) + copy_user_highpage(new_page + i, old_page + i, + address + i*PAGE_SIZE); + spin_lock(&mm->page_table_lock); + + ptep = huge_pte_offset(mm, address & HPAGE_MASK); + if (likely(pte_same(*ptep, pte))) { + /* Break COW */ + set_huge_pte_at(mm, address, ptep, + make_huge_pte(vma, new_page, 1)); + /* Make the old page be freed below */ + new_page = old_page; } -out: - return page; + page_cache_release(new_page); + page_cache_release(old_page); + return VM_FAULT_MINOR; } -int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access) +int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, int write_access) { int ret = VM_FAULT_SIGBUS; unsigned long idx; unsigned long size; - pte_t *pte; struct page *page; struct address_space *mapping; - - pte = huge_pte_alloc(mm, address); - if (!pte) - goto out; + pte_t new_pte; mapping = vma->vm_file->f_mapping; idx = ((address - vma->vm_start) >> HPAGE_SHIFT) @@ -405,9 +434,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * Use page lock to guard against racing truncation * before we get page_table_lock. */ - page = find_lock_huge_page(mapping, idx); - if (!page) - goto out; +retry: + page = find_lock_page(mapping, idx); + if (!page) { + if (hugetlb_get_quota(mapping)) + goto out; + page = alloc_huge_page(vma, address); + if (!page) { + hugetlb_put_quota(mapping); + goto out; + } + + if (vma->vm_flags & VM_SHARED) { + int err; + + err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); + if (err) { + put_page(page); + hugetlb_put_quota(mapping); + if (err == -EEXIST) + goto retry; + goto out; + } + } else + lock_page(page); + } spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> HPAGE_SHIFT; @@ -415,11 +466,19 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto backout; ret = VM_FAULT_MINOR; - if (!pte_none(*pte)) + if (!pte_none(*ptep)) goto backout; add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); - set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page)); + new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) + && (vma->vm_flags & VM_SHARED))); + set_huge_pte_at(mm, address, ptep, new_pte); + + if (write_access && !(vma->vm_flags & VM_SHARED)) { + /* Optimization, do the COW without a second fault */ + ret = hugetlb_cow(mm, vma, address, ptep, new_pte); + } + spin_unlock(&mm->page_table_lock); unlock_page(page); out: @@ -433,6 +492,33 @@ backout: goto out; } +int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, int write_access) +{ + pte_t *ptep; + pte_t entry; + int ret; + + ptep = huge_pte_alloc(mm, address); + if (!ptep) + return VM_FAULT_OOM; + + entry = *ptep; + if (pte_none(entry)) + return hugetlb_no_page(mm, vma, address, ptep, write_access); + + ret = VM_FAULT_MINOR; + + spin_lock(&mm->page_table_lock); + /* Check for a racing update before calling hugetlb_cow */ + if (likely(pte_same(entry, *ptep))) + if (write_access && !pte_write(entry)) + ret = hugetlb_cow(mm, vma, address, ptep, entry); + spin_unlock(&mm->page_table_lock); + + return ret; +} + int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i) diff --git a/mm/internal.h b/mm/internal.h index 6bf134e8fb3d..17256bb2f4ef 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -9,5 +9,22 @@ * 2 of the License, or (at your option) any later version. */ -/* page_alloc.c */ -extern void set_page_refs(struct page *page, int order); +static inline void set_page_refs(struct page *page, int order) +{ +#ifdef CONFIG_MMU + set_page_count(page, 1); +#else + int i; + + /* + * We need to reference all the pages for this order, otherwise if + * anyone accesses one of the pages with (get/put) it will be freed. + * - eg: access_process_vm() + */ + for (i = 0; i < (1 << order); i++) + set_page_count(page + i, 1); +#endif /* CONFIG_MMU */ +} + +extern void fastcall __init __free_pages_bootmem(struct page *page, + unsigned int order); diff --git a/mm/madvise.c b/mm/madvise.c index 2b7cf0400a21..ae0ae3ea299a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -140,6 +140,36 @@ static long madvise_dontneed(struct vm_area_struct * vma, return 0; } +/* + * Application wants to free up the pages and associated backing store. + * This is effectively punching a hole into the middle of a file. + * + * NOTE: Currently, only shmfs/tmpfs is supported for this operation. + * Other filesystems return -ENOSYS. + */ +static long madvise_remove(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct address_space *mapping; + loff_t offset, endoff; + + if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) + return -EINVAL; + + if (!vma->vm_file || !vma->vm_file->f_mapping + || !vma->vm_file->f_mapping->host) { + return -EINVAL; + } + + mapping = vma->vm_file->f_mapping; + + offset = (loff_t)(start - vma->vm_start) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + endoff = (loff_t)(end - vma->vm_start - 1) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + return vmtruncate_range(mapping->host, offset, endoff); +} + static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) @@ -152,6 +182,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, case MADV_RANDOM: error = madvise_behavior(vma, prev, start, end, behavior); break; + case MADV_REMOVE: + error = madvise_remove(vma, start, end); + break; case MADV_WILLNEED: error = madvise_willneed(vma, prev, start, end); @@ -190,6 +223,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, * some pages ahead. * MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. + * MADV_REMOVE - the application wants to free up the given range of + * pages and associated backing store. * * return values: * zero - success diff --git a/mm/memory.c b/mm/memory.c index aa8af0e20269..7197f9bcd384 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -349,6 +349,11 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) dump_stack(); } +static inline int is_cow_mapping(unsigned int flags) +{ + return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; +} + /* * This function gets the "struct page" associated with a pte. * @@ -377,6 +382,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; } /* @@ -437,7 +444,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * If it's a COW mapping, write protect it both * in the parent and the child */ - if ((vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) { + if (is_cow_mapping(vm_flags)) { ptep_set_wrprotect(src_mm, addr, src_pte); pte = *src_pte; } @@ -567,7 +574,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ - if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP))) { + if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { if (!vma->anon_vma) return 0; } @@ -1002,7 +1009,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } - if (!vma || (vma->vm_flags & VM_IO) + if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) || !(vm_flags & vma->vm_flags)) return i ? : -EFAULT; @@ -1221,55 +1228,12 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page * return -EFAULT; if (!page_count(page)) return -EINVAL; + vma->vm_flags |= VM_INSERTPAGE; return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_page); /* - * Somebody does a pfn remapping that doesn't actually work as a vma. - * - * Do it as individual pages instead, and warn about it. It's bad form, - * and very inefficient. - */ -static int incomplete_pfn_remap(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long pfn, pgprot_t prot) -{ - static int warn = 10; - struct page *page; - int retval; - - if (!(vma->vm_flags & VM_INCOMPLETE)) { - if (warn) { - warn--; - printk("%s does an incomplete pfn remapping", current->comm); - dump_stack(); - } - } - vma->vm_flags |= VM_INCOMPLETE | VM_IO | VM_RESERVED; - - if (start < vma->vm_start || end > vma->vm_end) - return -EINVAL; - - if (!pfn_valid(pfn)) - return -EINVAL; - - page = pfn_to_page(pfn); - if (!PageReserved(page)) - return -EINVAL; - - retval = 0; - while (start < end) { - retval = insert_page(vma->vm_mm, start, page, prot); - if (retval < 0) - break; - start += PAGE_SIZE; - page++; - } - return retval; -} - -/* * maps a range of physical memory into the requested pages. the old * mappings are removed. any references to nonexistent pages results * in null mappings (currently treated as "copy-on-access") @@ -1343,9 +1307,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, struct mm_struct *mm = vma->vm_mm; int err; - if (addr != vma->vm_start || end != vma->vm_end) - return incomplete_pfn_remap(vma, addr, end, pfn, prot); - /* * Physically remapped pages are special. Tell the * rest of the world about it: @@ -1359,9 +1320,18 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * VM_PFNMAP tells the core MM that the base pages are just * raw PFN mappings, and do not have a "struct page" associated * with them. + * + * There's a horrible special case to handle copy-on-write + * behaviour that some programs depend on. We mark the "original" + * un-COW'ed pages by matching them up with "vma->vm_pgoff". */ + if (is_cow_mapping(vma->vm_flags)) { + if (addr != vma->vm_start || end != vma->vm_end) + return -EINVAL; + vma->vm_pgoff = pfn; + } + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; - vma->vm_pgoff = pfn; BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -1528,7 +1498,7 @@ gotten: update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); lru_cache_add_active(new_page); - page_add_anon_rmap(new_page, vma, address); + page_add_new_anon_rmap(new_page, vma, address); /* Free the old page.. */ new_page = old_page; @@ -1800,9 +1770,32 @@ out_big: out_busy: return -ETXTBSY; } - EXPORT_SYMBOL(vmtruncate); +int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + + /* + * If the underlying filesystem is not going to provide + * a way to truncate a range of blocks (punch a hole) - + * we should return failure right now. + */ + if (!inode->i_op || !inode->i_op->truncate_range) + return -ENOSYS; + + down(&inode->i_sem); + down_write(&inode->i_alloc_sem); + unmap_mapping_range(mapping, offset, (end - offset), 1); + truncate_inode_pages_range(mapping, offset, end); + inode->i_op->truncate_range(inode, offset, end); + up_write(&inode->i_alloc_sem); + up(&inode->i_sem); + + return 0; +} +EXPORT_SYMBOL(vmtruncate_range); + /* * Primitive swap readahead code. We simply read an aligned block of * (1 << page_cluster) entries in the swap area. This method is chosen @@ -1984,8 +1977,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, goto release; inc_mm_counter(mm, anon_rss); lru_cache_add_active(page); - SetPageReferenced(page); - page_add_anon_rmap(page, vma, address); + page_add_new_anon_rmap(page, vma, address); } else { /* Map the ZERO_PAGE - vm_page_prot is readonly */ page = ZERO_PAGE(address); @@ -2116,7 +2108,7 @@ retry: if (anon) { inc_mm_counter(mm, anon_rss); lru_cache_add_active(new_page); - page_add_anon_rmap(new_page, vma, address); + page_add_new_anon_rmap(new_page, vma, address); } else { inc_mm_counter(mm, file_rss); page_add_file_rmap(new_page); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 431a64f021c0..a918f77f02f3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -42,7 +42,6 @@ extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, int nr_pages); static int __add_section(struct zone *zone, unsigned long phys_start_pfn) { - struct pglist_data *pgdat = zone->zone_pgdat; int nr_pages = PAGES_PER_SECTION; int ret; @@ -104,7 +103,7 @@ static void grow_pgdat_span(struct pglist_data *pgdat, pgdat->node_start_pfn = start_pfn; if (end_pfn > old_pgdat_end_pfn) - pgdat->node_spanned_pages = end_pfn - pgdat->node_spanned_pages; + pgdat->node_spanned_pages = end_pfn - pgdat->node_start_pfn; } int online_pages(unsigned long pfn, unsigned long nr_pages) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index bec88c81244e..0f1d2b8a952b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -93,7 +93,7 @@ static kmem_cache_t *sn_cache; /* Highest zone. An specific allocation for a zone below that is not policied. */ -static int policy_zone; +int policy_zone = ZONE_DMA; struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ @@ -131,17 +131,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) if (!zl) return NULL; num = 0; - for_each_node_mask(nd, *nodes) { - int k; - for (k = MAX_NR_ZONES-1; k >= 0; k--) { - struct zone *z = &NODE_DATA(nd)->node_zones[k]; - if (!z->present_pages) - continue; - zl->zones[num++] = z; - if (k > policy_zone) - policy_zone = k; - } - } + for_each_node_mask(nd, *nodes) + zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone]; zl->zones[num] = NULL; return zl; } @@ -161,6 +152,10 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) switch (mode) { case MPOL_INTERLEAVE: policy->v.nodes = *nodes; + if (nodes_weight(*nodes) == 0) { + kmem_cache_free(policy_cache, policy); + return ERR_PTR(-EINVAL); + } break; case MPOL_PREFERRED: policy->v.preferred_node = first_node(*nodes); @@ -781,6 +776,34 @@ static unsigned offset_il_node(struct mempolicy *pol, return nid; } +/* Determine a node number for interleave */ +static inline unsigned interleave_nid(struct mempolicy *pol, + struct vm_area_struct *vma, unsigned long addr, int shift) +{ + if (vma) { + unsigned long off; + + off = vma->vm_pgoff; + off += (addr - vma->vm_start) >> shift; + return offset_il_node(pol, vma, off); + } else + return interleave_nodes(pol); +} + +/* Return a zonelist suitable for a huge page allocation. */ +struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = get_vma_policy(current, vma, addr); + + if (pol->policy == MPOL_INTERLEAVE) { + unsigned nid; + + nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); + return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER); + } + return zonelist_policy(GFP_HIGHUSER, pol); +} + /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. */ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, @@ -829,15 +852,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) if (unlikely(pol->policy == MPOL_INTERLEAVE)) { unsigned nid; - if (vma) { - unsigned long off; - off = vma->vm_pgoff; - off += (addr - vma->vm_start) >> PAGE_SHIFT; - nid = offset_il_node(pol, vma, off); - } else { - /* fall back to process interleaving */ - nid = interleave_nodes(pol); - } + + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); return alloc_page_interleave(gfp, 0, nid); } return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); @@ -936,54 +952,6 @@ void __mpol_free(struct mempolicy *p) } /* - * Hugetlb policy. Same as above, just works with node numbers instead of - * zonelists. - */ - -/* Find first node suitable for an allocation */ -int mpol_first_node(struct vm_area_struct *vma, unsigned long addr) -{ - struct mempolicy *pol = get_vma_policy(current, vma, addr); - - switch (pol->policy) { - case MPOL_DEFAULT: - return numa_node_id(); - case MPOL_BIND: - return pol->v.zonelist->zones[0]->zone_pgdat->node_id; - case MPOL_INTERLEAVE: - return interleave_nodes(pol); - case MPOL_PREFERRED: - return pol->v.preferred_node >= 0 ? - pol->v.preferred_node : numa_node_id(); - } - BUG(); - return 0; -} - -/* Find secondary valid nodes for an allocation */ -int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr) -{ - struct mempolicy *pol = get_vma_policy(current, vma, addr); - - switch (pol->policy) { - case MPOL_PREFERRED: - case MPOL_DEFAULT: - case MPOL_INTERLEAVE: - return 1; - case MPOL_BIND: { - struct zone **z; - for (z = pol->v.zonelist->zones; *z; z++) - if ((*z)->zone_pgdat->node_id == nid) - return 1; - return 0; - } - default: - BUG(); - return 0; - } -} - -/* * Shared memory backing store policy support. * * Remember policies even when nobody has shared memory mapped. diff --git a/mm/mmap.c b/mm/mmap.c index 11ca5927d5ff..64ba4dbcb7de 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -611,7 +611,7 @@ again: remove_next = 1 + (end > next->vm_end); * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. */ -#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED) +#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) diff --git a/mm/mremap.c b/mm/mremap.c index b535438c363c..ddaeee9a0b69 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -323,7 +323,7 @@ unsigned long do_mremap(unsigned long addr, /* We can't remap across vm area boundaries */ if (old_len > vma->vm_end - addr) goto out; - if (vma->vm_flags & VM_DONTEXPAND) { + if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { if (new_len > old_len) goto out; } diff --git a/mm/nommu.c b/mm/nommu.c index c1196812876b..c10262d68232 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1177,3 +1177,10 @@ int in_gate_area_no_task(unsigned long addr) { return 0; } + +struct page *filemap_nopage(struct vm_area_struct *area, + unsigned long address, int *type) +{ + BUG(); + return NULL; +} diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0166ea15c9ee..5240e426c1f7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -550,11 +550,17 @@ void __init page_writeback_init(void) int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { + int ret; + if (wbc->nr_to_write <= 0) return 0; + wbc->for_writepages = 1; if (mapping->a_ops->writepages) - return mapping->a_ops->writepages(mapping, wbc); - return generic_writepages(mapping, wbc); + ret = mapping->a_ops->writepages(mapping, wbc); + else + ret = generic_writepages(mapping, wbc); + wbc->for_writepages = 0; + return ret; } /** diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3b21a13d841c..fd47494cb989 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -36,6 +36,7 @@ #include <linux/memory_hotplug.h> #include <linux/nodemask.h> #include <linux/vmalloc.h> +#include <linux/mempolicy.h> #include <asm/tlbflush.h> #include "internal.h" @@ -53,6 +54,8 @@ unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; +static void fastcall free_hot_cold_page(struct page *page, int cold); + /* * results with 256, 32 in the lowmem_reserve sysctl: * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) @@ -81,6 +84,7 @@ int min_free_kbytes = 1024; unsigned long __initdata nr_kernel_pages; unsigned long __initdata nr_all_pages; +#ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { int ret = 0; @@ -122,16 +126,23 @@ static int bad_range(struct zone *zone, struct page *page) return 0; } -static void bad_page(const char *function, struct page *page) +#else +static inline int bad_range(struct zone *zone, struct page *page) +{ + return 0; +} +#endif + +static void bad_page(struct page *page) { - printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", - function, current->comm, page); - printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", - (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, - page->mapping, page_mapcount(page), page_count(page)); - printk(KERN_EMERG "Backtrace:\n"); + printk(KERN_EMERG "Bad page state in process '%s'\n" + "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" + "Trying to fix it up, but a reboot is needed\n" + "Backtrace:\n", + current->comm, page, (int)(2*sizeof(unsigned long)), + (unsigned long)page->flags, page->mapping, + page_mapcount(page), page_count(page)); dump_stack(); - printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); page->flags &= ~(1 << PG_lru | 1 << PG_private | 1 << PG_locked | @@ -184,19 +195,15 @@ static void destroy_compound_page(struct page *page, unsigned long order) int i; int nr_pages = 1 << order; - if (!PageCompound(page)) - return; - - if (page[1].index != order) - bad_page(__FUNCTION__, page); + if (unlikely(page[1].index != order)) + bad_page(page); for (i = 0; i < nr_pages; i++) { struct page *p = page + i; - if (!PageCompound(p)) - bad_page(__FUNCTION__, page); - if (page_private(p) != (unsigned long)page) - bad_page(__FUNCTION__, page); + if (unlikely(!PageCompound(p) | + (page_private(p) != (unsigned long)page))) + bad_page(page); ClearPageCompound(p); } } @@ -255,14 +262,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order) /* * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if - * (a) the buddy is free && - * (b) the buddy is on the buddy system && - * (c) a page and its buddy have the same order. + * (a) the buddy is not in a hole && + * (b) the buddy is free && + * (c) the buddy is on the buddy system && + * (d) a page and its buddy have the same order. * for recording page's order, we use page_private(page) and PG_private. * */ static inline int page_is_buddy(struct page *page, int order) { +#ifdef CONFIG_HOLES_IN_ZONE + if (!pfn_valid(page_to_pfn(page))) + return 0; +#endif + if (PagePrivate(page) && (page_order(page) == order) && page_count(page) == 0) @@ -300,7 +313,7 @@ static inline void __free_pages_bulk (struct page *page, unsigned long page_idx; int order_size = 1 << order; - if (unlikely(order)) + if (unlikely(PageCompound(page))) destroy_compound_page(page, order); page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); @@ -314,17 +327,15 @@ static inline void __free_pages_bulk (struct page *page, struct free_area *area; struct page *buddy; - combined_idx = __find_combined_index(page_idx, order); buddy = __page_find_buddy(page, page_idx, order); - - if (bad_range(zone, buddy)) - break; if (!page_is_buddy(buddy, order)) break; /* Move the buddy up one level. */ + list_del(&buddy->lru); area = zone->free_area + order; area->nr_free--; rmv_page_order(buddy); + combined_idx = __find_combined_index(page_idx, order); page = page + (combined_idx - page_idx); page_idx = combined_idx; order++; @@ -334,11 +345,11 @@ static inline void __free_pages_bulk (struct page *page, zone->free_area[order].nr_free++; } -static inline int free_pages_check(const char *function, struct page *page) +static inline int free_pages_check(struct page *page) { - if ( page_mapcount(page) || - page->mapping != NULL || - page_count(page) != 0 || + if (unlikely(page_mapcount(page) | + (page->mapping != NULL) | + (page_count(page) != 0) | (page->flags & ( 1 << PG_lru | 1 << PG_private | @@ -348,8 +359,8 @@ static inline int free_pages_check(const char *function, struct page *page) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved ))) - bad_page(function, page); + 1 << PG_reserved )))) + bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); /* @@ -375,11 +386,10 @@ static int free_pages_bulk(struct zone *zone, int count, struct list_head *list, unsigned int order) { - unsigned long flags; struct page *page = NULL; int ret = 0; - spin_lock_irqsave(&zone->lock, flags); + spin_lock(&zone->lock); zone->all_unreclaimable = 0; zone->pages_scanned = 0; while (!list_empty(list) && count--) { @@ -389,12 +399,13 @@ free_pages_bulk(struct zone *zone, int count, __free_pages_bulk(page, zone, order); ret++; } - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock(&zone->lock); return ret; } void __free_pages_ok(struct page *page, unsigned int order) { + unsigned long flags; LIST_HEAD(list); int i; int reserved = 0; @@ -408,14 +419,49 @@ void __free_pages_ok(struct page *page, unsigned int order) #endif for (i = 0 ; i < (1 << order) ; ++i) - reserved += free_pages_check(__FUNCTION__, page + i); + reserved += free_pages_check(page + i); if (reserved) return; list_add(&page->lru, &list); - mod_page_state(pgfree, 1 << order); kernel_map_pages(page, 1<<order, 0); + local_irq_save(flags); + __mod_page_state(pgfree, 1 << order); free_pages_bulk(page_zone(page), 1, &list, order); + local_irq_restore(flags); +} + +/* + * permit the bootmem allocator to evade page validation on high-order frees + */ +void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) +{ + if (order == 0) { + __ClearPageReserved(page); + set_page_count(page, 0); + + free_hot_cold_page(page, 0); + } else { + LIST_HEAD(list); + int loop; + + for (loop = 0; loop < BITS_PER_LONG; loop++) { + struct page *p = &page[loop]; + + if (loop + 16 < BITS_PER_LONG) + prefetchw(p + 16); + __ClearPageReserved(p); + set_page_count(p, 0); + } + + arch_free_page(page, order); + + mod_page_state(pgfree, 1 << order); + + list_add(&page->lru, &list); + kernel_map_pages(page, 1 << order, 0); + free_pages_bulk(page_zone(page), 1, &list, order); + } } @@ -433,8 +479,7 @@ void __free_pages_ok(struct page *page, unsigned int order) * * -- wli */ -static inline struct page * -expand(struct zone *zone, struct page *page, +static inline void expand(struct zone *zone, struct page *page, int low, int high, struct free_area *area) { unsigned long size = 1 << high; @@ -448,24 +493,6 @@ expand(struct zone *zone, struct page *page, area->nr_free++; set_page_order(&page[size], high); } - return page; -} - -void set_page_refs(struct page *page, int order) -{ -#ifdef CONFIG_MMU - set_page_count(page, 1); -#else - int i; - - /* - * We need to reference all the pages for this order, otherwise if - * anyone accesses one of the pages with (get/put) it will be freed. - * - eg: access_process_vm() - */ - for (i = 0; i < (1 << order); i++) - set_page_count(page + i, 1); -#endif /* CONFIG_MMU */ } /* @@ -473,9 +500,9 @@ void set_page_refs(struct page *page, int order) */ static int prep_new_page(struct page *page, int order) { - if ( page_mapcount(page) || - page->mapping != NULL || - page_count(page) != 0 || + if (unlikely(page_mapcount(page) | + (page->mapping != NULL) | + (page_count(page) != 0) | (page->flags & ( 1 << PG_lru | 1 << PG_private | @@ -486,8 +513,8 @@ static int prep_new_page(struct page *page, int order) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved ))) - bad_page(__FUNCTION__, page); + 1 << PG_reserved )))) + bad_page(page); /* * For now, we report if PG_reserved was found set, but do not @@ -525,7 +552,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order) rmv_page_order(page); area->nr_free--; zone->free_pages -= 1UL << order; - return expand(zone, page, order, current_order, area); + expand(zone, page, order, current_order, area); + return page; } return NULL; @@ -539,21 +567,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order) static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list) { - unsigned long flags; int i; - int allocated = 0; - struct page *page; - spin_lock_irqsave(&zone->lock, flags); + spin_lock(&zone->lock); for (i = 0; i < count; ++i) { - page = __rmqueue(zone, order); - if (page == NULL) + struct page *page = __rmqueue(zone, order); + if (unlikely(page == NULL)) break; - allocated++; list_add_tail(&page->lru, list); } - spin_unlock_irqrestore(&zone->lock, flags); - return allocated; + spin_unlock(&zone->lock); + return i; } #ifdef CONFIG_NUMA @@ -589,6 +613,7 @@ void drain_remote_pages(void) #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) static void __drain_pages(unsigned int cpu) { + unsigned long flags; struct zone *zone; int i; @@ -600,8 +625,10 @@ static void __drain_pages(unsigned int cpu) struct per_cpu_pages *pcp; pcp = &pset->pcp[i]; + local_irq_save(flags); pcp->count -= free_pages_bulk(zone, pcp->count, &pcp->list, 0); + local_irq_restore(flags); } } } @@ -647,18 +674,14 @@ void drain_local_pages(void) } #endif /* CONFIG_PM */ -static void zone_statistics(struct zonelist *zonelist, struct zone *z) +static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu) { #ifdef CONFIG_NUMA - unsigned long flags; - int cpu; pg_data_t *pg = z->zone_pgdat; pg_data_t *orig = zonelist->zones[0]->zone_pgdat; struct per_cpu_pageset *p; - local_irq_save(flags); - cpu = smp_processor_id(); - p = zone_pcp(z,cpu); + p = zone_pcp(z, cpu); if (pg == orig) { p->numa_hit++; } else { @@ -669,14 +692,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) p->local_node++; else p->other_node++; - local_irq_restore(flags); #endif } /* * Free a 0-order page */ -static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); static void fastcall free_hot_cold_page(struct page *page, int cold) { struct zone *zone = page_zone(page); @@ -687,14 +708,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) if (PageAnon(page)) page->mapping = NULL; - if (free_pages_check(__FUNCTION__, page)) + if (free_pages_check(page)) return; - inc_page_state(pgfree); kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); + __inc_page_state(pgfree); list_add(&page->lru, &pcp->list); pcp->count++; if (pcp->count >= pcp->high) @@ -727,49 +748,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) * we cheat by calling it from here, in the order > 0 path. Saves a branch * or two. */ -static struct page * -buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) +static struct page *buffered_rmqueue(struct zonelist *zonelist, + struct zone *zone, int order, gfp_t gfp_flags) { unsigned long flags; struct page *page; int cold = !!(gfp_flags & __GFP_COLD); + int cpu; again: + cpu = get_cpu(); if (order == 0) { struct per_cpu_pages *pcp; - page = NULL; - pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; + pcp = &zone_pcp(zone, cpu)->pcp[cold]; local_irq_save(flags); - if (pcp->count <= pcp->low) + if (!pcp->count) { pcp->count += rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); - if (pcp->count) { - page = list_entry(pcp->list.next, struct page, lru); - list_del(&page->lru); - pcp->count--; + if (unlikely(!pcp->count)) + goto failed; } - local_irq_restore(flags); - put_cpu(); + page = list_entry(pcp->list.next, struct page, lru); + list_del(&page->lru); + pcp->count--; } else { spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order); - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock(&zone->lock); + if (!page) + goto failed; } - if (page != NULL) { - BUG_ON(bad_range(zone, page)); - mod_page_state_zone(zone, pgalloc, 1 << order); - if (prep_new_page(page, order)) - goto again; + __mod_page_state_zone(zone, pgalloc, 1 << order); + zone_statistics(zonelist, zone, cpu); + local_irq_restore(flags); + put_cpu(); + + BUG_ON(bad_range(zone, page)); + if (prep_new_page(page, order)) + goto again; - if (gfp_flags & __GFP_ZERO) - prep_zero_page(page, order, gfp_flags); + if (gfp_flags & __GFP_ZERO) + prep_zero_page(page, order, gfp_flags); - if (order && (gfp_flags & __GFP_COMP)) - prep_compound_page(page, order); - } + if (order && (gfp_flags & __GFP_COMP)) + prep_compound_page(page, order); return page; + +failed: + local_irq_restore(flags); + put_cpu(); + return NULL; } #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ @@ -845,9 +875,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, continue; } - page = buffered_rmqueue(*z, order, gfp_mask); + page = buffered_rmqueue(zonelist, *z, order, gfp_mask); if (page) { - zone_statistics(zonelist, *z); break; } } while (*(++z) != NULL); @@ -903,8 +932,7 @@ restart: alloc_flags |= ALLOC_HARDER; if (gfp_mask & __GFP_HIGH) alloc_flags |= ALLOC_HIGH; - if (wait) - alloc_flags |= ALLOC_CPUSET; + alloc_flags |= ALLOC_CPUSET; /* * Go through the zonelist again. Let __GFP_HIGH and allocations @@ -926,7 +954,7 @@ restart: nofail_alloc: /* go through the zonelist yet again, ignoring mins */ page = get_page_from_freelist(gfp_mask, order, - zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET); + zonelist, ALLOC_NO_WATERMARKS); if (page) goto got_pg; if (gfp_mask & __GFP_NOFAIL) { @@ -1171,12 +1199,11 @@ EXPORT_SYMBOL(nr_pagecache); DEFINE_PER_CPU(long, nr_pagecache_local) = 0; #endif -void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) +static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) { int cpu = 0; memset(ret, 0, sizeof(*ret)); - cpus_and(*cpumask, *cpumask, cpu_online_map); cpu = first_cpu(*cpumask); while (cpu < NR_CPUS) { @@ -1224,12 +1251,12 @@ void get_full_page_state(struct page_state *ret) __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); } -unsigned long __read_page_state(unsigned long offset) +unsigned long read_page_state_offset(unsigned long offset) { unsigned long ret = 0; int cpu; - for_each_online_cpu(cpu) { + for_each_cpu(cpu) { unsigned long in; in = (unsigned long)&per_cpu(page_states, cpu) + offset; @@ -1238,18 +1265,26 @@ unsigned long __read_page_state(unsigned long offset) return ret; } -void __mod_page_state(unsigned long offset, unsigned long delta) +void __mod_page_state_offset(unsigned long offset, unsigned long delta) +{ + void *ptr; + + ptr = &__get_cpu_var(page_states); + *(unsigned long *)(ptr + offset) += delta; +} +EXPORT_SYMBOL(__mod_page_state_offset); + +void mod_page_state_offset(unsigned long offset, unsigned long delta) { unsigned long flags; - void* ptr; + void *ptr; local_irq_save(flags); ptr = &__get_cpu_var(page_states); - *(unsigned long*)(ptr + offset) += delta; + *(unsigned long *)(ptr + offset) += delta; local_irq_restore(flags); } - -EXPORT_SYMBOL(__mod_page_state); +EXPORT_SYMBOL(mod_page_state_offset); void __get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free, struct pglist_data *pgdat) @@ -1335,7 +1370,7 @@ void show_free_areas(void) show_node(zone); printk("%s per-cpu:", zone->name); - if (!zone->present_pages) { + if (!populated_zone(zone)) { printk(" empty\n"); continue; } else @@ -1347,10 +1382,9 @@ void show_free_areas(void) pageset = zone_pcp(zone, cpu); for (temperature = 0; temperature < 2; temperature++) - printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", + printk("cpu %d %s: high %d, batch %d used:%d\n", cpu, temperature ? "cold" : "hot", - pageset->pcp[temperature].low, pageset->pcp[temperature].high, pageset->pcp[temperature].batch, pageset->pcp[temperature].count); @@ -1413,7 +1447,7 @@ void show_free_areas(void) show_node(zone); printk("%s: ", zone->name); - if (!zone->present_pages) { + if (!populated_zone(zone)) { printk("empty\n"); continue; } @@ -1433,36 +1467,29 @@ void show_free_areas(void) /* * Builds allocation fallback zone lists. + * + * Add all populated zones of a node to the zonelist. */ -static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) -{ - switch (k) { - struct zone *zone; - default: - BUG(); - case ZONE_HIGHMEM: - zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->present_pages) { +static int __init build_zonelists_node(pg_data_t *pgdat, + struct zonelist *zonelist, int nr_zones, int zone_type) +{ + struct zone *zone; + + BUG_ON(zone_type > ZONE_HIGHMEM); + + do { + zone = pgdat->node_zones + zone_type; + if (populated_zone(zone)) { #ifndef CONFIG_HIGHMEM - BUG(); + BUG_ON(zone_type > ZONE_NORMAL); #endif - zonelist->zones[j++] = zone; + zonelist->zones[nr_zones++] = zone; + check_highest_zone(zone_type); } - case ZONE_NORMAL: - zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->present_pages) - zonelist->zones[j++] = zone; - case ZONE_DMA32: - zone = pgdat->node_zones + ZONE_DMA32; - if (zone->present_pages) - zonelist->zones[j++] = zone; - case ZONE_DMA: - zone = pgdat->node_zones + ZONE_DMA; - if (zone->present_pages) - zonelist->zones[j++] = zone; - } + zone_type--; - return j; + } while (zone_type >= 0); + return nr_zones; } static inline int highest_zone(int zone_bits) @@ -1709,8 +1736,6 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { if (!early_pfn_valid(pfn)) continue; - if (!early_pfn_in_nid(pfn, nid)) - continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); set_page_count(page, 1); @@ -1794,14 +1819,12 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp[0]; /* hot */ pcp->count = 0; - pcp->low = 0; pcp->high = 6 * batch; pcp->batch = max(1UL, 1 * batch); INIT_LIST_HEAD(&pcp->list); pcp = &p->pcp[1]; /* cold*/ pcp->count = 0; - pcp->low = 0; pcp->high = 2 * batch; pcp->batch = max(1UL, batch/2); INIT_LIST_HEAD(&pcp->list); @@ -1896,7 +1919,7 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, static struct notifier_block pageset_notifier = { &pageset_cpuup_callback, NULL, 0 }; -void __init setup_per_cpu_pageset() +void __init setup_per_cpu_pageset(void) { int err; @@ -2116,7 +2139,7 @@ static int frag_show(struct seq_file *m, void *arg) int order; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!zone->present_pages) + if (!populated_zone(zone)) continue; spin_lock_irqsave(&zone->lock, flags); @@ -2149,7 +2172,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { int i; - if (!zone->present_pages) + if (!populated_zone(zone)) continue; spin_lock_irqsave(&zone->lock, flags); @@ -2197,12 +2220,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg) seq_printf(m, "\n cpu: %i pcp: %i" "\n count: %i" - "\n low: %i" "\n high: %i" "\n batch: %i", i, j, pageset->pcp[j].count, - pageset->pcp[j].low, pageset->pcp[j].high, pageset->pcp[j].batch); } @@ -2257,32 +2278,40 @@ static char *vmstat_text[] = { "pgpgout", "pswpin", "pswpout", - "pgalloc_high", + "pgalloc_high", "pgalloc_normal", + "pgalloc_dma32", "pgalloc_dma", + "pgfree", "pgactivate", "pgdeactivate", "pgfault", "pgmajfault", + "pgrefill_high", "pgrefill_normal", + "pgrefill_dma32", "pgrefill_dma", "pgsteal_high", "pgsteal_normal", + "pgsteal_dma32", "pgsteal_dma", + "pgscan_kswapd_high", "pgscan_kswapd_normal", - + "pgscan_kswapd_dma32", "pgscan_kswapd_dma", + "pgscan_direct_high", "pgscan_direct_normal", + "pgscan_direct_dma32", "pgscan_direct_dma", - "pginodesteal", + "pginodesteal", "slabs_scanned", "kswapd_steal", "kswapd_inodesteal", diff --git a/mm/readahead.c b/mm/readahead.c index 72e7adbb87c7..8d6eeaaa6296 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -158,7 +158,7 @@ static int read_pages(struct address_space *mapping, struct file *filp, { unsigned page_idx; struct pagevec lru_pvec; - int ret = 0; + int ret; if (mapping->a_ops->readpages) { ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); @@ -171,14 +171,17 @@ static int read_pages(struct address_space *mapping, struct file *filp, list_del(&page->lru); if (!add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { - mapping->a_ops->readpage(filp, page); - if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add(&lru_pvec); - } else { - page_cache_release(page); + ret = mapping->a_ops->readpage(filp, page); + if (ret != AOP_TRUNCATED_PAGE) { + if (!pagevec_add(&lru_pvec, page)) + __pagevec_lru_add(&lru_pvec); + continue; + } /* else fall through to release */ } + page_cache_release(page); } pagevec_lru_add(&lru_pvec); + ret = 0; out: return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index f853c6def159..6f3f7db27128 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -435,6 +435,30 @@ int page_referenced(struct page *page, int is_locked) } /** + * page_set_anon_rmap - setup new anonymous rmap + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @address: the user virtual address mapped + */ +static void __page_set_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct anon_vma *anon_vma = vma->anon_vma; + + BUG_ON(!anon_vma); + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + page->mapping = (struct address_space *) anon_vma; + + page->index = linear_page_index(vma, address); + + /* + * nr_mapped state can be updated without turning off + * interrupts because it is not modified via interrupt. + */ + __inc_page_state(nr_mapped); +} + +/** * page_add_anon_rmap - add pte mapping to an anonymous page * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added @@ -445,20 +469,27 @@ int page_referenced(struct page *page, int is_locked) void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { - if (atomic_inc_and_test(&page->_mapcount)) { - struct anon_vma *anon_vma = vma->anon_vma; - - BUG_ON(!anon_vma); - anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; - page->mapping = (struct address_space *) anon_vma; - - page->index = linear_page_index(vma, address); - - inc_page_state(nr_mapped); - } + if (atomic_inc_and_test(&page->_mapcount)) + __page_set_anon_rmap(page, vma, address); /* else checking page index and mapping is racy */ } +/* + * page_add_new_anon_rmap - add pte mapping to a new anonymous page + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @address: the user virtual address mapped + * + * Same as page_add_anon_rmap but must only be called on *new* pages. + * This means the inc-and-test can be bypassed. + */ +void page_add_new_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ + __page_set_anon_rmap(page, vma, address); +} + /** * page_add_file_rmap - add pte mapping to a file page * @page: the page to add the mapping to @@ -471,7 +502,7 @@ void page_add_file_rmap(struct page *page) BUG_ON(!pfn_valid(page_to_pfn(page))); if (atomic_inc_and_test(&page->_mapcount)) - inc_page_state(nr_mapped); + __inc_page_state(nr_mapped); } /** @@ -495,7 +526,7 @@ void page_remove_rmap(struct page *page) */ if (page_test_and_clear_dirty(page)) set_page_dirty(page); - dec_page_state(nr_mapped); + __dec_page_state(nr_mapped); } } diff --git a/mm/shmem.c b/mm/shmem.c index dc25565a61e9..a1f2f02af724 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -457,7 +457,7 @@ static void shmem_free_pages(struct list_head *next) } while (next); } -static void shmem_truncate(struct inode *inode) +static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) { struct shmem_inode_info *info = SHMEM_I(inode); unsigned long idx; @@ -475,18 +475,27 @@ static void shmem_truncate(struct inode *inode) long nr_swaps_freed = 0; int offset; int freed; + int punch_hole = 0; inode->i_ctime = inode->i_mtime = CURRENT_TIME; - idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (idx >= info->next_index) return; spin_lock(&info->lock); info->flags |= SHMEM_TRUNCATE; - limit = info->next_index; - info->next_index = idx; + if (likely(end == (loff_t) -1)) { + limit = info->next_index; + info->next_index = idx; + } else { + limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (limit > info->next_index) + limit = info->next_index; + punch_hole = 1; + } + topdir = info->i_indirect; - if (topdir && idx <= SHMEM_NR_DIRECT) { + if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { info->i_indirect = NULL; nr_pages_to_free++; list_add(&topdir->lru, &pages_to_free); @@ -573,11 +582,12 @@ static void shmem_truncate(struct inode *inode) set_page_private(subdir, page_private(subdir) - freed); if (offset) spin_unlock(&info->lock); - BUG_ON(page_private(subdir) > offset); + if (!punch_hole) + BUG_ON(page_private(subdir) > offset); } if (offset) offset = 0; - else if (subdir) { + else if (subdir && !page_private(subdir)) { dir[diroff] = NULL; nr_pages_to_free++; list_add(&subdir->lru, &pages_to_free); @@ -594,7 +604,7 @@ done2: * Also, though shmem_getpage checks i_size before adding to * cache, no recheck after: so fix the narrow window there too. */ - truncate_inode_pages(inode->i_mapping, inode->i_size); + truncate_inode_pages_range(inode->i_mapping, start, end); } spin_lock(&info->lock); @@ -614,6 +624,11 @@ done2: } } +static void shmem_truncate(struct inode *inode) +{ + shmem_truncate_range(inode, inode->i_size, (loff_t)-1); +} + static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; @@ -855,7 +870,7 @@ unlock: swap_free(swap); redirty: set_page_dirty(page); - return WRITEPAGE_ACTIVATE; /* Return with the page locked */ + return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */ } #ifdef CONFIG_NUMA @@ -1255,7 +1270,7 @@ out_nomem: return retval; } -static int shmem_mmap(struct file *file, struct vm_area_struct *vma) +int shmem_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &shmem_vm_ops; @@ -2083,6 +2098,7 @@ static struct file_operations shmem_file_operations = { static struct inode_operations shmem_inode_operations = { .truncate = shmem_truncate, .setattr = shmem_notify_change, + .truncate_range = shmem_truncate_range, }; static struct inode_operations shmem_dir_inode_operations = { diff --git a/mm/swap.c b/mm/swap.c index 73d351439ef6..ee6d71ccfa56 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -156,16 +156,22 @@ void fastcall lru_cache_add_active(struct page *page) put_cpu_var(lru_add_active_pvecs); } -void lru_add_drain(void) +static void __lru_add_drain(int cpu) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); + struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); + /* CPU is dead, so no locking needed. */ if (pagevec_count(pvec)) __pagevec_lru_add(pvec); - pvec = &__get_cpu_var(lru_add_active_pvecs); + pvec = &per_cpu(lru_add_active_pvecs, cpu); if (pagevec_count(pvec)) __pagevec_lru_add_active(pvec); - put_cpu_var(lru_add_pvecs); +} + +void lru_add_drain(void) +{ + __lru_add_drain(get_cpu()); + put_cpu(); } /* @@ -412,17 +418,6 @@ void vm_acct_memory(long pages) } #ifdef CONFIG_HOTPLUG_CPU -static void lru_drain_cache(unsigned int cpu) -{ - struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); - - /* CPU is dead, so no locking needed. */ - if (pagevec_count(pvec)) - __pagevec_lru_add(pvec); - pvec = &per_cpu(lru_add_active_pvecs, cpu); - if (pagevec_count(pvec)) - __pagevec_lru_add_active(pvec); -} /* Drop the CPU's cached committed space back into the central pool. */ static int cpu_swap_callback(struct notifier_block *nfb, @@ -435,7 +430,7 @@ static int cpu_swap_callback(struct notifier_block *nfb, if (action == CPU_DEAD) { atomic_add(*committed, &vm_committed_space); *committed = 0; - lru_drain_cache((long)hcpu); + __lru_add_drain((long)hcpu); } return NOTIFY_OK; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 0df9a57b1de8..fc2aecb70a95 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/backing-dev.h> +#include <linux/pagevec.h> #include <asm/pgtable.h> @@ -272,12 +273,11 @@ void free_page_and_swap_cache(struct page *page) */ void free_pages_and_swap_cache(struct page **pages, int nr) { - int chunk = 16; struct page **pagep = pages; lru_add_drain(); while (nr) { - int todo = min(chunk, nr); + int todo = min(nr, PAGEVEC_SIZE); int i; for (i = 0; i < todo; i++) diff --git a/mm/swapfile.c b/mm/swapfile.c index edafeace301f..6da4b28b896b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -211,6 +211,26 @@ noswap: return (swp_entry_t) {0}; } +swp_entry_t get_swap_page_of_type(int type) +{ + struct swap_info_struct *si; + pgoff_t offset; + + spin_lock(&swap_lock); + si = swap_info + type; + if (si->flags & SWP_WRITEOK) { + nr_swap_pages--; + offset = scan_swap_map(si); + if (offset) { + spin_unlock(&swap_lock); + return swp_entry(type, offset); + } + nr_swap_pages++; + } + spin_unlock(&swap_lock); + return (swp_entry_t) {0}; +} + static struct swap_info_struct * swap_info_get(swp_entry_t entry) { struct swap_info_struct * p; diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index b58abcf44ed6..cdc6d431972b 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -81,13 +81,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) goto close_file; d_instantiate(dentry, inode); - inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ + file->f_vfsmnt = mntget(shm_mnt); file->f_dentry = dentry; file->f_mapping = inode->i_mapping; file->f_op = &ramfs_file_operations; file->f_mode = FMODE_WRITE | FMODE_READ; + + /* notify everyone as to the change of file size */ + error = do_truncate(dentry, size, file); + if (error < 0) + goto close_file; + return file; close_file: @@ -123,3 +129,24 @@ int shmem_unuse(swp_entry_t entry, struct page *page) { return 0; } + +int shmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); +#ifndef CONFIG_MMU + return ramfs_nommu_mmap(file, vma); +#else + return 0; +#endif +} + +#ifndef CONFIG_MMU +unsigned long shmem_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags); +} +#endif diff --git a/mm/truncate.c b/mm/truncate.c index 9173ab500604..7dee32745901 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -82,12 +82,15 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) } /** - * truncate_inode_pages - truncate *all* the pages from an offset + * truncate_inode_pages - truncate range of pages specified by start and + * end byte offsets * @mapping: mapping to truncate * @lstart: offset from which to truncate + * @lend: offset to which to truncate * - * Truncate the page cache at a set offset, removing the pages that are beyond - * that offset (and zeroing out partial pages). + * Truncate the page cache, removing the pages that are between + * specified offsets (and zeroing out partial page + * (if lstart is not page aligned)). * * Truncate takes two passes - the first pass is nonblocking. It will not * block on page locks and it will not block on writeback. The second pass @@ -101,12 +104,12 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) * We pass down the cache-hot hint to the page freeing code. Even if the * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. - * - * Called under (and serialised by) inode->i_sem. */ -void truncate_inode_pages(struct address_space *mapping, loff_t lstart) +void truncate_inode_pages_range(struct address_space *mapping, + loff_t lstart, loff_t lend) { const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; + pgoff_t end; const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); struct pagevec pvec; pgoff_t next; @@ -115,13 +118,22 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) if (mapping->nrpages == 0) return; + BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); + end = (lend >> PAGE_CACHE_SHIFT); + pagevec_init(&pvec, 0); next = start; - while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + while (next <= end && + pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; pgoff_t page_index = page->index; + if (page_index > end) { + next = page_index; + break; + } + if (page_index > next) next = page_index; next++; @@ -157,9 +169,15 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) next = start; continue; } + if (pvec.pages[0]->index > end) { + pagevec_release(&pvec); + break; + } for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; + if (page->index > end) + break; lock_page(page); wait_on_page_writeback(page); if (page->index > next) @@ -171,7 +189,19 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) pagevec_release(&pvec); } } +EXPORT_SYMBOL(truncate_inode_pages_range); +/** + * truncate_inode_pages - truncate *all* the pages from an offset + * @mapping: mapping to truncate + * @lstart: offset from which to truncate + * + * Called under (and serialised by) inode->i_sem. + */ +void truncate_inode_pages(struct address_space *mapping, loff_t lstart) +{ + truncate_inode_pages_range(mapping, lstart, (loff_t)-1); +} EXPORT_SYMBOL(truncate_inode_pages); /** diff --git a/mm/vmscan.c b/mm/vmscan.c index b0cd81c32de6..be8235fb1939 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -63,9 +63,6 @@ struct scan_control { unsigned long nr_mapped; /* From page_state */ - /* How many pages shrink_cache() should reclaim */ - int nr_to_reclaim; - /* Ask shrink_caches, or shrink_zone to scan at this priority */ unsigned int priority; @@ -74,9 +71,6 @@ struct scan_control { int may_writepage; - /* Can pages be swapped as part of reclaim? */ - int may_swap; - /* This context's SWAP_CLUSTER_MAX. If freeing memory for * suspend, we effectively ignore SWAP_CLUSTER_MAX. * In this context, it doesn't matter that we scan the @@ -367,7 +361,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) res = mapping->a_ops->writepage(page, &wbc); if (res < 0) handle_write_error(mapping, page, res); - if (res == WRITEPAGE_ACTIVATE) { + if (res == AOP_WRITEPAGE_ACTIVATE) { ClearPageReclaim(page); return PAGE_ACTIVATE; } @@ -430,8 +424,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * Try to allocate it some swap space here. */ if (PageAnon(page) && !PageSwapCache(page)) { - if (!sc->may_swap) - goto keep_locked; if (!add_to_swap(page)) goto activate_locked; } @@ -653,17 +645,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) goto done; max_scan -= nr_scan; - if (current_is_kswapd()) - mod_page_state_zone(zone, pgscan_kswapd, nr_scan); - else - mod_page_state_zone(zone, pgscan_direct, nr_scan); nr_freed = shrink_list(&page_list, sc); - if (current_is_kswapd()) - mod_page_state(kswapd_steal, nr_freed); - mod_page_state_zone(zone, pgsteal, nr_freed); - sc->nr_to_reclaim -= nr_freed; - spin_lock_irq(&zone->lru_lock); + local_irq_disable(); + if (current_is_kswapd()) { + __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); + __mod_page_state(kswapd_steal, nr_freed); + } else + __mod_page_state_zone(zone, pgscan_direct, nr_scan); + __mod_page_state_zone(zone, pgsteal, nr_freed); + + spin_lock(&zone->lru_lock); /* * Put back any unfreeable pages. */ @@ -825,11 +817,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) } } zone->nr_active += pgmoved; - spin_unlock_irq(&zone->lru_lock); - pagevec_release(&pvec); + spin_unlock(&zone->lru_lock); + + __mod_page_state_zone(zone, pgrefill, pgscanned); + __mod_page_state(pgdeactivate, pgdeactivate); + local_irq_enable(); - mod_page_state_zone(zone, pgrefill, pgscanned); - mod_page_state(pgdeactivate, pgdeactivate); + pagevec_release(&pvec); } /* @@ -861,8 +855,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc) else nr_inactive = 0; - sc->nr_to_reclaim = sc->swap_cluster_max; - while (nr_active || nr_inactive) { if (nr_active) { sc->nr_to_scan = min(nr_active, @@ -876,8 +868,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc) (unsigned long)sc->swap_cluster_max); nr_inactive -= sc->nr_to_scan; shrink_cache(zone, sc); - if (sc->nr_to_reclaim <= 0) - break; } } @@ -910,7 +900,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc) for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; - if (zone->present_pages == 0) + if (!populated_zone(zone)) continue; if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) @@ -952,7 +942,6 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) sc.gfp_mask = gfp_mask; sc.may_writepage = 0; - sc.may_swap = 1; inc_page_state(allocstall); @@ -1055,7 +1044,6 @@ loop_again: total_reclaimed = 0; sc.gfp_mask = GFP_KERNEL; sc.may_writepage = 0; - sc.may_swap = 1; sc.nr_mapped = read_page_state(nr_mapped); inc_page_state(pageoutrun); @@ -1084,7 +1072,7 @@ loop_again: for (i = pgdat->nr_zones - 1; i >= 0; i--) { struct zone *zone = pgdat->node_zones + i; - if (zone->present_pages == 0) + if (!populated_zone(zone)) continue; if (zone->all_unreclaimable && @@ -1121,7 +1109,7 @@ scan: struct zone *zone = pgdat->node_zones + i; int nr_slab; - if (zone->present_pages == 0) + if (!populated_zone(zone)) continue; if (zone->all_unreclaimable && priority != DEF_PRIORITY) @@ -1273,7 +1261,7 @@ void wakeup_kswapd(struct zone *zone, int order) { pg_data_t *pgdat; - if (zone->present_pages == 0) + if (!populated_zone(zone)) return; pgdat = zone->zone_pgdat; @@ -1353,76 +1341,3 @@ static int __init kswapd_init(void) } module_init(kswapd_init) - - -/* - * Try to free up some pages from this zone through reclaim. - */ -int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) -{ - struct scan_control sc; - int nr_pages = 1 << order; - int total_reclaimed = 0; - - /* The reclaim may sleep, so don't do it if sleep isn't allowed */ - if (!(gfp_mask & __GFP_WAIT)) - return 0; - if (zone->all_unreclaimable) - return 0; - - sc.gfp_mask = gfp_mask; - sc.may_writepage = 0; - sc.may_swap = 0; - sc.nr_mapped = read_page_state(nr_mapped); - sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - /* scan at the highest priority */ - sc.priority = 0; - disable_swap_token(); - - if (nr_pages > SWAP_CLUSTER_MAX) - sc.swap_cluster_max = nr_pages; - else - sc.swap_cluster_max = SWAP_CLUSTER_MAX; - - /* Don't reclaim the zone if there are other reclaimers active */ - if (atomic_read(&zone->reclaim_in_progress) > 0) - goto out; - - shrink_zone(zone, &sc); - total_reclaimed = sc.nr_reclaimed; - - out: - return total_reclaimed; -} - -asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone, - unsigned int state) -{ - struct zone *z; - int i; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - - if (node >= MAX_NUMNODES || !node_online(node)) - return -EINVAL; - - /* This will break if we ever add more zones */ - if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM))) - return -EINVAL; - - for (i = 0; i < MAX_NR_ZONES; i++) { - if (!(zone & 1<<i)) - continue; - - z = &NODE_DATA(node)->node_zones[i]; - - if (state) - z->reclaim_pages = 1; - else - z->reclaim_pages = 0; - } - - return 0; -} |