summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig2
-rw-r--r--mm/bootmem.c60
-rw-r--r--mm/filemap.c78
-rw-r--r--mm/hugetlb.c192
-rw-r--r--mm/internal.h21
-rw-r--r--mm/madvise.c35
-rw-r--r--mm/memory.c104
-rw-r--r--mm/memory_hotplug.c3
-rw-r--r--mm/mempolicy.c106
-rw-r--r--mm/mmap.c2
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/nommu.c7
-rw-r--r--mm/page-writeback.c10
-rw-r--r--mm/page_alloc.c345
-rw-r--r--mm/readahead.c15
-rw-r--r--mm/rmap.c57
-rw-r--r--mm/shmem.c36
-rw-r--r--mm/swap.c27
-rw-r--r--mm/swap_state.c4
-rw-r--r--mm/swapfile.c20
-rw-r--r--mm/tiny-shmem.c29
-rw-r--r--mm/truncate.c44
-rw-r--r--mm/vmscan.c125
23 files changed, 771 insertions, 553 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 21eb51d4da8f..b3db11f137e0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -11,7 +11,7 @@ choice
config FLATMEM_MANUAL
bool "Flat Memory"
- depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE
+ depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
help
This option allows you to change some of the ways that
Linux manages its memory internally. Most users will
diff --git a/mm/bootmem.c b/mm/bootmem.c
index e8c567177dcf..35c32290f717 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -204,6 +204,8 @@ restart_scan:
unsigned long j;
i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i);
i = ALIGN(i, incr);
+ if (i >= eidx)
+ break;
if (test_bit(i, bdata->node_bootmem_map))
continue;
for (j = i + 1; j < i + areasize; ++j) {
@@ -294,20 +296,12 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
unsigned long v = ~map[i / BITS_PER_LONG];
if (gofast && v == ~0UL) {
- int j, order;
+ int order;
page = pfn_to_page(pfn);
count += BITS_PER_LONG;
- __ClearPageReserved(page);
order = ffs(BITS_PER_LONG) - 1;
- set_page_refs(page, order);
- for (j = 1; j < BITS_PER_LONG; j++) {
- if (j + 16 < BITS_PER_LONG)
- prefetchw(page + j + 16);
- __ClearPageReserved(page + j);
- set_page_count(page + j, 0);
- }
- __free_pages(page, order);
+ __free_pages_bootmem(page, order);
i += BITS_PER_LONG;
page += BITS_PER_LONG;
} else if (v) {
@@ -317,9 +311,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
for (m = 1; m && i < idx; m<<=1, page++, i++) {
if (v & m) {
count++;
- __ClearPageReserved(page);
- set_page_refs(page, 0);
- __free_page(page);
+ __free_pages_bootmem(page, 0);
}
}
} else {
@@ -337,9 +329,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
count = 0;
for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) {
count++;
- __ClearPageReserved(page);
- set_page_count(page, 1);
- __free_page(page);
+ __free_pages_bootmem(page, 0);
}
total += count;
bdata->node_bootmem_map = NULL;
@@ -391,15 +381,14 @@ unsigned long __init free_all_bootmem (void)
return(free_all_bootmem_core(NODE_DATA(0)));
}
-void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal,
- unsigned long limit)
+void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
{
pg_data_t *pgdat = pgdat_list;
void *ptr;
for_each_pgdat(pgdat)
if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
- align, goal, limit)))
+ align, goal, 0)))
return(ptr);
/*
@@ -411,15 +400,40 @@ void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, un
}
-void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align,
- unsigned long goal, unsigned long limit)
+void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align,
+ unsigned long goal)
{
void *ptr;
- ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, limit);
+ ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
if (ptr)
return (ptr);
- return __alloc_bootmem_limit(size, align, goal, limit);
+ return __alloc_bootmem(size, align, goal);
+}
+
+#define LOW32LIMIT 0xffffffff
+
+void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
+{
+ pg_data_t *pgdat = pgdat_list;
+ void *ptr;
+
+ for_each_pgdat(pgdat)
+ if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
+ align, goal, LOW32LIMIT)))
+ return(ptr);
+
+ /*
+ * Whoops, we cannot satisfy the allocation request.
+ */
+ printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
+ panic("Out of low memory");
+ return NULL;
}
+void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT);
+}
diff --git a/mm/filemap.c b/mm/filemap.c
index 33a28bfde158..4ef24a397684 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -555,11 +555,12 @@ repeat:
page_cache_get(page);
if (TestSetPageLocked(page)) {
read_unlock_irq(&mapping->tree_lock);
- lock_page(page);
+ __lock_page(page);
read_lock_irq(&mapping->tree_lock);
/* Has the page been truncated while we slept? */
- if (page->mapping != mapping || page->index != offset) {
+ if (unlikely(page->mapping != mapping ||
+ page->index != offset)) {
unlock_page(page);
page_cache_release(page);
goto repeat;
@@ -831,8 +832,13 @@ readpage:
/* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
- if (unlikely(error))
+ if (unlikely(error)) {
+ if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto find_page;
+ }
goto readpage_error;
+ }
if (!PageUptodate(page)) {
lock_page(page);
@@ -1152,26 +1158,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
{
struct address_space *mapping = file->f_mapping;
struct page *page;
- int error;
+ int ret;
- page = page_cache_alloc_cold(mapping);
- if (!page)
- return -ENOMEM;
+ do {
+ page = page_cache_alloc_cold(mapping);
+ if (!page)
+ return -ENOMEM;
+
+ ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
+ if (ret == 0)
+ ret = mapping->a_ops->readpage(file, page);
+ else if (ret == -EEXIST)
+ ret = 0; /* losing race to add is OK */
- error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
- if (!error) {
- error = mapping->a_ops->readpage(file, page);
page_cache_release(page);
- return error;
- }
- /*
- * We arrive here in the unlikely event that someone
- * raced with us and added our page to the cache first
- * or we are out of memory for radix-tree nodes.
- */
- page_cache_release(page);
- return error == -EEXIST ? 0 : error;
+ } while (ret == AOP_TRUNCATED_PAGE);
+
+ return ret;
}
#define MMAP_LOTSAMISS (100)
@@ -1331,10 +1335,14 @@ page_not_uptodate:
goto success;
}
- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}
/*
@@ -1358,10 +1366,14 @@ page_not_uptodate:
goto success;
}
ClearPageError(page);
- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}
/*
@@ -1444,10 +1456,14 @@ page_not_uptodate:
goto success;
}
- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}
/*
@@ -1470,10 +1486,14 @@ page_not_uptodate:
}
ClearPageError(page);
- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}
/*
@@ -1934,12 +1954,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
status = a_ops->prepare_write(file, page, offset, offset+bytes);
if (unlikely(status)) {
loff_t isize = i_size_read(inode);
+
+ if (status != AOP_TRUNCATED_PAGE)
+ unlock_page(page);
+ page_cache_release(page);
+ if (status == AOP_TRUNCATED_PAGE)
+ continue;
/*
* prepare_write() may have instantiated a few blocks
* outside i_size. Trim these off again.
*/
- unlock_page(page);
- page_cache_release(page);
if (pos + bytes > isize)
vmtruncate(inode, isize);
break;
@@ -1952,6 +1976,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
cur_iov, iov_base, bytes);
flush_dcache_page(page);
status = a_ops->commit_write(file, page, offset, offset+bytes);
+ if (status == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ continue;
+ }
if (likely(copied > 0)) {
if (!status)
status = copied;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3e52df7c471b..f4c43d7980ba 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -11,6 +11,8 @@
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
+#include <linux/mempolicy.h>
+
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -36,18 +38,21 @@ static void enqueue_huge_page(struct page *page)
free_huge_pages_node[nid]++;
}
-static struct page *dequeue_huge_page(void)
+static struct page *dequeue_huge_page(struct vm_area_struct *vma,
+ unsigned long address)
{
int nid = numa_node_id();
struct page *page = NULL;
+ struct zonelist *zonelist = huge_zonelist(vma, address);
+ struct zone **z;
- if (list_empty(&hugepage_freelists[nid])) {
- for (nid = 0; nid < MAX_NUMNODES; ++nid)
- if (!list_empty(&hugepage_freelists[nid]))
- break;
+ for (z = zonelist->zones; *z; z++) {
+ nid = (*z)->zone_pgdat->node_id;
+ if (!list_empty(&hugepage_freelists[nid]))
+ break;
}
- if (nid >= 0 && nid < MAX_NUMNODES &&
- !list_empty(&hugepage_freelists[nid])) {
+
+ if (*z) {
page = list_entry(hugepage_freelists[nid].next,
struct page, lru);
list_del(&page->lru);
@@ -85,13 +90,13 @@ void free_huge_page(struct page *page)
spin_unlock(&hugetlb_lock);
}
-struct page *alloc_huge_page(void)
+struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
{
struct page *page;
int i;
spin_lock(&hugetlb_lock);
- page = dequeue_huge_page();
+ page = dequeue_huge_page(vma, addr);
if (!page) {
spin_unlock(&hugetlb_lock);
return NULL;
@@ -194,7 +199,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
spin_lock(&hugetlb_lock);
try_to_free_low(count);
while (count < nr_huge_pages) {
- struct page *page = dequeue_huge_page();
+ struct page *page = dequeue_huge_page(NULL, 0);
if (!page)
break;
update_and_free_page(page);
@@ -261,11 +266,12 @@ struct vm_operations_struct hugetlb_vm_ops = {
.nopage = hugetlb_nopage,
};
-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+ int writable)
{
pte_t entry;
- if (vma->vm_flags & VM_WRITE) {
+ if (writable) {
entry =
pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
} else {
@@ -277,12 +283,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
return entry;
}
+static void set_huge_ptep_writable(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ pte_t entry;
+
+ entry = pte_mkwrite(pte_mkdirty(*ptep));
+ ptep_set_access_flags(vma, address, ptep, entry, 1);
+ update_mmu_cache(vma, address, entry);
+ lazy_mmu_prot_update(entry);
+}
+
+
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
{
pte_t *src_pte, *dst_pte, entry;
struct page *ptepage;
unsigned long addr;
+ int cow;
+
+ cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
src_pte = huge_pte_offset(src, addr);
@@ -294,6 +315,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
spin_lock(&dst->page_table_lock);
spin_lock(&src->page_table_lock);
if (!pte_none(*src_pte)) {
+ if (cow)
+ ptep_set_wrprotect(src, addr, src_pte);
entry = *src_pte;
ptepage = pte_page(entry);
get_page(ptepage);
@@ -345,57 +368,63 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
flush_tlb_range(vma, start, end);
}
-static struct page *find_lock_huge_page(struct address_space *mapping,
- unsigned long idx)
+static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep, pte_t pte)
{
- struct page *page;
- int err;
- struct inode *inode = mapping->host;
- unsigned long size;
+ struct page *old_page, *new_page;
+ int i, avoidcopy;
-retry:
- page = find_lock_page(mapping, idx);
- if (page)
- goto out;
+ old_page = pte_page(pte);
- /* Check to make sure the mapping hasn't been truncated */
- size = i_size_read(inode) >> HPAGE_SHIFT;
- if (idx >= size)
- goto out;
+ /* If no-one else is actually using this page, avoid the copy
+ * and just make the page writable */
+ avoidcopy = (page_count(old_page) == 1);
+ if (avoidcopy) {
+ set_huge_ptep_writable(vma, address, ptep);
+ return VM_FAULT_MINOR;
+ }
- if (hugetlb_get_quota(mapping))
- goto out;
- page = alloc_huge_page();
- if (!page) {
- hugetlb_put_quota(mapping);
- goto out;
+ page_cache_get(old_page);
+ new_page = alloc_huge_page(vma, address);
+
+ if (!new_page) {
+ page_cache_release(old_page);
+
+ /* Logically this is OOM, not a SIGBUS, but an OOM
+ * could cause the kernel to go killing other
+ * processes which won't help the hugepage situation
+ * at all (?) */
+ return VM_FAULT_SIGBUS;
}
- err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
- if (err) {
- put_page(page);
- hugetlb_put_quota(mapping);
- if (err == -EEXIST)
- goto retry;
- page = NULL;
+ spin_unlock(&mm->page_table_lock);
+ for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
+ copy_user_highpage(new_page + i, old_page + i,
+ address + i*PAGE_SIZE);
+ spin_lock(&mm->page_table_lock);
+
+ ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+ if (likely(pte_same(*ptep, pte))) {
+ /* Break COW */
+ set_huge_pte_at(mm, address, ptep,
+ make_huge_pte(vma, new_page, 1));
+ /* Make the old page be freed below */
+ new_page = old_page;
}
-out:
- return page;
+ page_cache_release(new_page);
+ page_cache_release(old_page);
+ return VM_FAULT_MINOR;
}
-int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, int write_access)
+int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep, int write_access)
{
int ret = VM_FAULT_SIGBUS;
unsigned long idx;
unsigned long size;
- pte_t *pte;
struct page *page;
struct address_space *mapping;
-
- pte = huge_pte_alloc(mm, address);
- if (!pte)
- goto out;
+ pte_t new_pte;
mapping = vma->vm_file->f_mapping;
idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -405,9 +434,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* Use page lock to guard against racing truncation
* before we get page_table_lock.
*/
- page = find_lock_huge_page(mapping, idx);
- if (!page)
- goto out;
+retry:
+ page = find_lock_page(mapping, idx);
+ if (!page) {
+ if (hugetlb_get_quota(mapping))
+ goto out;
+ page = alloc_huge_page(vma, address);
+ if (!page) {
+ hugetlb_put_quota(mapping);
+ goto out;
+ }
+
+ if (vma->vm_flags & VM_SHARED) {
+ int err;
+
+ err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ if (err) {
+ put_page(page);
+ hugetlb_put_quota(mapping);
+ if (err == -EEXIST)
+ goto retry;
+ goto out;
+ }
+ } else
+ lock_page(page);
+ }
spin_lock(&mm->page_table_lock);
size = i_size_read(mapping->host) >> HPAGE_SHIFT;
@@ -415,11 +466,19 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
goto backout;
ret = VM_FAULT_MINOR;
- if (!pte_none(*pte))
+ if (!pte_none(*ptep))
goto backout;
add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
+ new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+ && (vma->vm_flags & VM_SHARED)));
+ set_huge_pte_at(mm, address, ptep, new_pte);
+
+ if (write_access && !(vma->vm_flags & VM_SHARED)) {
+ /* Optimization, do the COW without a second fault */
+ ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+ }
+
spin_unlock(&mm->page_table_lock);
unlock_page(page);
out:
@@ -433,6 +492,33 @@ backout:
goto out;
}
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int write_access)
+{
+ pte_t *ptep;
+ pte_t entry;
+ int ret;
+
+ ptep = huge_pte_alloc(mm, address);
+ if (!ptep)
+ return VM_FAULT_OOM;
+
+ entry = *ptep;
+ if (pte_none(entry))
+ return hugetlb_no_page(mm, vma, address, ptep, write_access);
+
+ ret = VM_FAULT_MINOR;
+
+ spin_lock(&mm->page_table_lock);
+ /* Check for a racing update before calling hugetlb_cow */
+ if (likely(pte_same(entry, *ptep)))
+ if (write_access && !pte_write(entry))
+ ret = hugetlb_cow(mm, vma, address, ptep, entry);
+ spin_unlock(&mm->page_table_lock);
+
+ return ret;
+}
+
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, int *length, int i)
diff --git a/mm/internal.h b/mm/internal.h
index 6bf134e8fb3d..17256bb2f4ef 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -9,5 +9,22 @@
* 2 of the License, or (at your option) any later version.
*/
-/* page_alloc.c */
-extern void set_page_refs(struct page *page, int order);
+static inline void set_page_refs(struct page *page, int order)
+{
+#ifdef CONFIG_MMU
+ set_page_count(page, 1);
+#else
+ int i;
+
+ /*
+ * We need to reference all the pages for this order, otherwise if
+ * anyone accesses one of the pages with (get/put) it will be freed.
+ * - eg: access_process_vm()
+ */
+ for (i = 0; i < (1 << order); i++)
+ set_page_count(page + i, 1);
+#endif /* CONFIG_MMU */
+}
+
+extern void fastcall __init __free_pages_bootmem(struct page *page,
+ unsigned int order);
diff --git a/mm/madvise.c b/mm/madvise.c
index 2b7cf0400a21..ae0ae3ea299a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -140,6 +140,36 @@ static long madvise_dontneed(struct vm_area_struct * vma,
return 0;
}
+/*
+ * Application wants to free up the pages and associated backing store.
+ * This is effectively punching a hole into the middle of a file.
+ *
+ * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
+ * Other filesystems return -ENOSYS.
+ */
+static long madvise_remove(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ struct address_space *mapping;
+ loff_t offset, endoff;
+
+ if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
+ return -EINVAL;
+
+ if (!vma->vm_file || !vma->vm_file->f_mapping
+ || !vma->vm_file->f_mapping->host) {
+ return -EINVAL;
+ }
+
+ mapping = vma->vm_file->f_mapping;
+
+ offset = (loff_t)(start - vma->vm_start)
+ + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ endoff = (loff_t)(end - vma->vm_start - 1)
+ + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ return vmtruncate_range(mapping->host, offset, endoff);
+}
+
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
@@ -152,6 +182,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
case MADV_RANDOM:
error = madvise_behavior(vma, prev, start, end, behavior);
break;
+ case MADV_REMOVE:
+ error = madvise_remove(vma, start, end);
+ break;
case MADV_WILLNEED:
error = madvise_willneed(vma, prev, start, end);
@@ -190,6 +223,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
* some pages ahead.
* MADV_DONTNEED - the application is finished with the given range,
* so the kernel can free resources associated with it.
+ * MADV_REMOVE - the application wants to free up the given range of
+ * pages and associated backing store.
*
* return values:
* zero - success
diff --git a/mm/memory.c b/mm/memory.c
index aa8af0e20269..7197f9bcd384 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -349,6 +349,11 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
dump_stack();
}
+static inline int is_cow_mapping(unsigned int flags)
+{
+ return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+}
+
/*
* This function gets the "struct page" associated with a pte.
*
@@ -377,6 +382,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
if (pfn == vma->vm_pgoff + off)
return NULL;
+ if (!is_cow_mapping(vma->vm_flags))
+ return NULL;
}
/*
@@ -437,7 +444,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* If it's a COW mapping, write protect it both
* in the parent and the child
*/
- if ((vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) {
+ if (is_cow_mapping(vm_flags)) {
ptep_set_wrprotect(src_mm, addr, src_pte);
pte = *src_pte;
}
@@ -567,7 +574,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* readonly mappings. The tradeoff is that copy_page_range is more
* efficient than faulting.
*/
- if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP))) {
+ if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
if (!vma->anon_vma)
return 0;
}
@@ -1002,7 +1009,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
continue;
}
- if (!vma || (vma->vm_flags & VM_IO)
+ if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
|| !(vm_flags & vma->vm_flags))
return i ? : -EFAULT;
@@ -1221,55 +1228,12 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *
return -EFAULT;
if (!page_count(page))
return -EINVAL;
+ vma->vm_flags |= VM_INSERTPAGE;
return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);
/*
- * Somebody does a pfn remapping that doesn't actually work as a vma.
- *
- * Do it as individual pages instead, and warn about it. It's bad form,
- * and very inefficient.
- */
-static int incomplete_pfn_remap(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- unsigned long pfn, pgprot_t prot)
-{
- static int warn = 10;
- struct page *page;
- int retval;
-
- if (!(vma->vm_flags & VM_INCOMPLETE)) {
- if (warn) {
- warn--;
- printk("%s does an incomplete pfn remapping", current->comm);
- dump_stack();
- }
- }
- vma->vm_flags |= VM_INCOMPLETE | VM_IO | VM_RESERVED;
-
- if (start < vma->vm_start || end > vma->vm_end)
- return -EINVAL;
-
- if (!pfn_valid(pfn))
- return -EINVAL;
-
- page = pfn_to_page(pfn);
- if (!PageReserved(page))
- return -EINVAL;
-
- retval = 0;
- while (start < end) {
- retval = insert_page(vma->vm_mm, start, page, prot);
- if (retval < 0)
- break;
- start += PAGE_SIZE;
- page++;
- }
- return retval;
-}
-
-/*
* maps a range of physical memory into the requested pages. the old
* mappings are removed. any references to nonexistent pages results
* in null mappings (currently treated as "copy-on-access")
@@ -1343,9 +1307,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
struct mm_struct *mm = vma->vm_mm;
int err;
- if (addr != vma->vm_start || end != vma->vm_end)
- return incomplete_pfn_remap(vma, addr, end, pfn, prot);
-
/*
* Physically remapped pages are special. Tell the
* rest of the world about it:
@@ -1359,9 +1320,18 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
* VM_PFNMAP tells the core MM that the base pages are just
* raw PFN mappings, and do not have a "struct page" associated
* with them.
+ *
+ * There's a horrible special case to handle copy-on-write
+ * behaviour that some programs depend on. We mark the "original"
+ * un-COW'ed pages by matching them up with "vma->vm_pgoff".
*/
+ if (is_cow_mapping(vma->vm_flags)) {
+ if (addr != vma->vm_start || end != vma->vm_end)
+ return -EINVAL;
+ vma->vm_pgoff = pfn;
+ }
+
vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
- vma->vm_pgoff = pfn;
BUG_ON(addr >= end);
pfn -= addr >> PAGE_SHIFT;
@@ -1528,7 +1498,7 @@ gotten:
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
lru_cache_add_active(new_page);
- page_add_anon_rmap(new_page, vma, address);
+ page_add_new_anon_rmap(new_page, vma, address);
/* Free the old page.. */
new_page = old_page;
@@ -1800,9 +1770,32 @@ out_big:
out_busy:
return -ETXTBSY;
}
-
EXPORT_SYMBOL(vmtruncate);
+int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
+{
+ struct address_space *mapping = inode->i_mapping;
+
+ /*
+ * If the underlying filesystem is not going to provide
+ * a way to truncate a range of blocks (punch a hole) -
+ * we should return failure right now.
+ */
+ if (!inode->i_op || !inode->i_op->truncate_range)
+ return -ENOSYS;
+
+ down(&inode->i_sem);
+ down_write(&inode->i_alloc_sem);
+ unmap_mapping_range(mapping, offset, (end - offset), 1);
+ truncate_inode_pages_range(mapping, offset, end);
+ inode->i_op->truncate_range(inode, offset, end);
+ up_write(&inode->i_alloc_sem);
+ up(&inode->i_sem);
+
+ return 0;
+}
+EXPORT_SYMBOL(vmtruncate_range);
+
/*
* Primitive swap readahead code. We simply read an aligned block of
* (1 << page_cluster) entries in the swap area. This method is chosen
@@ -1984,8 +1977,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto release;
inc_mm_counter(mm, anon_rss);
lru_cache_add_active(page);
- SetPageReferenced(page);
- page_add_anon_rmap(page, vma, address);
+ page_add_new_anon_rmap(page, vma, address);
} else {
/* Map the ZERO_PAGE - vm_page_prot is readonly */
page = ZERO_PAGE(address);
@@ -2116,7 +2108,7 @@ retry:
if (anon) {
inc_mm_counter(mm, anon_rss);
lru_cache_add_active(new_page);
- page_add_anon_rmap(new_page, vma, address);
+ page_add_new_anon_rmap(new_page, vma, address);
} else {
inc_mm_counter(mm, file_rss);
page_add_file_rmap(new_page);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 431a64f021c0..a918f77f02f3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -42,7 +42,6 @@ extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
int nr_pages);
static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
{
- struct pglist_data *pgdat = zone->zone_pgdat;
int nr_pages = PAGES_PER_SECTION;
int ret;
@@ -104,7 +103,7 @@ static void grow_pgdat_span(struct pglist_data *pgdat,
pgdat->node_start_pfn = start_pfn;
if (end_pfn > old_pgdat_end_pfn)
- pgdat->node_spanned_pages = end_pfn - pgdat->node_spanned_pages;
+ pgdat->node_spanned_pages = end_pfn - pgdat->node_start_pfn;
}
int online_pages(unsigned long pfn, unsigned long nr_pages)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index bec88c81244e..0f1d2b8a952b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,7 +93,7 @@ static kmem_cache_t *sn_cache;
/* Highest zone. An specific allocation for a zone below that is not
policied. */
-static int policy_zone;
+int policy_zone = ZONE_DMA;
struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */
@@ -131,17 +131,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
if (!zl)
return NULL;
num = 0;
- for_each_node_mask(nd, *nodes) {
- int k;
- for (k = MAX_NR_ZONES-1; k >= 0; k--) {
- struct zone *z = &NODE_DATA(nd)->node_zones[k];
- if (!z->present_pages)
- continue;
- zl->zones[num++] = z;
- if (k > policy_zone)
- policy_zone = k;
- }
- }
+ for_each_node_mask(nd, *nodes)
+ zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
zl->zones[num] = NULL;
return zl;
}
@@ -161,6 +152,10 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
switch (mode) {
case MPOL_INTERLEAVE:
policy->v.nodes = *nodes;
+ if (nodes_weight(*nodes) == 0) {
+ kmem_cache_free(policy_cache, policy);
+ return ERR_PTR(-EINVAL);
+ }
break;
case MPOL_PREFERRED:
policy->v.preferred_node = first_node(*nodes);
@@ -781,6 +776,34 @@ static unsigned offset_il_node(struct mempolicy *pol,
return nid;
}
+/* Determine a node number for interleave */
+static inline unsigned interleave_nid(struct mempolicy *pol,
+ struct vm_area_struct *vma, unsigned long addr, int shift)
+{
+ if (vma) {
+ unsigned long off;
+
+ off = vma->vm_pgoff;
+ off += (addr - vma->vm_start) >> shift;
+ return offset_il_node(pol, vma, off);
+ } else
+ return interleave_nodes(pol);
+}
+
+/* Return a zonelist suitable for a huge page allocation. */
+struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
+{
+ struct mempolicy *pol = get_vma_policy(current, vma, addr);
+
+ if (pol->policy == MPOL_INTERLEAVE) {
+ unsigned nid;
+
+ nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
+ return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+ }
+ return zonelist_policy(GFP_HIGHUSER, pol);
+}
+
/* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -829,15 +852,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
unsigned nid;
- if (vma) {
- unsigned long off;
- off = vma->vm_pgoff;
- off += (addr - vma->vm_start) >> PAGE_SHIFT;
- nid = offset_il_node(pol, vma, off);
- } else {
- /* fall back to process interleaving */
- nid = interleave_nodes(pol);
- }
+
+ nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
return alloc_page_interleave(gfp, 0, nid);
}
return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
@@ -936,54 +952,6 @@ void __mpol_free(struct mempolicy *p)
}
/*
- * Hugetlb policy. Same as above, just works with node numbers instead of
- * zonelists.
- */
-
-/* Find first node suitable for an allocation */
-int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
-{
- struct mempolicy *pol = get_vma_policy(current, vma, addr);
-
- switch (pol->policy) {
- case MPOL_DEFAULT:
- return numa_node_id();
- case MPOL_BIND:
- return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
- case MPOL_INTERLEAVE:
- return interleave_nodes(pol);
- case MPOL_PREFERRED:
- return pol->v.preferred_node >= 0 ?
- pol->v.preferred_node : numa_node_id();
- }
- BUG();
- return 0;
-}
-
-/* Find secondary valid nodes for an allocation */
-int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
-{
- struct mempolicy *pol = get_vma_policy(current, vma, addr);
-
- switch (pol->policy) {
- case MPOL_PREFERRED:
- case MPOL_DEFAULT:
- case MPOL_INTERLEAVE:
- return 1;
- case MPOL_BIND: {
- struct zone **z;
- for (z = pol->v.zonelist->zones; *z; z++)
- if ((*z)->zone_pgdat->node_id == nid)
- return 1;
- return 0;
- }
- default:
- BUG();
- return 0;
- }
-}
-
-/*
* Shared memory backing store policy support.
*
* Remember policies even when nobody has shared memory mapped.
diff --git a/mm/mmap.c b/mm/mmap.c
index 11ca5927d5ff..64ba4dbcb7de 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -611,7 +611,7 @@ again: remove_next = 1 + (end > next->vm_end);
* If the vma has a ->close operation then the driver probably needs to release
* per-vma resources, so we don't attempt to merge those.
*/
-#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED)
+#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags)
diff --git a/mm/mremap.c b/mm/mremap.c
index b535438c363c..ddaeee9a0b69 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -323,7 +323,7 @@ unsigned long do_mremap(unsigned long addr,
/* We can't remap across vm area boundaries */
if (old_len > vma->vm_end - addr)
goto out;
- if (vma->vm_flags & VM_DONTEXPAND) {
+ if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
if (new_len > old_len)
goto out;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index c1196812876b..c10262d68232 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1177,3 +1177,10 @@ int in_gate_area_no_task(unsigned long addr)
{
return 0;
}
+
+struct page *filemap_nopage(struct vm_area_struct *area,
+ unsigned long address, int *type)
+{
+ BUG();
+ return NULL;
+}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0166ea15c9ee..5240e426c1f7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -550,11 +550,17 @@ void __init page_writeback_init(void)
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
+ int ret;
+
if (wbc->nr_to_write <= 0)
return 0;
+ wbc->for_writepages = 1;
if (mapping->a_ops->writepages)
- return mapping->a_ops->writepages(mapping, wbc);
- return generic_writepages(mapping, wbc);
+ ret = mapping->a_ops->writepages(mapping, wbc);
+ else
+ ret = generic_writepages(mapping, wbc);
+ wbc->for_writepages = 0;
+ return ret;
}
/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3b21a13d841c..fd47494cb989 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -36,6 +36,7 @@
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
+#include <linux/mempolicy.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -53,6 +54,8 @@ unsigned long totalram_pages __read_mostly;
unsigned long totalhigh_pages __read_mostly;
long nr_swap_pages;
+static void fastcall free_hot_cold_page(struct page *page, int cold);
+
/*
* results with 256, 32 in the lowmem_reserve sysctl:
* 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
@@ -81,6 +84,7 @@ int min_free_kbytes = 1024;
unsigned long __initdata nr_kernel_pages;
unsigned long __initdata nr_all_pages;
+#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
int ret = 0;
@@ -122,16 +126,23 @@ static int bad_range(struct zone *zone, struct page *page)
return 0;
}
-static void bad_page(const char *function, struct page *page)
+#else
+static inline int bad_range(struct zone *zone, struct page *page)
+{
+ return 0;
+}
+#endif
+
+static void bad_page(struct page *page)
{
- printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
- function, current->comm, page);
- printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
- (int)(2*sizeof(unsigned long)), (unsigned long)page->flags,
- page->mapping, page_mapcount(page), page_count(page));
- printk(KERN_EMERG "Backtrace:\n");
+ printk(KERN_EMERG "Bad page state in process '%s'\n"
+ "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
+ "Trying to fix it up, but a reboot is needed\n"
+ "Backtrace:\n",
+ current->comm, page, (int)(2*sizeof(unsigned long)),
+ (unsigned long)page->flags, page->mapping,
+ page_mapcount(page), page_count(page));
dump_stack();
- printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
page->flags &= ~(1 << PG_lru |
1 << PG_private |
1 << PG_locked |
@@ -184,19 +195,15 @@ static void destroy_compound_page(struct page *page, unsigned long order)
int i;
int nr_pages = 1 << order;
- if (!PageCompound(page))
- return;
-
- if (page[1].index != order)
- bad_page(__FUNCTION__, page);
+ if (unlikely(page[1].index != order))
+ bad_page(page);
for (i = 0; i < nr_pages; i++) {
struct page *p = page + i;
- if (!PageCompound(p))
- bad_page(__FUNCTION__, page);
- if (page_private(p) != (unsigned long)page)
- bad_page(__FUNCTION__, page);
+ if (unlikely(!PageCompound(p) |
+ (page_private(p) != (unsigned long)page)))
+ bad_page(page);
ClearPageCompound(p);
}
}
@@ -255,14 +262,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
/*
* This function checks whether a page is free && is the buddy
* we can do coalesce a page and its buddy if
- * (a) the buddy is free &&
- * (b) the buddy is on the buddy system &&
- * (c) a page and its buddy have the same order.
+ * (a) the buddy is not in a hole &&
+ * (b) the buddy is free &&
+ * (c) the buddy is on the buddy system &&
+ * (d) a page and its buddy have the same order.
* for recording page's order, we use page_private(page) and PG_private.
*
*/
static inline int page_is_buddy(struct page *page, int order)
{
+#ifdef CONFIG_HOLES_IN_ZONE
+ if (!pfn_valid(page_to_pfn(page)))
+ return 0;
+#endif
+
if (PagePrivate(page) &&
(page_order(page) == order) &&
page_count(page) == 0)
@@ -300,7 +313,7 @@ static inline void __free_pages_bulk (struct page *page,
unsigned long page_idx;
int order_size = 1 << order;
- if (unlikely(order))
+ if (unlikely(PageCompound(page)))
destroy_compound_page(page, order);
page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
@@ -314,17 +327,15 @@ static inline void __free_pages_bulk (struct page *page,
struct free_area *area;
struct page *buddy;
- combined_idx = __find_combined_index(page_idx, order);
buddy = __page_find_buddy(page, page_idx, order);
-
- if (bad_range(zone, buddy))
- break;
if (!page_is_buddy(buddy, order))
break; /* Move the buddy up one level. */
+
list_del(&buddy->lru);
area = zone->free_area + order;
area->nr_free--;
rmv_page_order(buddy);
+ combined_idx = __find_combined_index(page_idx, order);
page = page + (combined_idx - page_idx);
page_idx = combined_idx;
order++;
@@ -334,11 +345,11 @@ static inline void __free_pages_bulk (struct page *page,
zone->free_area[order].nr_free++;
}
-static inline int free_pages_check(const char *function, struct page *page)
+static inline int free_pages_check(struct page *page)
{
- if ( page_mapcount(page) ||
- page->mapping != NULL ||
- page_count(page) != 0 ||
+ if (unlikely(page_mapcount(page) |
+ (page->mapping != NULL) |
+ (page_count(page) != 0) |
(page->flags & (
1 << PG_lru |
1 << PG_private |
@@ -348,8 +359,8 @@ static inline int free_pages_check(const char *function, struct page *page)
1 << PG_slab |
1 << PG_swapcache |
1 << PG_writeback |
- 1 << PG_reserved )))
- bad_page(function, page);
+ 1 << PG_reserved ))))
+ bad_page(page);
if (PageDirty(page))
__ClearPageDirty(page);
/*
@@ -375,11 +386,10 @@ static int
free_pages_bulk(struct zone *zone, int count,
struct list_head *list, unsigned int order)
{
- unsigned long flags;
struct page *page = NULL;
int ret = 0;
- spin_lock_irqsave(&zone->lock, flags);
+ spin_lock(&zone->lock);
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
while (!list_empty(list) && count--) {
@@ -389,12 +399,13 @@ free_pages_bulk(struct zone *zone, int count,
__free_pages_bulk(page, zone, order);
ret++;
}
- spin_unlock_irqrestore(&zone->lock, flags);
+ spin_unlock(&zone->lock);
return ret;
}
void __free_pages_ok(struct page *page, unsigned int order)
{
+ unsigned long flags;
LIST_HEAD(list);
int i;
int reserved = 0;
@@ -408,14 +419,49 @@ void __free_pages_ok(struct page *page, unsigned int order)
#endif
for (i = 0 ; i < (1 << order) ; ++i)
- reserved += free_pages_check(__FUNCTION__, page + i);
+ reserved += free_pages_check(page + i);
if (reserved)
return;
list_add(&page->lru, &list);
- mod_page_state(pgfree, 1 << order);
kernel_map_pages(page, 1<<order, 0);
+ local_irq_save(flags);
+ __mod_page_state(pgfree, 1 << order);
free_pages_bulk(page_zone(page), 1, &list, order);
+ local_irq_restore(flags);
+}
+
+/*
+ * permit the bootmem allocator to evade page validation on high-order frees
+ */
+void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
+{
+ if (order == 0) {
+ __ClearPageReserved(page);
+ set_page_count(page, 0);
+
+ free_hot_cold_page(page, 0);
+ } else {
+ LIST_HEAD(list);
+ int loop;
+
+ for (loop = 0; loop < BITS_PER_LONG; loop++) {
+ struct page *p = &page[loop];
+
+ if (loop + 16 < BITS_PER_LONG)
+ prefetchw(p + 16);
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
+ }
+
+ arch_free_page(page, order);
+
+ mod_page_state(pgfree, 1 << order);
+
+ list_add(&page->lru, &list);
+ kernel_map_pages(page, 1 << order, 0);
+ free_pages_bulk(page_zone(page), 1, &list, order);
+ }
}
@@ -433,8 +479,7 @@ void __free_pages_ok(struct page *page, unsigned int order)
*
* -- wli
*/
-static inline struct page *
-expand(struct zone *zone, struct page *page,
+static inline void expand(struct zone *zone, struct page *page,
int low, int high, struct free_area *area)
{
unsigned long size = 1 << high;
@@ -448,24 +493,6 @@ expand(struct zone *zone, struct page *page,
area->nr_free++;
set_page_order(&page[size], high);
}
- return page;
-}
-
-void set_page_refs(struct page *page, int order)
-{
-#ifdef CONFIG_MMU
- set_page_count(page, 1);
-#else
- int i;
-
- /*
- * We need to reference all the pages for this order, otherwise if
- * anyone accesses one of the pages with (get/put) it will be freed.
- * - eg: access_process_vm()
- */
- for (i = 0; i < (1 << order); i++)
- set_page_count(page + i, 1);
-#endif /* CONFIG_MMU */
}
/*
@@ -473,9 +500,9 @@ void set_page_refs(struct page *page, int order)
*/
static int prep_new_page(struct page *page, int order)
{
- if ( page_mapcount(page) ||
- page->mapping != NULL ||
- page_count(page) != 0 ||
+ if (unlikely(page_mapcount(page) |
+ (page->mapping != NULL) |
+ (page_count(page) != 0) |
(page->flags & (
1 << PG_lru |
1 << PG_private |
@@ -486,8 +513,8 @@ static int prep_new_page(struct page *page, int order)
1 << PG_slab |
1 << PG_swapcache |
1 << PG_writeback |
- 1 << PG_reserved )))
- bad_page(__FUNCTION__, page);
+ 1 << PG_reserved ))))
+ bad_page(page);
/*
* For now, we report if PG_reserved was found set, but do not
@@ -525,7 +552,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
rmv_page_order(page);
area->nr_free--;
zone->free_pages -= 1UL << order;
- return expand(zone, page, order, current_order, area);
+ expand(zone, page, order, current_order, area);
+ return page;
}
return NULL;
@@ -539,21 +567,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list)
{
- unsigned long flags;
int i;
- int allocated = 0;
- struct page *page;
- spin_lock_irqsave(&zone->lock, flags);
+ spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
- page = __rmqueue(zone, order);
- if (page == NULL)
+ struct page *page = __rmqueue(zone, order);
+ if (unlikely(page == NULL))
break;
- allocated++;
list_add_tail(&page->lru, list);
}
- spin_unlock_irqrestore(&zone->lock, flags);
- return allocated;
+ spin_unlock(&zone->lock);
+ return i;
}
#ifdef CONFIG_NUMA
@@ -589,6 +613,7 @@ void drain_remote_pages(void)
#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
static void __drain_pages(unsigned int cpu)
{
+ unsigned long flags;
struct zone *zone;
int i;
@@ -600,8 +625,10 @@ static void __drain_pages(unsigned int cpu)
struct per_cpu_pages *pcp;
pcp = &pset->pcp[i];
+ local_irq_save(flags);
pcp->count -= free_pages_bulk(zone, pcp->count,
&pcp->list, 0);
+ local_irq_restore(flags);
}
}
}
@@ -647,18 +674,14 @@ void drain_local_pages(void)
}
#endif /* CONFIG_PM */
-static void zone_statistics(struct zonelist *zonelist, struct zone *z)
+static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
{
#ifdef CONFIG_NUMA
- unsigned long flags;
- int cpu;
pg_data_t *pg = z->zone_pgdat;
pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
struct per_cpu_pageset *p;
- local_irq_save(flags);
- cpu = smp_processor_id();
- p = zone_pcp(z,cpu);
+ p = zone_pcp(z, cpu);
if (pg == orig) {
p->numa_hit++;
} else {
@@ -669,14 +692,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
p->local_node++;
else
p->other_node++;
- local_irq_restore(flags);
#endif
}
/*
* Free a 0-order page
*/
-static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
static void fastcall free_hot_cold_page(struct page *page, int cold)
{
struct zone *zone = page_zone(page);
@@ -687,14 +708,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
if (PageAnon(page))
page->mapping = NULL;
- if (free_pages_check(__FUNCTION__, page))
+ if (free_pages_check(page))
return;
- inc_page_state(pgfree);
kernel_map_pages(page, 1, 0);
pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
+ __inc_page_state(pgfree);
list_add(&page->lru, &pcp->list);
pcp->count++;
if (pcp->count >= pcp->high)
@@ -727,49 +748,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
*/
-static struct page *
-buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
+static struct page *buffered_rmqueue(struct zonelist *zonelist,
+ struct zone *zone, int order, gfp_t gfp_flags)
{
unsigned long flags;
struct page *page;
int cold = !!(gfp_flags & __GFP_COLD);
+ int cpu;
again:
+ cpu = get_cpu();
if (order == 0) {
struct per_cpu_pages *pcp;
- page = NULL;
- pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
+ pcp = &zone_pcp(zone, cpu)->pcp[cold];
local_irq_save(flags);
- if (pcp->count <= pcp->low)
+ if (!pcp->count) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, &pcp->list);
- if (pcp->count) {
- page = list_entry(pcp->list.next, struct page, lru);
- list_del(&page->lru);
- pcp->count--;
+ if (unlikely(!pcp->count))
+ goto failed;
}
- local_irq_restore(flags);
- put_cpu();
+ page = list_entry(pcp->list.next, struct page, lru);
+ list_del(&page->lru);
+ pcp->count--;
} else {
spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order);
- spin_unlock_irqrestore(&zone->lock, flags);
+ spin_unlock(&zone->lock);
+ if (!page)
+ goto failed;
}
- if (page != NULL) {
- BUG_ON(bad_range(zone, page));
- mod_page_state_zone(zone, pgalloc, 1 << order);
- if (prep_new_page(page, order))
- goto again;
+ __mod_page_state_zone(zone, pgalloc, 1 << order);
+ zone_statistics(zonelist, zone, cpu);
+ local_irq_restore(flags);
+ put_cpu();
+
+ BUG_ON(bad_range(zone, page));
+ if (prep_new_page(page, order))
+ goto again;
- if (gfp_flags & __GFP_ZERO)
- prep_zero_page(page, order, gfp_flags);
+ if (gfp_flags & __GFP_ZERO)
+ prep_zero_page(page, order, gfp_flags);
- if (order && (gfp_flags & __GFP_COMP))
- prep_compound_page(page, order);
- }
+ if (order && (gfp_flags & __GFP_COMP))
+ prep_compound_page(page, order);
return page;
+
+failed:
+ local_irq_restore(flags);
+ put_cpu();
+ return NULL;
}
#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
@@ -845,9 +875,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
continue;
}
- page = buffered_rmqueue(*z, order, gfp_mask);
+ page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
if (page) {
- zone_statistics(zonelist, *z);
break;
}
} while (*(++z) != NULL);
@@ -903,8 +932,7 @@ restart:
alloc_flags |= ALLOC_HARDER;
if (gfp_mask & __GFP_HIGH)
alloc_flags |= ALLOC_HIGH;
- if (wait)
- alloc_flags |= ALLOC_CPUSET;
+ alloc_flags |= ALLOC_CPUSET;
/*
* Go through the zonelist again. Let __GFP_HIGH and allocations
@@ -926,7 +954,7 @@ restart:
nofail_alloc:
/* go through the zonelist yet again, ignoring mins */
page = get_page_from_freelist(gfp_mask, order,
- zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET);
+ zonelist, ALLOC_NO_WATERMARKS);
if (page)
goto got_pg;
if (gfp_mask & __GFP_NOFAIL) {
@@ -1171,12 +1199,11 @@ EXPORT_SYMBOL(nr_pagecache);
DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
#endif
-void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
+static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
{
int cpu = 0;
memset(ret, 0, sizeof(*ret));
- cpus_and(*cpumask, *cpumask, cpu_online_map);
cpu = first_cpu(*cpumask);
while (cpu < NR_CPUS) {
@@ -1224,12 +1251,12 @@ void get_full_page_state(struct page_state *ret)
__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
}
-unsigned long __read_page_state(unsigned long offset)
+unsigned long read_page_state_offset(unsigned long offset)
{
unsigned long ret = 0;
int cpu;
- for_each_online_cpu(cpu) {
+ for_each_cpu(cpu) {
unsigned long in;
in = (unsigned long)&per_cpu(page_states, cpu) + offset;
@@ -1238,18 +1265,26 @@ unsigned long __read_page_state(unsigned long offset)
return ret;
}
-void __mod_page_state(unsigned long offset, unsigned long delta)
+void __mod_page_state_offset(unsigned long offset, unsigned long delta)
+{
+ void *ptr;
+
+ ptr = &__get_cpu_var(page_states);
+ *(unsigned long *)(ptr + offset) += delta;
+}
+EXPORT_SYMBOL(__mod_page_state_offset);
+
+void mod_page_state_offset(unsigned long offset, unsigned long delta)
{
unsigned long flags;
- void* ptr;
+ void *ptr;
local_irq_save(flags);
ptr = &__get_cpu_var(page_states);
- *(unsigned long*)(ptr + offset) += delta;
+ *(unsigned long *)(ptr + offset) += delta;
local_irq_restore(flags);
}
-
-EXPORT_SYMBOL(__mod_page_state);
+EXPORT_SYMBOL(mod_page_state_offset);
void __get_zone_counts(unsigned long *active, unsigned long *inactive,
unsigned long *free, struct pglist_data *pgdat)
@@ -1335,7 +1370,7 @@ void show_free_areas(void)
show_node(zone);
printk("%s per-cpu:", zone->name);
- if (!zone->present_pages) {
+ if (!populated_zone(zone)) {
printk(" empty\n");
continue;
} else
@@ -1347,10 +1382,9 @@ void show_free_areas(void)
pageset = zone_pcp(zone, cpu);
for (temperature = 0; temperature < 2; temperature++)
- printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
+ printk("cpu %d %s: high %d, batch %d used:%d\n",
cpu,
temperature ? "cold" : "hot",
- pageset->pcp[temperature].low,
pageset->pcp[temperature].high,
pageset->pcp[temperature].batch,
pageset->pcp[temperature].count);
@@ -1413,7 +1447,7 @@ void show_free_areas(void)
show_node(zone);
printk("%s: ", zone->name);
- if (!zone->present_pages) {
+ if (!populated_zone(zone)) {
printk("empty\n");
continue;
}
@@ -1433,36 +1467,29 @@ void show_free_areas(void)
/*
* Builds allocation fallback zone lists.
+ *
+ * Add all populated zones of a node to the zonelist.
*/
-static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
-{
- switch (k) {
- struct zone *zone;
- default:
- BUG();
- case ZONE_HIGHMEM:
- zone = pgdat->node_zones + ZONE_HIGHMEM;
- if (zone->present_pages) {
+static int __init build_zonelists_node(pg_data_t *pgdat,
+ struct zonelist *zonelist, int nr_zones, int zone_type)
+{
+ struct zone *zone;
+
+ BUG_ON(zone_type > ZONE_HIGHMEM);
+
+ do {
+ zone = pgdat->node_zones + zone_type;
+ if (populated_zone(zone)) {
#ifndef CONFIG_HIGHMEM
- BUG();
+ BUG_ON(zone_type > ZONE_NORMAL);
#endif
- zonelist->zones[j++] = zone;
+ zonelist->zones[nr_zones++] = zone;
+ check_highest_zone(zone_type);
}
- case ZONE_NORMAL:
- zone = pgdat->node_zones + ZONE_NORMAL;
- if (zone->present_pages)
- zonelist->zones[j++] = zone;
- case ZONE_DMA32:
- zone = pgdat->node_zones + ZONE_DMA32;
- if (zone->present_pages)
- zonelist->zones[j++] = zone;
- case ZONE_DMA:
- zone = pgdat->node_zones + ZONE_DMA;
- if (zone->present_pages)
- zonelist->zones[j++] = zone;
- }
+ zone_type--;
- return j;
+ } while (zone_type >= 0);
+ return nr_zones;
}
static inline int highest_zone(int zone_bits)
@@ -1709,8 +1736,6 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
if (!early_pfn_valid(pfn))
continue;
- if (!early_pfn_in_nid(pfn, nid))
- continue;
page = pfn_to_page(pfn);
set_page_links(page, zone, nid, pfn);
set_page_count(page, 1);
@@ -1794,14 +1819,12 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
pcp = &p->pcp[0]; /* hot */
pcp->count = 0;
- pcp->low = 0;
pcp->high = 6 * batch;
pcp->batch = max(1UL, 1 * batch);
INIT_LIST_HEAD(&pcp->list);
pcp = &p->pcp[1]; /* cold*/
pcp->count = 0;
- pcp->low = 0;
pcp->high = 2 * batch;
pcp->batch = max(1UL, batch/2);
INIT_LIST_HEAD(&pcp->list);
@@ -1896,7 +1919,7 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
static struct notifier_block pageset_notifier =
{ &pageset_cpuup_callback, NULL, 0 };
-void __init setup_per_cpu_pageset()
+void __init setup_per_cpu_pageset(void)
{
int err;
@@ -2116,7 +2139,7 @@ static int frag_show(struct seq_file *m, void *arg)
int order;
for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
- if (!zone->present_pages)
+ if (!populated_zone(zone))
continue;
spin_lock_irqsave(&zone->lock, flags);
@@ -2149,7 +2172,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
int i;
- if (!zone->present_pages)
+ if (!populated_zone(zone))
continue;
spin_lock_irqsave(&zone->lock, flags);
@@ -2197,12 +2220,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
seq_printf(m,
"\n cpu: %i pcp: %i"
"\n count: %i"
- "\n low: %i"
"\n high: %i"
"\n batch: %i",
i, j,
pageset->pcp[j].count,
- pageset->pcp[j].low,
pageset->pcp[j].high,
pageset->pcp[j].batch);
}
@@ -2257,32 +2278,40 @@ static char *vmstat_text[] = {
"pgpgout",
"pswpin",
"pswpout",
- "pgalloc_high",
+ "pgalloc_high",
"pgalloc_normal",
+ "pgalloc_dma32",
"pgalloc_dma",
+
"pgfree",
"pgactivate",
"pgdeactivate",
"pgfault",
"pgmajfault",
+
"pgrefill_high",
"pgrefill_normal",
+ "pgrefill_dma32",
"pgrefill_dma",
"pgsteal_high",
"pgsteal_normal",
+ "pgsteal_dma32",
"pgsteal_dma",
+
"pgscan_kswapd_high",
"pgscan_kswapd_normal",
-
+ "pgscan_kswapd_dma32",
"pgscan_kswapd_dma",
+
"pgscan_direct_high",
"pgscan_direct_normal",
+ "pgscan_direct_dma32",
"pgscan_direct_dma",
- "pginodesteal",
+ "pginodesteal",
"slabs_scanned",
"kswapd_steal",
"kswapd_inodesteal",
diff --git a/mm/readahead.c b/mm/readahead.c
index 72e7adbb87c7..8d6eeaaa6296 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -158,7 +158,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
{
unsigned page_idx;
struct pagevec lru_pvec;
- int ret = 0;
+ int ret;
if (mapping->a_ops->readpages) {
ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
@@ -171,14 +171,17 @@ static int read_pages(struct address_space *mapping, struct file *filp,
list_del(&page->lru);
if (!add_to_page_cache(page, mapping,
page->index, GFP_KERNEL)) {
- mapping->a_ops->readpage(filp, page);
- if (!pagevec_add(&lru_pvec, page))
- __pagevec_lru_add(&lru_pvec);
- } else {
- page_cache_release(page);
+ ret = mapping->a_ops->readpage(filp, page);
+ if (ret != AOP_TRUNCATED_PAGE) {
+ if (!pagevec_add(&lru_pvec, page))
+ __pagevec_lru_add(&lru_pvec);
+ continue;
+ } /* else fall through to release */
}
+ page_cache_release(page);
}
pagevec_lru_add(&lru_pvec);
+ ret = 0;
out:
return ret;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index f853c6def159..6f3f7db27128 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -435,6 +435,30 @@ int page_referenced(struct page *page, int is_locked)
}
/**
+ * page_set_anon_rmap - setup new anonymous rmap
+ * @page: the page to add the mapping to
+ * @vma: the vm area in which the mapping is added
+ * @address: the user virtual address mapped
+ */
+static void __page_set_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+
+ BUG_ON(!anon_vma);
+ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ page->mapping = (struct address_space *) anon_vma;
+
+ page->index = linear_page_index(vma, address);
+
+ /*
+ * nr_mapped state can be updated without turning off
+ * interrupts because it is not modified via interrupt.
+ */
+ __inc_page_state(nr_mapped);
+}
+
+/**
* page_add_anon_rmap - add pte mapping to an anonymous page
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
@@ -445,20 +469,27 @@ int page_referenced(struct page *page, int is_locked)
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
- if (atomic_inc_and_test(&page->_mapcount)) {
- struct anon_vma *anon_vma = vma->anon_vma;
-
- BUG_ON(!anon_vma);
- anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
- page->mapping = (struct address_space *) anon_vma;
-
- page->index = linear_page_index(vma, address);
-
- inc_page_state(nr_mapped);
- }
+ if (atomic_inc_and_test(&page->_mapcount))
+ __page_set_anon_rmap(page, vma, address);
/* else checking page index and mapping is racy */
}
+/*
+ * page_add_new_anon_rmap - add pte mapping to a new anonymous page
+ * @page: the page to add the mapping to
+ * @vma: the vm area in which the mapping is added
+ * @address: the user virtual address mapped
+ *
+ * Same as page_add_anon_rmap but must only be called on *new* pages.
+ * This means the inc-and-test can be bypassed.
+ */
+void page_add_new_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
+ __page_set_anon_rmap(page, vma, address);
+}
+
/**
* page_add_file_rmap - add pte mapping to a file page
* @page: the page to add the mapping to
@@ -471,7 +502,7 @@ void page_add_file_rmap(struct page *page)
BUG_ON(!pfn_valid(page_to_pfn(page)));
if (atomic_inc_and_test(&page->_mapcount))
- inc_page_state(nr_mapped);
+ __inc_page_state(nr_mapped);
}
/**
@@ -495,7 +526,7 @@ void page_remove_rmap(struct page *page)
*/
if (page_test_and_clear_dirty(page))
set_page_dirty(page);
- dec_page_state(nr_mapped);
+ __dec_page_state(nr_mapped);
}
}
diff --git a/mm/shmem.c b/mm/shmem.c
index dc25565a61e9..a1f2f02af724 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -457,7 +457,7 @@ static void shmem_free_pages(struct list_head *next)
} while (next);
}
-static void shmem_truncate(struct inode *inode)
+static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
{
struct shmem_inode_info *info = SHMEM_I(inode);
unsigned long idx;
@@ -475,18 +475,27 @@ static void shmem_truncate(struct inode *inode)
long nr_swaps_freed = 0;
int offset;
int freed;
+ int punch_hole = 0;
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
- idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (idx >= info->next_index)
return;
spin_lock(&info->lock);
info->flags |= SHMEM_TRUNCATE;
- limit = info->next_index;
- info->next_index = idx;
+ if (likely(end == (loff_t) -1)) {
+ limit = info->next_index;
+ info->next_index = idx;
+ } else {
+ limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ if (limit > info->next_index)
+ limit = info->next_index;
+ punch_hole = 1;
+ }
+
topdir = info->i_indirect;
- if (topdir && idx <= SHMEM_NR_DIRECT) {
+ if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
info->i_indirect = NULL;
nr_pages_to_free++;
list_add(&topdir->lru, &pages_to_free);
@@ -573,11 +582,12 @@ static void shmem_truncate(struct inode *inode)
set_page_private(subdir, page_private(subdir) - freed);
if (offset)
spin_unlock(&info->lock);
- BUG_ON(page_private(subdir) > offset);
+ if (!punch_hole)
+ BUG_ON(page_private(subdir) > offset);
}
if (offset)
offset = 0;
- else if (subdir) {
+ else if (subdir && !page_private(subdir)) {
dir[diroff] = NULL;
nr_pages_to_free++;
list_add(&subdir->lru, &pages_to_free);
@@ -594,7 +604,7 @@ done2:
* Also, though shmem_getpage checks i_size before adding to
* cache, no recheck after: so fix the narrow window there too.
*/
- truncate_inode_pages(inode->i_mapping, inode->i_size);
+ truncate_inode_pages_range(inode->i_mapping, start, end);
}
spin_lock(&info->lock);
@@ -614,6 +624,11 @@ done2:
}
}
+static void shmem_truncate(struct inode *inode)
+{
+ shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
+}
+
static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
@@ -855,7 +870,7 @@ unlock:
swap_free(swap);
redirty:
set_page_dirty(page);
- return WRITEPAGE_ACTIVATE; /* Return with the page locked */
+ return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */
}
#ifdef CONFIG_NUMA
@@ -1255,7 +1270,7 @@ out_nomem:
return retval;
}
-static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
@@ -2083,6 +2098,7 @@ static struct file_operations shmem_file_operations = {
static struct inode_operations shmem_inode_operations = {
.truncate = shmem_truncate,
.setattr = shmem_notify_change,
+ .truncate_range = shmem_truncate_range,
};
static struct inode_operations shmem_dir_inode_operations = {
diff --git a/mm/swap.c b/mm/swap.c
index 73d351439ef6..ee6d71ccfa56 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -156,16 +156,22 @@ void fastcall lru_cache_add_active(struct page *page)
put_cpu_var(lru_add_active_pvecs);
}
-void lru_add_drain(void)
+static void __lru_add_drain(int cpu)
{
- struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
+ struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
+ /* CPU is dead, so no locking needed. */
if (pagevec_count(pvec))
__pagevec_lru_add(pvec);
- pvec = &__get_cpu_var(lru_add_active_pvecs);
+ pvec = &per_cpu(lru_add_active_pvecs, cpu);
if (pagevec_count(pvec))
__pagevec_lru_add_active(pvec);
- put_cpu_var(lru_add_pvecs);
+}
+
+void lru_add_drain(void)
+{
+ __lru_add_drain(get_cpu());
+ put_cpu();
}
/*
@@ -412,17 +418,6 @@ void vm_acct_memory(long pages)
}
#ifdef CONFIG_HOTPLUG_CPU
-static void lru_drain_cache(unsigned int cpu)
-{
- struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
-
- /* CPU is dead, so no locking needed. */
- if (pagevec_count(pvec))
- __pagevec_lru_add(pvec);
- pvec = &per_cpu(lru_add_active_pvecs, cpu);
- if (pagevec_count(pvec))
- __pagevec_lru_add_active(pvec);
-}
/* Drop the CPU's cached committed space back into the central pool. */
static int cpu_swap_callback(struct notifier_block *nfb,
@@ -435,7 +430,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
if (action == CPU_DEAD) {
atomic_add(*committed, &vm_committed_space);
*committed = 0;
- lru_drain_cache((long)hcpu);
+ __lru_add_drain((long)hcpu);
}
return NOTIFY_OK;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0df9a57b1de8..fc2aecb70a95 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
#include <asm/pgtable.h>
@@ -272,12 +273,11 @@ void free_page_and_swap_cache(struct page *page)
*/
void free_pages_and_swap_cache(struct page **pages, int nr)
{
- int chunk = 16;
struct page **pagep = pages;
lru_add_drain();
while (nr) {
- int todo = min(chunk, nr);
+ int todo = min(nr, PAGEVEC_SIZE);
int i;
for (i = 0; i < todo; i++)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index edafeace301f..6da4b28b896b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -211,6 +211,26 @@ noswap:
return (swp_entry_t) {0};
}
+swp_entry_t get_swap_page_of_type(int type)
+{
+ struct swap_info_struct *si;
+ pgoff_t offset;
+
+ spin_lock(&swap_lock);
+ si = swap_info + type;
+ if (si->flags & SWP_WRITEOK) {
+ nr_swap_pages--;
+ offset = scan_swap_map(si);
+ if (offset) {
+ spin_unlock(&swap_lock);
+ return swp_entry(type, offset);
+ }
+ nr_swap_pages++;
+ }
+ spin_unlock(&swap_lock);
+ return (swp_entry_t) {0};
+}
+
static struct swap_info_struct * swap_info_get(swp_entry_t entry)
{
struct swap_info_struct * p;
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index b58abcf44ed6..cdc6d431972b 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -81,13 +81,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
goto close_file;
d_instantiate(dentry, inode);
- inode->i_size = size;
inode->i_nlink = 0; /* It is unlinked */
+
file->f_vfsmnt = mntget(shm_mnt);
file->f_dentry = dentry;
file->f_mapping = inode->i_mapping;
file->f_op = &ramfs_file_operations;
file->f_mode = FMODE_WRITE | FMODE_READ;
+
+ /* notify everyone as to the change of file size */
+ error = do_truncate(dentry, size, file);
+ if (error < 0)
+ goto close_file;
+
return file;
close_file:
@@ -123,3 +129,24 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
{
return 0;
}
+
+int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ file_accessed(file);
+#ifndef CONFIG_MMU
+ return ramfs_nommu_mmap(file, vma);
+#else
+ return 0;
+#endif
+}
+
+#ifndef CONFIG_MMU
+unsigned long shmem_get_unmapped_area(struct file *file,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long pgoff,
+ unsigned long flags)
+{
+ return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
+}
+#endif
diff --git a/mm/truncate.c b/mm/truncate.c
index 9173ab500604..7dee32745901 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -82,12 +82,15 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
}
/**
- * truncate_inode_pages - truncate *all* the pages from an offset
+ * truncate_inode_pages - truncate range of pages specified by start and
+ * end byte offsets
* @mapping: mapping to truncate
* @lstart: offset from which to truncate
+ * @lend: offset to which to truncate
*
- * Truncate the page cache at a set offset, removing the pages that are beyond
- * that offset (and zeroing out partial pages).
+ * Truncate the page cache, removing the pages that are between
+ * specified offsets (and zeroing out partial page
+ * (if lstart is not page aligned)).
*
* Truncate takes two passes - the first pass is nonblocking. It will not
* block on page locks and it will not block on writeback. The second pass
@@ -101,12 +104,12 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
* We pass down the cache-hot hint to the page freeing code. Even if the
* mapping is large, it is probably the case that the final pages are the most
* recently touched, and freeing happens in ascending file offset order.
- *
- * Called under (and serialised by) inode->i_sem.
*/
-void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+void truncate_inode_pages_range(struct address_space *mapping,
+ loff_t lstart, loff_t lend)
{
const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+ pgoff_t end;
const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
struct pagevec pvec;
pgoff_t next;
@@ -115,13 +118,22 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
if (mapping->nrpages == 0)
return;
+ BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
+ end = (lend >> PAGE_CACHE_SHIFT);
+
pagevec_init(&pvec, 0);
next = start;
- while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ while (next <= end &&
+ pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
pgoff_t page_index = page->index;
+ if (page_index > end) {
+ next = page_index;
+ break;
+ }
+
if (page_index > next)
next = page_index;
next++;
@@ -157,9 +169,15 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
next = start;
continue;
}
+ if (pvec.pages[0]->index > end) {
+ pagevec_release(&pvec);
+ break;
+ }
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
+ if (page->index > end)
+ break;
lock_page(page);
wait_on_page_writeback(page);
if (page->index > next)
@@ -171,7 +189,19 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
pagevec_release(&pvec);
}
}
+EXPORT_SYMBOL(truncate_inode_pages_range);
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ *
+ * Called under (and serialised by) inode->i_sem.
+ */
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+{
+ truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
+}
EXPORT_SYMBOL(truncate_inode_pages);
/**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b0cd81c32de6..be8235fb1939 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -63,9 +63,6 @@ struct scan_control {
unsigned long nr_mapped; /* From page_state */
- /* How many pages shrink_cache() should reclaim */
- int nr_to_reclaim;
-
/* Ask shrink_caches, or shrink_zone to scan at this priority */
unsigned int priority;
@@ -74,9 +71,6 @@ struct scan_control {
int may_writepage;
- /* Can pages be swapped as part of reclaim? */
- int may_swap;
-
/* This context's SWAP_CLUSTER_MAX. If freeing memory for
* suspend, we effectively ignore SWAP_CLUSTER_MAX.
* In this context, it doesn't matter that we scan the
@@ -367,7 +361,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
res = mapping->a_ops->writepage(page, &wbc);
if (res < 0)
handle_write_error(mapping, page, res);
- if (res == WRITEPAGE_ACTIVATE) {
+ if (res == AOP_WRITEPAGE_ACTIVATE) {
ClearPageReclaim(page);
return PAGE_ACTIVATE;
}
@@ -430,8 +424,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
* Try to allocate it some swap space here.
*/
if (PageAnon(page) && !PageSwapCache(page)) {
- if (!sc->may_swap)
- goto keep_locked;
if (!add_to_swap(page))
goto activate_locked;
}
@@ -653,17 +645,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
goto done;
max_scan -= nr_scan;
- if (current_is_kswapd())
- mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
- else
- mod_page_state_zone(zone, pgscan_direct, nr_scan);
nr_freed = shrink_list(&page_list, sc);
- if (current_is_kswapd())
- mod_page_state(kswapd_steal, nr_freed);
- mod_page_state_zone(zone, pgsteal, nr_freed);
- sc->nr_to_reclaim -= nr_freed;
- spin_lock_irq(&zone->lru_lock);
+ local_irq_disable();
+ if (current_is_kswapd()) {
+ __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
+ __mod_page_state(kswapd_steal, nr_freed);
+ } else
+ __mod_page_state_zone(zone, pgscan_direct, nr_scan);
+ __mod_page_state_zone(zone, pgsteal, nr_freed);
+
+ spin_lock(&zone->lru_lock);
/*
* Put back any unfreeable pages.
*/
@@ -825,11 +817,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
}
}
zone->nr_active += pgmoved;
- spin_unlock_irq(&zone->lru_lock);
- pagevec_release(&pvec);
+ spin_unlock(&zone->lru_lock);
+
+ __mod_page_state_zone(zone, pgrefill, pgscanned);
+ __mod_page_state(pgdeactivate, pgdeactivate);
+ local_irq_enable();
- mod_page_state_zone(zone, pgrefill, pgscanned);
- mod_page_state(pgdeactivate, pgdeactivate);
+ pagevec_release(&pvec);
}
/*
@@ -861,8 +855,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
else
nr_inactive = 0;
- sc->nr_to_reclaim = sc->swap_cluster_max;
-
while (nr_active || nr_inactive) {
if (nr_active) {
sc->nr_to_scan = min(nr_active,
@@ -876,8 +868,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
(unsigned long)sc->swap_cluster_max);
nr_inactive -= sc->nr_to_scan;
shrink_cache(zone, sc);
- if (sc->nr_to_reclaim <= 0)
- break;
}
}
@@ -910,7 +900,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
for (i = 0; zones[i] != NULL; i++) {
struct zone *zone = zones[i];
- if (zone->present_pages == 0)
+ if (!populated_zone(zone))
continue;
if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
@@ -952,7 +942,6 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
sc.gfp_mask = gfp_mask;
sc.may_writepage = 0;
- sc.may_swap = 1;
inc_page_state(allocstall);
@@ -1055,7 +1044,6 @@ loop_again:
total_reclaimed = 0;
sc.gfp_mask = GFP_KERNEL;
sc.may_writepage = 0;
- sc.may_swap = 1;
sc.nr_mapped = read_page_state(nr_mapped);
inc_page_state(pageoutrun);
@@ -1084,7 +1072,7 @@ loop_again:
for (i = pgdat->nr_zones - 1; i >= 0; i--) {
struct zone *zone = pgdat->node_zones + i;
- if (zone->present_pages == 0)
+ if (!populated_zone(zone))
continue;
if (zone->all_unreclaimable &&
@@ -1121,7 +1109,7 @@ scan:
struct zone *zone = pgdat->node_zones + i;
int nr_slab;
- if (zone->present_pages == 0)
+ if (!populated_zone(zone))
continue;
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
@@ -1273,7 +1261,7 @@ void wakeup_kswapd(struct zone *zone, int order)
{
pg_data_t *pgdat;
- if (zone->present_pages == 0)
+ if (!populated_zone(zone))
return;
pgdat = zone->zone_pgdat;
@@ -1353,76 +1341,3 @@ static int __init kswapd_init(void)
}
module_init(kswapd_init)
-
-
-/*
- * Try to free up some pages from this zone through reclaim.
- */
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
-{
- struct scan_control sc;
- int nr_pages = 1 << order;
- int total_reclaimed = 0;
-
- /* The reclaim may sleep, so don't do it if sleep isn't allowed */
- if (!(gfp_mask & __GFP_WAIT))
- return 0;
- if (zone->all_unreclaimable)
- return 0;
-
- sc.gfp_mask = gfp_mask;
- sc.may_writepage = 0;
- sc.may_swap = 0;
- sc.nr_mapped = read_page_state(nr_mapped);
- sc.nr_scanned = 0;
- sc.nr_reclaimed = 0;
- /* scan at the highest priority */
- sc.priority = 0;
- disable_swap_token();
-
- if (nr_pages > SWAP_CLUSTER_MAX)
- sc.swap_cluster_max = nr_pages;
- else
- sc.swap_cluster_max = SWAP_CLUSTER_MAX;
-
- /* Don't reclaim the zone if there are other reclaimers active */
- if (atomic_read(&zone->reclaim_in_progress) > 0)
- goto out;
-
- shrink_zone(zone, &sc);
- total_reclaimed = sc.nr_reclaimed;
-
- out:
- return total_reclaimed;
-}
-
-asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
- unsigned int state)
-{
- struct zone *z;
- int i;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
-
- if (node >= MAX_NUMNODES || !node_online(node))
- return -EINVAL;
-
- /* This will break if we ever add more zones */
- if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
- return -EINVAL;
-
- for (i = 0; i < MAX_NR_ZONES; i++) {
- if (!(zone & 1<<i))
- continue;
-
- z = &NODE_DATA(node)->node_zones[i];
-
- if (state)
- z->reclaim_pages = 1;
- else
- z->reclaim_pages = 0;
- }
-
- return 0;
-}