From 511030bcd24119fa3759ef3f914d354e107ef839 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Tue, 28 Feb 2006 16:58:57 -0800
Subject: [PATCH] Fix sys_migrate_pages: Move all pages when invoked from root

Currently sys_migrate_pages only moves pages belonging to a process.
This is okay when invoked from a regular user.  But if invoked from root
it should move all pages as documented in the migrate_pages manpage.

Signed-off-by: Christoph Lameter
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 67af4cea1e23..5643cfed6b0f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -954,7 +954,8 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 		goto out;
 	}
 
-	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
+	err = do_migrate_pages(mm, &old, &new,
+		capable(CAP_SYS_ADMIN) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 out:
 	mmput(mm);
 	return err;
--
cgit v1.2.3
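For context, a minimal userspace sketch of driving the migrate_pages() system call whose semantics this patch changes.  It is illustrative only: the node numbers, the single-word node masks and the maxnode value are assumptions, it presumes a kernel/libc that exposes SYS_migrate_pages, and error handling is reduced to perror().

/* Ask the kernel to move a process's pages from node 0 to node 1. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	pid_t pid = (argc > 1) ? (pid_t)atoi(argv[1]) : getpid();
	unsigned long old_nodes = 1UL << 0;	/* source: node 0 (assumed) */
	unsigned long new_nodes = 1UL << 1;	/* target: node 1 (assumed) */
	unsigned long maxnode = 8 * sizeof(unsigned long);

	/*
	 * Without privilege only pages belonging to the target process are
	 * moved; with the capability checked in the patch above the kernel
	 * uses MPOL_MF_MOVE_ALL and moves shared pages as well.
	 */
	long ret = syscall(SYS_migrate_pages, pid, maxnode,
			   &old_nodes, &new_nodes);
	if (ret < 0)
		perror("migrate_pages");
	else
		printf("pages that could not be moved: %ld\n", ret);
	return 0;
}

libnuma wraps the same call (numa_migrate_pages()), which is usually more convenient than building the node masks by hand.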
From e8788c0cce63e0cc8689a123d1ce0af1e28cd583 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Tue, 28 Feb 2006 16:59:16 -0800
Subject: [PATCH] remove_from_swap: fix locking

remove_from_swap() currently attempts to use page_lock_anon_vma to
obtain an anon_vma lock.  That is not working since the page may have
been remapped via swap ptes in order to move the page.

However, do_migrate_pages() obtains the mmap_sem lock and therefore
there is a guarantee that the anonymous vma will not vanish from under
us.  There is therefore no need to use page_lock_anon_vma.

Signed-off-by: Christoph Lameter
Acked-by: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/rmap.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/rmap.c b/mm/rmap.c
index df2c41c2a9a2..d8ce5ff61454 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -212,25 +212,33 @@ out:
  * through real pte's pointing to valid pages and then releasing
  * the page from the swap cache.
  *
- * Must hold page lock on page.
+ * Must hold page lock on page and mmap_sem of one vma that contains
+ * the page.
  */
 void remove_from_swap(struct page *page)
 {
 	struct anon_vma *anon_vma;
 	struct vm_area_struct *vma;
+	unsigned long mapping;
 
-	if (!PageAnon(page) || !PageSwapCache(page))
+	if (!PageSwapCache(page))
 		return;
 
-	anon_vma = page_lock_anon_vma(page);
-	if (!anon_vma)
+	mapping = (unsigned long)page->mapping;
+
+	if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
 		return;
 
+	/*
+	 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
+	 */
+	anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
+
 	spin_lock(&anon_vma->lock);
+
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
 		remove_vma_swap(vma, page);
 	spin_unlock(&anon_vma->lock);
-
 	delete_from_swap_cache(page);
 }
 EXPORT_SYMBOL(remove_from_swap);
--
cgit v1.2.3

From f61388822a6040ff462c5f7260daa0f1017f2db0 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Tue, 28 Feb 2006 16:59:18 -0800
Subject: [PATCH] nommu: implement vmalloc_node()

Fix oprofile linkage.  Pointed out by "Luke Yang".

Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/nommu.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/nommu.c b/mm/nommu.c
index 99d21020ec9d..4951f4786f28 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -53,7 +53,6 @@ DECLARE_RWSEM(nommu_vma_sem);
 struct vm_operations_struct generic_file_vm_ops = {
 };
 
-EXPORT_SYMBOL(vmalloc);
 EXPORT_SYMBOL(vfree);
 EXPORT_SYMBOL(vmalloc_to_page);
 EXPORT_SYMBOL(vmalloc_32);
@@ -205,6 +204,13 @@ void *vmalloc(unsigned long size)
 {
 	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
 }
+EXPORT_SYMBOL(vmalloc);
+
+void *vmalloc_node(unsigned long size, int node)
+{
+	return vmalloc(size);
+}
+EXPORT_SYMBOL(vmalloc_node);
 
 /*
  * vmalloc_32 - allocate virtually continguos memory (32bit addressable)
--
cgit v1.2.3

From d6713e046336ffa98060418c4d2c65243639e107 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Tue, 28 Feb 2006 16:59:19 -0800
Subject: [PATCH] out_of_memory(): use of uninitialised

Under some circumstances `points' can get printed before it's
initialised.  Spotted by Carlos Martin.

Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/oom_kill.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8123fad5a485..c86c737d2433 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -302,7 +302,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 {
 	struct mm_struct *mm = NULL;
 	task_t *p;
-	unsigned long points;
+	unsigned long points = 0;
 
 	if (printk_ratelimit()) {
 		printk("oom-killer: gfp_mask=0x%x, order=%d\n",
--
cgit v1.2.3

From 140ffcec4def3ee3af7565b2cf1d3b2580f7e180 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Thu, 2 Mar 2006 02:54:28 -0800
Subject: [PATCH] out_of_memory() locking fix

I seem to have lost this read_unlock().

While we're there, let's turn that interruptible sleep into an
uninterruptible one, so we don't get a busywait if signal_pending().
(Again.  We seem to have a habit of doing this.)

Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/oom_kill.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c86c737d2433..78747afad6b0 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -355,6 +355,7 @@ retry:
 	}
 
 out:
+	read_unlock(&tasklist_lock);
 	cpuset_unlock();
 	if (mm)
 		mmput(mm);
@@ -364,5 +365,5 @@ out:
 	 * retry to allocate memory unless "p" is current
 	 */
 	if (!test_thread_flag(TIF_MEMDIE))
-		schedule_timeout_interruptible(1);
+		schedule_timeout_uninterruptible(1);
 }
--
cgit v1.2.3

From a57ebfdb2cf9fa60dfa2f403f70ef6c432ca2a62 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Thu, 2 Mar 2006 02:54:37 -0800
Subject: [PATCH] numa_maps: Fix potential crash on non IA64 platforms

numa_maps should not scan over huge vmas, in order not to cause problems
for non-IA64 platforms that may have pte entries pointing to huge pages
in a variety of ways in their page tables.  Add a simple check to ignore
vmas containing huge pages.

Signed-off-by: Christoph Lameter
Cc: Hugh Dickins
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5643cfed6b0f..1a210088ad80 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1793,7 +1793,8 @@ int show_numa_map(struct seq_file *m, void *v)
 	if (!md)
 		return 0;
 
-	check_pgd_range(vma, vma->vm_start, vma->vm_end,
+	if (!is_vm_hugetlb_page(vma))
+		check_pgd_range(vma, vma->vm_start, vma->vm_end,
 		    &node_online_map, MPOL_MF_STATS, md);
 
 	if (md->pages) {
--
cgit v1.2.3

From 264132bc62fe071d0ff378c1103bae9d33212f10 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 6 Mar 2006 12:10:07 -0800
Subject: Fix "check_slabp" printout size calculation

We want to use the "struct slab" size, not the size of the pointer to
same.  As it is, we'd not print out the last n entry pointers in the
slab (where n is ~10, depending on whether it's a 32-bit or 64-bit
kernel).

Gaah, that slab code was written by somebody who likes unreadable crud.

Signed-off-by: Linus Torvalds
---
 mm/slab.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index add05d808a4a..2b0b1519bb74 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2554,7 +2554,7 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
 		       "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
 		       cachep->name, cachep->num, slabp, slabp->inuse);
 	for (i = 0;
-	     i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t);
+	     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
 	     i++) {
 		if ((i % 16) == 0)
 			printk("\n%03x:", i);
--
cgit v1.2.3

From 9888e6fa7b68d9c8cc2c162a90979825ab45150a Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 6 Mar 2006 17:44:43 -0800
Subject: slab: clarify and fix calculate_slab_order()

If we triggered the 'offslab_limit' test, we would return with
cachep->gfporder incremented once too many times.

This clarifies the logic somewhat, and fixes that bug.

Signed-off-by: Linus Torvalds
---
 mm/slab.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 2b0b1519bb74..f2e92dc1c9ce 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1628,36 +1628,36 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
 			size_t size, size_t align, unsigned long flags)
 {
 	size_t left_over = 0;
+	int gfporder;
 
-	for (;; cachep->gfporder++) {
+	for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) {
 		unsigned int num;
 		size_t remainder;
 
-		if (cachep->gfporder > MAX_GFP_ORDER) {
-			cachep->num = 0;
-			break;
-		}
-
-		cache_estimate(cachep->gfporder, size, align, flags,
-			       &remainder, &num);
+		cache_estimate(gfporder, size, align, flags, &remainder, &num);
 		if (!num)
 			continue;
+
 		/* More than offslab_limit objects will cause problems */
-		if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
+		if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit)
			break;
 
+		/* Found something acceptable - save it away */
 		cachep->num = num;
+		cachep->gfporder = gfporder;
 		left_over = remainder;
 
 		/*
 		 * Large number of objects is good, but very large slabs are
 		 * currently bad for the gfp()s.
 		 */
-		if (cachep->gfporder >= slab_break_gfp_order)
+		if (gfporder >= slab_break_gfp_order)
 			break;
 
-		if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
-			/* Acceptable internal fragmentation */
+		/*
+		 * Acceptable internal fragmentation?
+		 */
+		if ((left_over * 8) <= (PAGE_SIZE << gfporder))
 			break;
 	}
 	return left_over;
--
cgit v1.2.3

From 397874dfe9862b494e1fdcd2baef4ac432d224c8 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Mon, 6 Mar 2006 15:42:53 -0800
Subject: [PATCH] numa_maps update

Change the format of numa_maps to be more compact and contain additional
information that is useful for managing and troubleshooting memory on a
NUMA system.  Numa_maps can now also support huge pages.

Fixes:

1. More compact format.  Only display fields if they contain additional
   information.

2. Always display information for all vmas.  The old numa_maps did not
   display vma with no mapped entries.  This was a bit confusing because
   page migration removes ptes for file backed vmas.  After page
   migration a part of the vmas vanished.

3. Rename maxref to maxmap.  This is the maximum mapcount of all the
   pages in a vma and may be used as an indicator as to how many
   processes may be using a certain vma.

4. Include the ability to scan over huge page vmas.

New items shown:

dirty		Number of pages in a vma that have either the dirty bit
		set in the page_struct or in the pte.

file=		The file backing the pages if any

stack		Stack area

heap		Heap area

huge		Huge page area.  The number of pages shown is the number
		of huge pages, not the regular sized pages.

swapcache	Number of pages with swap references.  Must be >0 in order
		to be shown.

active		Number of active pages.  Only displayed if different from
		the number of pages mapped.

writeback	Number of pages under writeback.  Only displayed if >0.

Sample output of a process using huge pages:

00000000 default
2000000000000000 default file=/lib/ld-2.3.90.so mapped=13 mapmax=30 N0=13
2000000000044000 default file=/lib/ld-2.3.90.so anon=2 dirty=2 swapcache=2 N2=2
2000000000064000 default file=/lib/librt-2.3.90.so mapped=2 active=1 N1=1 N3=1
2000000000074000 default file=/lib/librt-2.3.90.so
2000000000080000 default file=/lib/librt-2.3.90.so anon=1 swapcache=1 N2=1
2000000000084000 default
2000000000088000 default file=/lib/libc-2.3.90.so mapped=52 mapmax=32 active=48 N0=52
20000000002bc000 default file=/lib/libc-2.3.90.so
20000000002c8000 default file=/lib/libc-2.3.90.so anon=3 dirty=2 swapcache=3 active=2 N1=1 N2=2
20000000002d4000 default anon=1 swapcache=1 N1=1
20000000002d8000 default file=/lib/libpthread-2.3.90.so mapped=8 mapmax=3 active=7 N2=2 N3=6
20000000002fc000 default file=/lib/libpthread-2.3.90.so
2000000000308000 default file=/lib/libpthread-2.3.90.so anon=1 dirty=1 swapcache=1 N1=1
200000000030c000 default anon=1 dirty=1 swapcache=1 N1=1
2000000000320000 default anon=1 dirty=1 N1=1
200000000071c000 default
2000000000720000 default anon=2 dirty=2 swapcache=1 N1=1 N2=1
2000000000f1c000 default
2000000000f20000 default anon=2 dirty=2 swapcache=1 active=1 N2=1 N3=1
200000000171c000 default
2000000001720000 default anon=1 dirty=1 swapcache=1 N1=1
2000000001b20000 default
2000000001b38000 default file=/lib/libgcc_s.so.1 mapped=2 N1=2
2000000001b48000 default file=/lib/libgcc_s.so.1
2000000001b54000 default file=/lib/libgcc_s.so.1 anon=1 dirty=1 active=0 N1=1
2000000001b58000 default file=/lib/libunwind.so.7.0.0 mapped=2 active=1 N1=2
2000000001b74000 default file=/lib/libunwind.so.7.0.0
2000000001b80000 default file=/lib/libunwind.so.7.0.0
2000000001b84000 default
4000000000000000 default file=/media/huge/test9 mapped=1 N1=1
6000000000000000 default file=/media/huge/test9 anon=1 dirty=1 active=0 N1=1
6000000000004000 default heap
607fffff7fffc000 default anon=1 dirty=1 swapcache=1 N2=1
607fffffff06c000 default stack anon=1 dirty=1 active=0 N1=1
8000000060000000 default file=/mnt/huge/test0 huge dirty=3 N1=3
8000000090000000 default file=/mnt/huge/test1 huge dirty=3 N0=1 N2=2
80000000c0000000 default file=/mnt/huge/test2 huge dirty=3 N1=1 N3=2

Signed-off-by: Christoph Lameter
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 120 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 95 insertions(+), 25 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1a210088ad80..d80fa7d8f720 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -197,7 +197,7 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 	return policy;
 }
 
-static void gather_stats(struct page *, void *);
+static void gather_stats(struct page *, void *, int pte_dirty);
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags);
 
@@ -239,7 +239,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			continue;
 
 		if (flags & MPOL_MF_STATS)
-			gather_stats(page, private);
+			gather_stats(page, private, pte_dirty(*pte));
 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 			migrate_page_add(page, private, flags);
 		else
@@ -1753,67 +1753,137 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 struct numa_maps {
 	unsigned long pages;
 	unsigned long anon;
-	unsigned long mapped;
+	unsigned long active;
+	unsigned long writeback;
 	unsigned long mapcount_max;
+	unsigned long dirty;
+	unsigned long swapcache;
 	unsigned long node[MAX_NUMNODES];
 };
 
-static void gather_stats(struct page *page, void *private)
+static void gather_stats(struct page *page, void *private, int pte_dirty)
 {
 	struct numa_maps *md = private;
 	int count = page_mapcount(page);
 
-	if (count)
-		md->mapped++;
+	md->pages++;
+	if (pte_dirty || PageDirty(page))
+		md->dirty++;
 
-	if (count > md->mapcount_max)
-		md->mapcount_max = count;
+	if (PageSwapCache(page))
+		md->swapcache++;
 
-	md->pages++;
+	if (PageActive(page))
+		md->active++;
+
+	if (PageWriteback(page))
+		md->writeback++;
 
 	if (PageAnon(page))
 		md->anon++;
 
+	if (count > md->mapcount_max)
+		md->mapcount_max = count;
+
 	md->node[page_to_nid(page)]++;
 	cond_resched();
 }
 
+static void check_huge_range(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct numa_maps *md)
+{
+	unsigned long addr;
+	struct page *page;
+
+	for (addr = start; addr < end; addr += HPAGE_SIZE) {
+		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
+		pte_t pte;
+
+		if (!ptep)
+			continue;
+
+		pte = *ptep;
+		if (pte_none(pte))
+			continue;
+
+		page = pte_page(pte);
+		if (!page)
+			continue;
+
+		gather_stats(page, md, pte_dirty(*ptep));
+	}
+}
+
 int show_numa_map(struct seq_file *m, void *v)
 {
 	struct task_struct *task = m->private;
 	struct vm_area_struct *vma = v;
 	struct numa_maps *md;
+	struct file *file = vma->vm_file;
+	struct mm_struct *mm = vma->vm_mm;
 	int n;
 	char buffer[50];
 
-	if (!vma->vm_mm)
+	if (!mm)
 		return 0;
 
 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
 	if (!md)
 		return 0;
 
-	if (!is_vm_hugetlb_page(vma))
+	mpol_to_str(buffer, sizeof(buffer),
+			    get_vma_policy(task, vma, vma->vm_start));
+
+	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
+
+	if (file) {
+		seq_printf(m, " file=");
+		seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+		seq_printf(m, " heap");
+	} else if (vma->vm_start <= mm->start_stack &&
+			vma->vm_end >= mm->start_stack) {
+		seq_printf(m, " stack");
+	}
+
+	if (is_vm_hugetlb_page(vma)) {
+		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
+		seq_printf(m, " huge");
+	} else {
 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
-		    &node_online_map, MPOL_MF_STATS, md);
+			&node_online_map, MPOL_MF_STATS, md);
+	}
+
+	if (!md->pages)
+		goto out;
 
-	if (md->pages) {
-		mpol_to_str(buffer, sizeof(buffer),
-			    get_vma_policy(task, vma, vma->vm_start));
+	if (md->anon)
+		seq_printf(m," anon=%lu",md->anon);
 
-		seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
-			vma->vm_start, buffer, md->pages,
-			md->mapped, md->mapcount_max);
+	if (md->dirty)
+		seq_printf(m," dirty=%lu",md->dirty);
 
-		if (md->anon)
-			seq_printf(m," anon=%lu",md->anon);
+	if (md->pages != md->anon && md->pages != md->dirty)
+		seq_printf(m, " mapped=%lu", md->pages);
 
-		for_each_online_node(n)
-			if (md->node[n])
-				seq_printf(m, " N%d=%lu", n, md->node[n]);
+	if (md->mapcount_max > 1)
+		seq_printf(m, " mapmax=%lu", md->mapcount_max);
 
-		seq_putc(m, '\n');
-	}
+	if (md->swapcache)
+		seq_printf(m," swapcache=%lu", md->swapcache);
+
+	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+		seq_printf(m," active=%lu", md->active);
+
+	if (md->writeback)
+		seq_printf(m," writeback=%lu", md->writeback);
+
+	for_each_online_node(n)
+		if (md->node[n])
+			seq_printf(m, " N%d=%lu", n, md->node[n]);
+out:
+	seq_putc(m, '\n');
 	kfree(md);
 
 	if (m->count < m->size)
--
cgit v1.2.3
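To inspect the new output on a live system, a trivial reader is enough.  A minimal sketch follows; the only assumption is the usual /proc/<pid>/numa_maps path, with "self" used when no pid argument is given.

#include <stdio.h>

int main(int argc, char **argv)
{
	char path[64];
	char line[1024];
	FILE *f;

	/* One vma per line, fields as documented in the patch above. */
	snprintf(path, sizeof(path), "/proc/%s/numa_maps",
		 argc > 1 ? argv[1] : "self");
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}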
From f78bb8ad482267b92c122f0e37a7dce69c880247 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Wed, 8 Mar 2006 10:33:05 -0800
Subject: slab: fix calculate_slab_order() for SLAB_RECLAIM_ACCOUNT

Instead of having a hard-to-read and confusing conditional in the
caller, just make the slab order calculation handle this special case,
since it's simple and obvious there.

Signed-off-by: Linus Torvalds
---
 mm/slab.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index f2e92dc1c9ce..6ad6bd5a0b3e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1647,6 +1647,14 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
 		cachep->gfporder = gfporder;
 		left_over = remainder;
 
+		/*
+		 * A VFS-reclaimable slab tends to have most allocations
+		 * as GFP_NOFS and we really don't want to have to be allocating
+		 * higher-order pages when we are unable to shrink dcache.
+		 */
+		if (flags & SLAB_RECLAIM_ACCOUNT)
+			break;
+
 		/*
 		 * Large number of objects is good, but very large slabs are
 		 * currently bad for the gfp()s.
@@ -1869,17 +1877,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 
 	size = ALIGN(size, align);
 
-	if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
-		/*
-		 * A VFS-reclaimable slab tends to have most allocations
-		 * as GFP_NOFS and we really don't want to have to be allocating
-		 * higher-order pages when we are unable to shrink dcache.
-		 */
-		cachep->gfporder = 0;
-		cache_estimate(cachep->gfporder, size, align, flags,
-			       &left_over, &cachep->num);
-	} else
-		left_over = calculate_slab_order(cachep, size, align, flags);
+	left_over = calculate_slab_order(cachep, size, align, flags);
 
 	if (!cachep->num) {
 		printk("kmem_cache_create: couldn't create cache %s.\n", name);
--
cgit v1.2.3

From 7f709ed0e3ccd3e88e0632b69f00174e83f8d98b Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Tue, 7 Mar 2006 21:55:22 -0800
Subject: [PATCH] numa_maps-update fix

Fix the mm/mempolicy.c build for !CONFIG_HUGETLB_PAGE.

Cc: Christoph Lameter
Cc: Martin Bligh
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d80fa7d8f720..954981b14303 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1789,6 +1789,7 @@ static void gather_stats(struct page *page, void *private, int pte_dirty)
 	cond_resched();
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
 static void check_huge_range(struct vm_area_struct *vma,
 		unsigned long start, unsigned long end,
 		struct numa_maps *md)
@@ -1814,6 +1815,13 @@ static void check_huge_range(struct vm_area_struct *vma,
 		gather_stats(page, md, pte_dirty(*ptep));
 	}
 }
+#else
+static inline void check_huge_range(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct numa_maps *md)
+{
+}
+#endif
 
 int show_numa_map(struct seq_file *m, void *v)
 {
--
cgit v1.2.3

From e2bab3d92486fb781f4d06f56339264ed1492392 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Tue, 7 Mar 2006 21:55:31 -0800
Subject: [PATCH] percpu_counter_sum()

Implement percpu_counter_sum().  This is a more accurate but slower
version of percpu_counter_read_positive().

We need this for Alex's speedup-ext3_statfs patch and for the nr_file
accounting fix.  Otherwise these things would be too inaccurate on
large CPU counts.

Cc: Ravikiran G Thirumalai
Cc: Alex Tomas
Cc: "David S. Miller"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/percpu_counter.h |  6 ++++++
 mm/swap.c                      | 25 +++++++++++++++++++++++--
 2 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index bd6708e2c027..682525511c9e 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -39,6 +39,7 @@ static inline void percpu_counter_destroy(struct percpu_counter *fbc)
 }
 
 void percpu_counter_mod(struct percpu_counter *fbc, long amount);
+long percpu_counter_sum(struct percpu_counter *fbc);
 
 static inline long percpu_counter_read(struct percpu_counter *fbc)
 {
@@ -92,6 +93,11 @@ static inline long percpu_counter_read_positive(struct percpu_counter *fbc)
 	return fbc->count;
 }
 
+static inline long percpu_counter_sum(struct percpu_counter *fbc)
+{
+	return percpu_counter_read_positive(fbc);
+}
+
 #endif	/* CONFIG_SMP */
 
 static inline void percpu_counter_inc(struct percpu_counter *fbc)
diff --git a/mm/swap.c b/mm/swap.c
index cce3dda59c59..e9ec06d845e8 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -489,13 +489,34 @@ void percpu_counter_mod(struct percpu_counter *fbc, long amount)
 	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
 		spin_lock(&fbc->lock);
 		fbc->count += count;
+		*pcount = 0;
 		spin_unlock(&fbc->lock);
-		count = 0;
+	} else {
+		*pcount = count;
 	}
-	*pcount = count;
 	put_cpu();
 }
 EXPORT_SYMBOL(percpu_counter_mod);
+
+/*
+ * Add up all the per-cpu counts, return the result.  This is a more accurate
+ * but much slower version of percpu_counter_read_positive()
+ */
+long percpu_counter_sum(struct percpu_counter *fbc)
+{
+	long ret;
+	int cpu;
+
+	spin_lock(&fbc->lock);
+	ret = fbc->count;
+	for_each_cpu(cpu) {
+		long *pcount = per_cpu_ptr(fbc->counters, cpu);
+		ret += *pcount;
+	}
+	spin_unlock(&fbc->lock);
+	return ret < 0 ? 0 : ret;
+}
+EXPORT_SYMBOL(percpu_counter_sum);
 #endif
 
 /*
--
cgit v1.2.3
From 07ed76b2a085a31f427c2a912a562627947dc7de Mon Sep 17 00:00:00 2001
From: Jack Steiner
Date: Tue, 7 Mar 2006 21:55:46 -0800
Subject: [PATCH] slab: allocate larger cache_cache if order 0 fails

kmem_cache_init() incorrectly assumes that the cache_cache object will
fit in an order 0 allocation.  On very large systems, this is not true.
Change the code to try larger order allocations if order 0 fails.

Signed-off-by: Jack Steiner
Cc: Manfred Spraul
Cc: Pekka Enberg
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/slab.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 6ad6bd5a0b3e..61800b88e241 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1124,6 +1124,7 @@ void __init kmem_cache_init(void)
 	struct cache_sizes *sizes;
 	struct cache_names *names;
 	int i;
+	int order;
 
 	for (i = 0; i < NUM_INIT_LISTS; i++) {
 		kmem_list3_init(&initkmem_list3[i]);
@@ -1167,11 +1168,15 @@ void __init kmem_cache_init(void)
 	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
 					cache_line_size());
 
-	cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0,
-		       &left_over, &cache_cache.num);
+	for (order = 0; order < MAX_ORDER; order++) {
+		cache_estimate(order, cache_cache.buffer_size,
+			cache_line_size(), 0, &left_over, &cache_cache.num);
+		if (cache_cache.num)
+			break;
+	}
 	if (!cache_cache.num)
 		BUG();
-
+	cache_cache.gfporder = order;
 	cache_cache.colour = left_over / cache_cache.colour_off;
 	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
 				      sizeof(struct slab), cache_line_size());
--
cgit v1.2.3

From 85a6cd03a97f04ffff7bfedfa3172894ca9a617b Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Thu, 9 Mar 2006 17:33:34 -0800
Subject: [PATCH] page_add_file_rmap(): remove BUG_ON()s

Remove two early-development BUG_ONs from page_add_file_rmap.

The pfn_valid test (originally useful for checking that nobody passed
an artificial struct page) comes too late, since we already have the
struct page.

The PageAnon test (useful when anon was first distinguished from file
rmap) prevents ->nopage implementations from reusing ->mapping, which
would otherwise be available.

Signed-off-by: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/rmap.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'mm')

diff --git a/mm/rmap.c b/mm/rmap.c
index d8ce5ff61454..67f0e20b101f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -537,9 +537,6 @@ void page_add_new_anon_rmap(struct page *page,
  */
 void page_add_file_rmap(struct page *page)
 {
-	BUG_ON(PageAnon(page));
-	BUG_ON(!pfn_valid(page_to_pfn(page)));
-
 	if (atomic_inc_and_test(&page->_mapcount))
 		__inc_page_state(nr_mapped);
 }
--
cgit v1.2.3

From a6bf527091b1dd40f1b6a496812ce7520621c282 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Thu, 9 Mar 2006 17:33:47 -0800
Subject: [PATCH] vmscan: no zone_reclaim if PF_MEMALLOC is set

If the process has already set PF_MEMALLOC and is already using
current->reclaim_state then do not try to reclaim memory from the zone.
This is set by kswapd and/or synchronous global reclaim, which will not
take it lightly if we zap the reclaim_state.

Signed-off-by: Christoph Lameter
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/vmscan.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index b0af7593d01e..7ccf763bb30b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1883,7 +1883,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 
 	if (!(gfp_mask & __GFP_WAIT) ||
 		zone->all_unreclaimable ||
-		atomic_read(&zone->reclaim_in_progress) > 0)
+		atomic_read(&zone->reclaim_in_progress) > 0 ||
+		(p->flags & PF_MEMALLOC))
 			return 0;
 
 	node_id = zone->zone_pgdat->node_id;
--
cgit v1.2.3

From f2937be5895dbae23ff66767a2fc17793e63159c Mon Sep 17 00:00:00 2001
From: Yasunori Goto
Date: Thu, 9 Mar 2006 17:33:51 -0800
Subject: [PATCH] memory hotadd: pgdat->node_present_pages fix

When pages are onlined, not only zone->present_pages but also
pgdat->node_present_pages should be refreshed.

This parameter is used to show information at
/sys/device/system/node/nodeX/meminfo via si_meminfo_node().  So it
shows a strange value for MemUsed, which is calculated as
(node_present_pages - all zones' free pages).

Signed-off-by: Yasunori Goto
Cc: Dave Hansen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory_hotplug.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a918f77f02f3..1fe76d963ac2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -130,6 +130,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 		onlined_pages++;
 	}
 	zone->present_pages += onlined_pages;
+	zone->zone_pgdat->node_present_pages += onlined_pages;
 
 	setup_per_zone_pages_min();
 
--
cgit v1.2.3

From 8fce4d8e3b9e3cf47cc8afeb6077e22ab795d989 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Thu, 9 Mar 2006 17:33:54 -0800
Subject: [PATCH] slab: Node rotor for freeing alien caches and remote per cpu pages.

The cache reaper currently tries to free all alien caches and all
remote per cpu pages in each pass of cache_reap.  For machines with a
large number of nodes (such as Altix) this may lead to sporadic delays
of around ~10ms.  Interrupts are disabled while reclaiming, creating
unacceptable delays.

This patch changes that behavior by adding a per cpu reap_node
variable.  Instead of attempting to free all caches, we free only one
alien cache and the per cpu pages from one remote node.  That reduces
the time spent in cache_reap.  However, doing so will lengthen the time
it takes to completely drain all remote per cpu pagesets and all alien
caches.  The time needed will grow with the number of nodes in the
system.  All caches are drained when they overflow their respective
capacity.  So the drawback here is only that a bit of memory may be
wasted for a while longer.

Details:

1. Rename drain_remote_pages to drain_node_pages to allow the
   specification of the node to drain of pcp pages.

2. Add additional functions init_reap_node, next_reap_node for NUMA
   that manage a per cpu reap_node counter.

3. Add a reap_alien function that reaps only from the current
   reap_node.

For us this seems to be a critical issue.  Holdoffs of an average of
~7ms cause some HPC benchmarks to slow down significantly.  F.e. NAS
parallel slows down dramatically.  NAS parallel has a 12-16 seconds
runtime w/o rotor compared to 5.8 secs with the rotor patches.  It gets
down to 5.05 secs with the additional interrupt holdoff reductions.

Signed-off-by: Christoph Lameter
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/gfp.h |  4 ++--
 mm/page_alloc.c     | 17 +++++++-------
 mm/slab.c           | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 72 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 20f9148e38d9..7851e6b520cf 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -157,9 +157,9 @@ extern void FASTCALL(free_cold_page(struct page *page));
 void page_alloc_init(void);
 
 #ifdef CONFIG_NUMA
-void drain_remote_pages(void);
+void drain_node_pages(int node);
 #else
-static inline void drain_remote_pages(void) { };
+static inline void drain_node_pages(int node) { };
 #endif
 
 #endif /* __LINUX_GFP_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 791690d7d3fa..234bd4895d14 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -590,21 +590,20 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 }
 
 #ifdef CONFIG_NUMA
-/* Called from the slab reaper to drain remote pagesets */
-void drain_remote_pages(void)
+/*
+ * Called from the slab reaper to drain pagesets on a particular node that
+ * belong to the currently executing processor.
+ */
+void drain_node_pages(int nodeid)
 {
-	struct zone *zone;
-	int i;
+	int i, z;
 	unsigned long flags;
 
 	local_irq_save(flags);
-	for_each_zone(zone) {
+	for (z = 0; z < MAX_NR_ZONES; z++) {
+		struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
 		struct per_cpu_pageset *pset;
 
-		/* Do not drain local pagesets */
-		if (zone->zone_pgdat->node_id == numa_node_id())
-			continue;
-
 		pset = zone_pcp(zone, smp_processor_id());
 		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 			struct per_cpu_pages *pcp;
diff --git a/mm/slab.c b/mm/slab.c
index 61800b88e241..d0bd7f07ab04 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -789,6 +789,47 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, char *
 	dump_stack();
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * Special reaping functions for NUMA systems called from cache_reap().
+ * These take care of doing round robin flushing of alien caches (containing
+ * objects freed on different nodes from which they were allocated) and the
+ * flushing of remote pcps by calling drain_node_pages.
+ */
+static DEFINE_PER_CPU(unsigned long, reap_node);
+
+static void init_reap_node(int cpu)
+{
+	int node;
+
+	node = next_node(cpu_to_node(cpu), node_online_map);
+	if (node == MAX_NUMNODES)
+		node = 0;
+
+	__get_cpu_var(reap_node) = node;
+}
+
+static void next_reap_node(void)
+{
+	int node = __get_cpu_var(reap_node);
+
+	/*
+	 * Also drain per cpu pages on remote zones
+	 */
+	if (node != numa_node_id())
+		drain_node_pages(node);
+
+	node = next_node(node, node_online_map);
+	if (unlikely(node >= MAX_NUMNODES))
+		node = first_node(node_online_map);
+	__get_cpu_var(reap_node) = node;
+}
+
+#else
+#define init_reap_node(cpu) do { } while (0)
+#define next_reap_node(void) do { } while (0)
+#endif
+
 /*
  * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
  * via the workqueue/eventd.
@@ -806,6 +847,7 @@ static void __devinit start_cpu_timer(int cpu)
 	 * at that time.
 	 */
 	if (keventd_up() && reap_work->func == NULL) {
+		init_reap_node(cpu);
 		INIT_WORK(reap_work, cache_reap, NULL);
 		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
 	}
@@ -884,6 +926,23 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
 	}
 }
 
+/*
+ * Called from cache_reap() to regularly drain alien caches round robin.
+ */
+static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
+{
+	int node = __get_cpu_var(reap_node);
+
+	if (l3->alien) {
+		struct array_cache *ac = l3->alien[node];
+		if (ac && ac->avail) {
+			spin_lock_irq(&ac->lock);
+			__drain_alien_cache(cachep, ac, node);
+			spin_unlock_irq(&ac->lock);
+		}
+	}
+}
+
 static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
 {
 	int i = 0;
@@ -902,6 +961,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **al
 
 #else
 #define drain_alien_cache(cachep, alien) do { } while (0)
+#define reap_alien(cachep, l3) do { } while (0)
 
 static inline struct array_cache **alloc_alien_cache(int node, int limit)
 {
@@ -3497,8 +3557,7 @@ static void cache_reap(void *unused)
 		check_irq_on();
 
 		l3 = searchp->nodelists[numa_node_id()];
-		if (l3->alien)
-			drain_alien_cache(searchp, l3->alien);
+		reap_alien(searchp, l3);
 		spin_lock_irq(&l3->list_lock);
 
 		drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3548,7 +3607,7 @@ static void cache_reap(void *unused)
 	}
 	check_irq_on();
 	mutex_unlock(&cache_chain_mutex);
-	drain_remote_pages();
+	next_reap_node();
 	/* Setup the next iteration */
 	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
 }
--
cgit v1.2.3
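The rotor itself is just round-robin iteration over the online-node mask with wrap-around; each pass does cleanup work for one node and then advances.  A small standalone sketch of that selection logic follows (the node count and the online mask are made-up values, and printf stands in for the actual draining work):

#include <stdio.h>

#define MAX_NODES 8
static const unsigned online_mask = 0x2d;	/* nodes 0, 2, 3, 5 online (example) */

/* Advance to the next online node, wrapping around at MAX_NODES. */
static int next_online(int node)
{
	do {
		node = (node + 1) % MAX_NODES;
	} while (!(online_mask & (1u << node)));
	return node;
}

int main(void)
{
	int reap_node = next_online(0);	/* per-cpu state in the kernel patch */
	int pass;

	for (pass = 0; pass < 8; pass++) {
		printf("pass %d: drain alien cache and pcp pages of node %d\n",
		       pass, reap_node);
		reap_node = next_online(reap_node);	/* rotate for next pass */
	}
	return 0;
}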
From 4983da07f1e2e8dc81cb9d640fbf35b899cdbdf2 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Tue, 14 Mar 2006 19:50:19 -0800
Subject: [PATCH] page migration: fail if page is in a vma flagged VM_LOCKED

page migration currently simply retries a couple of times if
try_to_unmap() fails without inspecting the return code.

However, SWAP_FAIL indicates that the page is in a vma that has the
VM_LOCKED flag set (if ignore_refs == 1).  We can check for that return
code and avoid retrying the migration.

migrate_page_remove_references() now needs to return a reason why the
failure occurred.  So switch migrate_page_remove_references to use
-Exx style error codes.

Signed-off-by: Christoph Lameter
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/buffer.c |  6 ++++--
 mm/vmscan.c | 18 ++++++++++++------
 2 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/fs/buffer.c b/fs/buffer.c
index 62cfd17dc5fe..a9b399402007 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3060,6 +3060,7 @@ int buffer_migrate_page(struct page *newpage, struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 	struct buffer_head *bh, *head;
+	int rc;
 
 	if (!mapping)
 		return -EAGAIN;
@@ -3069,8 +3070,9 @@ int buffer_migrate_page(struct page *newpage, struct page *page)
 
 	head = page_buffers(page);
 
-	if (migrate_page_remove_references(newpage, page, 3))
-		return -EAGAIN;
+	rc = migrate_page_remove_references(newpage, page, 3);
+	if (rc)
+		return rc;
 
 	bh = head;
 	do {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7ccf763bb30b..4fe7e3aa02e2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -700,7 +700,7 @@ int migrate_page_remove_references(struct page *newpage,
 	 * the page.
 	 */
 	if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
-		return 1;
+		return -EAGAIN;
 
 	/*
 	 * Establish swap ptes for anonymous pages or destroy pte
@@ -721,13 +721,15 @@ int migrate_page_remove_references(struct page *newpage,
 	 * If the page was not migrated then the PageSwapCache bit
 	 * is still set and the operation may continue.
 	 */
-	try_to_unmap(page, 1);
+	if (try_to_unmap(page, 1) == SWAP_FAIL)
+		/* A vma has VM_LOCKED set -> Permanent failure */
+		return -EPERM;
 
 	/*
 	 * Give up if we were unable to remove all mappings.
 	 */
 	if (page_mapcount(page))
-		return 1;
+		return -EAGAIN;
 
 	write_lock_irq(&mapping->tree_lock);
 
@@ -738,7 +740,7 @@ int migrate_page_remove_references(struct page *newpage,
 	if (!page_mapping(page) || page_count(page) != nr_refs ||
 	    *radix_pointer != page) {
 		write_unlock_irq(&mapping->tree_lock);
-		return 1;
+		return -EAGAIN;
 	}
 
 	/*
@@ -813,10 +815,14 @@ EXPORT_SYMBOL(migrate_page_copy);
  */
 int migrate_page(struct page *newpage, struct page *page)
 {
+	int rc;
+
 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
 
-	if (migrate_page_remove_references(newpage, page, 2))
-		return -EAGAIN;
+	rc = migrate_page_remove_references(newpage, page, 2);
+
+	if (rc)
+		return rc;
 
 	migrate_page_copy(newpage, page);
 
--
cgit v1.2.3

From 74c002410548c7cb1744b45d17a5fa21da515b63 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Tue, 14 Mar 2006 19:50:21 -0800
Subject: [PATCH] Consistent capabilities associated with MPOL_MOVE_ALL

It seems that setting scheduling policy and priorities is also the kind
of thing that might be performed in apps that also use the NUMA API, so
it would seem consistent to use CAP_SYS_NICE for NUMA also.

So use CAP_SYS_NICE for controlling migration permissions.

Signed-off-by: Christoph Lameter
Cc: Michael Kerrisk
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 954981b14303..2a8206009422 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -748,7 +748,7 @@ long do_mbind(unsigned long start, unsigned long len,
 		      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 	    || mode > MPOL_MAX)
 		return -EINVAL;
-	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 		return -EPERM;
 
 	if (start & ~PAGE_MASK)
@@ -942,20 +942,20 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 	 */
 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
 	    (current->uid != task->suid) && (current->uid != task->uid) &&
-	    !capable(CAP_SYS_ADMIN)) {
+	    !capable(CAP_SYS_NICE)) {
 		err = -EPERM;
 		goto out;
 	}
 
 	task_nodes = cpuset_mems_allowed(task);
 	/* Is the user allowed to access the target nodes? */
-	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 		err = -EPERM;
 		goto out;
 	}
 
 	err = do_migrate_pages(mm, &old, &new,
-		capable(CAP_SYS_ADMIN) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
+		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 
 out:
 	mmput(mm);
 	return err;
--
cgit v1.2.3

From 90036ee5938d89638e80f4d0d0700d0f2dbd4a6a Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Thu, 16 Mar 2006 23:03:59 -0800
Subject: [PATCH] page migration: Fail with error if swap not setup

Currently the migration of anonymous pages will silently fail if no
swap is setup.  This patch makes page migration functions check for
available swap and fail with -ENODEV if no swap space is available.

Signed-off-by: Christoph Lameter
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2a8206009422..b21869a39f0b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -330,9 +330,19 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
 
-	/* Clear the LRU lists so pages can be isolated */
-	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+		/* Must have swap device for migration */
+		if (nr_swap_pages <= 0)
+			return ERR_PTR(-ENODEV);
+
+		/*
+		 * Clear the LRU lists so pages can be isolated.
+		 * Note that pages may be moved off the LRU after we have
+		 * drained them. Those pages will fail to migrate like other
+		 * pages that may be busy.
+		 */
 		lru_add_drain_all();
+	}
 
 	first = find_vma(mm, start);
 	if (!first)
--
cgit v1.2.3

From 5b40dc780ed996162f3af8712eb03beb24dcdbef Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Thu, 16 Mar 2006 23:04:07 -0800
Subject: [PATCH] fix race in pagevec_strip?

We can call try_to_release_page() with PagePrivate off and a valid
page->mapping.  This may cause all sorts of trouble for the filesystem
*_releasepage() handlers.  XFS bombs out in that case.

Lock the page before checking for page private.

Signed-off-by: Christoph Lameter
Cc: Nick Piggin
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/swap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/swap.c b/mm/swap.c
index e9ec06d845e8..b524ea90bddb 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -393,7 +393,8 @@ void pagevec_strip(struct pagevec *pvec)
 		struct page *page = pvec->pages[i];
 
 		if (PagePrivate(page) && !TestSetPageLocked(page)) {
-			try_to_release_page(page, 0);
+			if (PagePrivate(page))
+				try_to_release_page(page, 0);
 			unlock_page(page);
 		}
 	}
--
cgit v1.2.3
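The fix above follows a common pattern: a cheap unlocked test decides whether to take the lock at all, and the condition is re-tested once the lock is held because it may have changed in the meantime.  A small userspace sketch of the same pattern (the struct, field names and the "release" action are purely illustrative):

#include <pthread.h>
#include <stdio.h>

struct object {
	pthread_mutex_t lock;
	int has_private;	/* analogous to PagePrivate */
};

static void try_release(struct object *obj)
{
	/* Cheap unlocked check first, then try to take the lock. */
	if (obj->has_private && pthread_mutex_trylock(&obj->lock) == 0) {
		/* State may have changed while we were unlocked: recheck. */
		if (obj->has_private) {
			obj->has_private = 0;
			printf("released private data\n");
		}
		pthread_mutex_unlock(&obj->lock);
	}
}

int main(void)
{
	struct object obj;

	pthread_mutex_init(&obj.lock, NULL);
	obj.has_private = 1;

	try_release(&obj);	/* first call releases */
	try_release(&obj);	/* second call finds nothing to do */
	return 0;
}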
From 6f5e6b9e69bf043074a0edabe3d271899c34eb79 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Thu, 16 Mar 2006 23:04:09 -0800
Subject: [PATCH] fix free swap cache latency

Lee Revell reported 28ms latency when a process with lots of swapped
memory exits.

2.6.15 introduced a latency regression when unmapping: in accounting
the zap_work latency breaker, pte_none counted 1, pte_present
PAGE_SIZE, but a swap entry counted nothing at all.  We think of pages
present as the slow case, but Lee's trace shows that
free_swap_and_cache's radix tree lookup can make a lot of work - and we
could have been doing it many thousands of times without a latency
break.

Move the zap_work update up to account swap entries like pages present.
This does account non-linear pte_file entries, and unmap_mapping_range
skipping over swap entries, by the same amount even though they're
quick: but neither of those cases deserves complicating the code (and
they're treated no worse than they were in 2.6.14).

Signed-off-by: Hugh Dickins
Acked-by: Nick Piggin
Acked-by: Ingo Molnar
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 9abc6008544b..85e80a57db29 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -623,11 +623,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				(*zap_work)--;
 			continue;
 		}
+
+		(*zap_work) -= PAGE_SIZE;
+
 		if (pte_present(ptent)) {
 			struct page *page;
 
-			(*zap_work) -= PAGE_SIZE;
-
 			page = vm_normal_page(vma, addr, ptent);
 			if (unlikely(details) && page) {
 				/*
--
cgit v1.2.3