From 409001948d9f221c94a61c3ee96de112755fc04d Mon Sep 17 00:00:00 2001
From: Brian King <brking@linux.vnet.ibm.com>
Date: Wed, 22 Oct 2008 05:53:45 +0000
Subject: powerpc: Update page-in counter for CMM

A new field has been added to the VPA as a method for the client OS to
communicate to firmware the number of page-ins it is performing when
running collaborative memory overcommit.  The hypervisor will use this
information to better determine if a partition is experiencing memory
pressure and needs more memory allocated to it.

Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/fault.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 565b7a237c84..b18bc0f023c8 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -30,6 +30,7 @@
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
 
+#include <asm/firmware.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
@@ -318,9 +319,16 @@ good_area:
 			goto do_sigbus;
 		BUG();
 	}
-	if (ret & VM_FAULT_MAJOR)
+	if (ret & VM_FAULT_MAJOR) {
 		current->maj_flt++;
-	else
+#ifdef CONFIG_PPC_SMLPAR
+		if (firmware_has_feature(FW_FEATURE_CMO)) {
+			preempt_disable();
+			get_lppaca()->page_ins++;
+			preempt_enable();
+		}
+#endif
+	} else
 		current->min_flt++;
 	up_read(&mm->mmap_sem);
 	return 0;
-- 
cgit v1.2.3


From 7d4320f3d5ace5758111f2beac931376737f80f5 Mon Sep 17 00:00:00 2001
From: Jon Tollefson <kniht@linux.vnet.ibm.com>
Date: Thu, 30 Oct 2008 12:03:57 +0000
Subject: powerpc: Hugetlb pgtable cache access cleanup

Andrew Morton suggested that using a macro that makes an array
reference look like a function call makes it harder to understand the
code.

This therefore removes the huge_pgtable_cache(psize) macro and
replaces its uses with pgtable_cache[HUGE_PGTABLE_INDEX(psize)].

Signed-off-by: Jon Tollefson <kniht@linux.vnet.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/hugetlbpage.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index a117024ab8cd..c2231358adbb 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -53,8 +53,7 @@ unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
 
 /* Subtract one from array size because we don't need a cache for 4K since
  * is not a huge page size */
-#define huge_pgtable_cache(psize)	(pgtable_cache[HUGEPTE_CACHE_NUM \
-							+ psize-1])
+#define HUGE_PGTABLE_INDEX(psize)	(HUGEPTE_CACHE_NUM + psize - 1)
 #define HUGEPTE_CACHE_NAME(psize)	(huge_pgtable_cache_name[psize])
 
 static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
@@ -113,7 +112,7 @@ static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 			   unsigned long address, unsigned int psize)
 {
-	pte_t *new = kmem_cache_zalloc(huge_pgtable_cache(psize),
+	pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)],
 				      GFP_KERNEL|__GFP_REPEAT);
 
 	if (! new)
@@ -121,7 +120,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 
 	spin_lock(&mm->page_table_lock);
 	if (!hugepd_none(*hpdp))
-		kmem_cache_free(huge_pgtable_cache(psize), new);
+		kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new);
 	else
 		hpdp->pd = (unsigned long)new | HUGEPD_OK;
 	spin_unlock(&mm->page_table_lock);
@@ -760,13 +759,14 @@ static int __init hugetlbpage_init(void)
 
 	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 		if (mmu_huge_psizes[psize]) {
-			huge_pgtable_cache(psize) = kmem_cache_create(
-						HUGEPTE_CACHE_NAME(psize),
-						HUGEPTE_TABLE_SIZE(psize),
-						HUGEPTE_TABLE_SIZE(psize),
-						0,
-						NULL);
-			if (!huge_pgtable_cache(psize))
+			pgtable_cache[HUGE_PGTABLE_INDEX(psize)] =
+				kmem_cache_create(
+					HUGEPTE_CACHE_NAME(psize),
+					HUGEPTE_TABLE_SIZE(psize),
+					HUGEPTE_TABLE_SIZE(psize),
+					0,
+					NULL);
+			if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)])
 				panic("hugetlbpage_init(): could not create %s"\
 				      "\n", HUGEPTE_CACHE_NAME(psize));
 		}
-- 
cgit v1.2.3


From a6326e98a28d8a57f693369c82559543c6950f09 Mon Sep 17 00:00:00 2001
From: Robert Jennings <rcj@linux.vnet.ibm.com>
Date: Fri, 14 Nov 2008 12:07:34 +0000
Subject: powerpc: Correct page-in counter for CMM with 64k pages

Linux will report the number of page-ins so that the hypervisor can
better determine partition memory pressure.  The hardware page size
and the OS page size can be different.  In the case where the hardware
page size is 4k and the OS is running with 64k pages the code in
commit 409001948d9f221c94a61c3ee96de112755fc04d ("powerpc: Update
page-in counter for CMM") would under-report the number of pages.

This corrects the reporting to the hypervisor by incrementing the
page_ins count by 1 << PAGE_FACTOR each time.

Reported-by: Andrew Theurer <habanero@linux.vnet.ibm.com>
Signed-off-by: Robert Jennings <rcj@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/fault.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index b18bc0f023c8..7df0409107ad 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -324,7 +324,7 @@ good_area:
 #ifdef CONFIG_PPC_SMLPAR
 		if (firmware_has_feature(FW_FEATURE_CMO)) {
 			preempt_disable();
-			get_lppaca()->page_ins++;
+			get_lppaca()->page_ins += (1 << PAGE_FACTOR);
 			preempt_enable();
 		}
 #endif
-- 
cgit v1.2.3


From f4f3a1261ad70988ad45614ebc87e553143a332b Mon Sep 17 00:00:00 2001
From: Kumar Gala <galak@kernel.crashing.org>
Date: Wed, 19 Nov 2008 05:53:04 +0000
Subject: powerpc: hash_page_sync should only be used on SMP & STD_MMU_32

Clean up the ifdefs so we only use hash_page_sync if we have
CONFIG_SMP && CONFIG_PPC_STD_MMU_32.

Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/pgtable_32.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index c31d6d26f0b5..44fbc81c9b2c 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -48,7 +48,7 @@ EXPORT_SYMBOL(ioremap_bot);	/* aka VMALLOC_END */
 
 extern char etext[], _stext[];
 
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_PPC_STD_MMU_32)
 extern void hash_page_sync(void);
 #endif
 
@@ -127,7 +127,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
 
 void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_PPC_STD_MMU_32)
 	hash_page_sync();
 #endif
 	free_page((unsigned long)pte);
@@ -135,7 +135,7 @@ void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 
 void pte_free(struct mm_struct *mm, pgtable_t ptepage)
 {
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_PPC_STD_MMU_32)
 	hash_page_sync();
 #endif
 	pgtable_page_dtor(ptepage);
-- 
cgit v1.2.3


From 0186f47e703fb7aa14b54459d642ef5374b3a685 Mon Sep 17 00:00:00 2001
From: Kumar Gala <galak@kernel.crashing.org>
Date: Wed, 19 Nov 2008 12:50:04 +0000
Subject: powerpc: Use RCU based pte freeing mechanism for all powerpc

Refactor the RCU based pte free code that was used on ppc64 to be used
on all powerpc.

Additionally refactor pte_free() & pte_free_kernel() into common code
between ppc32 & ppc64.

Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/pgalloc-32.h |  11 +++-
 arch/powerpc/include/asm/pgalloc-64.h |  34 ----------
 arch/powerpc/include/asm/pgalloc.h    |  41 ++++++++++++
 arch/powerpc/mm/Makefile              |   2 +-
 arch/powerpc/mm/hash_low_32.S         |  30 ---------
 arch/powerpc/mm/pgtable.c             | 117 ++++++++++++++++++++++++++++++++++
 arch/powerpc/mm/pgtable_32.c          |  21 ------
 arch/powerpc/mm/tlb_64.c              |  86 -------------------------
 8 files changed, 167 insertions(+), 175 deletions(-)
 create mode 100644 arch/powerpc/mm/pgtable.c

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/include/asm/pgalloc-32.h b/arch/powerpc/include/asm/pgalloc-32.h
index 58c07147b3ea..0815eb40acae 100644
--- a/arch/powerpc/include/asm/pgalloc-32.h
+++ b/arch/powerpc/include/asm/pgalloc-32.h
@@ -3,6 +3,8 @@
 
 #include <linux/threads.h>
 
+#define PTE_NONCACHE_NUM	0  /* dummy for now to share code w/ppc64 */
+
 extern void __bad_pte(pmd_t *pmd);
 
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
@@ -33,10 +35,13 @@ extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
 
 extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr);
 extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
-extern void pte_free_kernel(struct mm_struct *mm, pte_t *pte);
-extern void pte_free(struct mm_struct *mm, pgtable_t pte);
 
-#define __pte_free_tlb(tlb, pte)	pte_free((tlb)->mm, (pte))
+static inline void pgtable_free(pgtable_free_t pgf)
+{
+	void *p = (void *)(pgf.val & ~PGF_CACHENUM_MASK);
+
+	free_page((unsigned long)p);
+}
 
 #define check_pgt_cache()	do { } while (0)
 
diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h
index 812a1d8f35cb..afda2bdd860f 100644
--- a/arch/powerpc/include/asm/pgalloc-64.h
+++ b/arch/powerpc/include/asm/pgalloc-64.h
@@ -7,7 +7,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
@@ -108,31 +107,6 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
 	return page;
 }
 
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-	free_page((unsigned long)pte);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
-{
-	pgtable_page_dtor(ptepage);
-	__free_page(ptepage);
-}
-
-#define PGF_CACHENUM_MASK	0x7
-
-typedef struct pgtable_free {
-	unsigned long val;
-} pgtable_free_t;
-
-static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum,
-						unsigned long mask)
-{
-	BUG_ON(cachenum > PGF_CACHENUM_MASK);
-
-	return (pgtable_free_t){.val = ((unsigned long) p & ~mask) | cachenum};
-}
-
 static inline void pgtable_free(pgtable_free_t pgf)
 {
 	void *p = (void *)(pgf.val & ~PGF_CACHENUM_MASK);
@@ -144,14 +118,6 @@ static inline void pgtable_free(pgtable_free_t pgf)
 		kmem_cache_free(pgtable_cache[cachenum], p);
 }
 
-extern void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf);
-
-#define __pte_free_tlb(tlb,ptepage)	\
-do { \
-	pgtable_page_dtor(ptepage); \
-	pgtable_free_tlb(tlb, pgtable_free_cache(page_address(ptepage), \
-		PTE_NONCACHE_NUM, PTE_TABLE_SIZE-1)); \
-} while (0)
 #define __pmd_free_tlb(tlb, pmd) 	\
 	pgtable_free_tlb(tlb, pgtable_free_cache(pmd, \
 		PMD_CACHE_NUM, PMD_TABLE_SIZE-1))
diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h
index b4505ed0f0f2..5d8480265a77 100644
--- a/arch/powerpc/include/asm/pgalloc.h
+++ b/arch/powerpc/include/asm/pgalloc.h
@@ -2,11 +2,52 @@
 #define _ASM_POWERPC_PGALLOC_H
 #ifdef __KERNEL__
 
+#include <linux/mm.h>
+
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+	free_page((unsigned long)pte);
+}
+
+static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
+{
+	pgtable_page_dtor(ptepage);
+	__free_page(ptepage);
+}
+
+typedef struct pgtable_free {
+	unsigned long val;
+} pgtable_free_t;
+
+#define PGF_CACHENUM_MASK	0x7
+
+static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum,
+						unsigned long mask)
+{
+	BUG_ON(cachenum > PGF_CACHENUM_MASK);
+
+	return (pgtable_free_t){.val = ((unsigned long) p & ~mask) | cachenum};
+}
+
 #ifdef CONFIG_PPC64
 #include <asm/pgalloc-64.h>
 #else
 #include <asm/pgalloc-32.h>
 #endif
 
+extern void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf);
+
+#ifdef CONFIG_SMP
+#define __pte_free_tlb(tlb,ptepage)	\
+do { \
+	pgtable_page_dtor(ptepage); \
+	pgtable_free_tlb(tlb, pgtable_free_cache(page_address(ptepage), \
+		PTE_NONCACHE_NUM, PTE_TABLE_SIZE-1)); \
+} while (0)
+#else
+#define __pte_free_tlb(tlb, pte)	pte_free((tlb)->mm, (pte))
+#endif
+
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_PGALLOC_H */
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index e7392b45a5ef..86e657bcfa7e 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -6,7 +6,7 @@ ifeq ($(CONFIG_PPC64),y)
 EXTRA_CFLAGS	+= -mno-minimal-toc
 endif
 
-obj-y				:= fault.o mem.o \
+obj-y				:= fault.o mem.o pgtable.o \
 				   init_$(CONFIG_WORD_SIZE).o \
 				   pgtable_$(CONFIG_WORD_SIZE).o \
 				   mmu_context_$(CONFIG_WORD_SIZE).o
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index 7bffb70b9fe2..c5536b8b37a9 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -35,36 +35,6 @@ mmu_hash_lock:
 	.space	4
 #endif /* CONFIG_SMP */
 
-/*
- * Sync CPUs with hash_page taking & releasing the hash
- * table lock
- */
-#ifdef CONFIG_SMP
-	.text
-_GLOBAL(hash_page_sync)
-	mfmsr   r10
-	rlwinm  r0,r10,0,17,15          /* clear bit 16 (MSR_EE) */
-	mtmsr   r0
-	lis	r8,mmu_hash_lock@h
-	ori	r8,r8,mmu_hash_lock@l
-	lis	r0,0x0fff
-	b	10f
-11:	lwz	r6,0(r8)
-	cmpwi	0,r6,0
-	bne	11b
-10:	lwarx	r6,0,r8
-	cmpwi	0,r6,0
-	bne-	11b
-	stwcx.	r0,0,r8
-	bne-	10b
-	isync
-	eieio
-	li	r0,0
-	stw	r0,0(r8)
-	mtmsr	r10
-	blr
-#endif /* CONFIG_SMP */
-
 /*
  * Load a PTE into the hash table, if possible.
  * The address is in r4, and r3 contains an access flag:
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
new file mode 100644
index 000000000000..6d94116fdea1
--- /dev/null
+++ b/arch/powerpc/mm/pgtable.c
@@ -0,0 +1,117 @@
+/*
+ * This file contains common routines for dealing with freeing of page tables
+ *
+ *  Derived from arch/powerpc/mm/tlb_64.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  Dave Engebretsen <engebret@us.ibm.com>
+ *      Rework for PPC64 port.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+static DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
+static unsigned long pte_freelist_forced_free;
+
+struct pte_freelist_batch
+{
+	struct rcu_head	rcu;
+	unsigned int	index;
+	pgtable_free_t	tables[0];
+};
+
+#define PTE_FREELIST_SIZE \
+	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
+	  / sizeof(pgtable_free_t))
+
+static void pte_free_smp_sync(void *arg)
+{
+	/* Do nothing, just ensure we sync with all CPUs */
+}
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+static void pgtable_free_now(pgtable_free_t pgf)
+{
+	pte_freelist_forced_free++;
+
+	smp_call_function(pte_free_smp_sync, NULL, 1);
+
+	pgtable_free(pgf);
+}
+
+static void pte_free_rcu_callback(struct rcu_head *head)
+{
+	struct pte_freelist_batch *batch =
+		container_of(head, struct pte_freelist_batch, rcu);
+	unsigned int i;
+
+	for (i = 0; i < batch->index; i++)
+		pgtable_free(batch->tables[i]);
+
+	free_page((unsigned long)batch);
+}
+
+static void pte_free_submit(struct pte_freelist_batch *batch)
+{
+	INIT_RCU_HEAD(&batch->rcu);
+	call_rcu(&batch->rcu, pte_free_rcu_callback);
+}
+
+void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
+{
+	/* This is safe since tlb_gather_mmu has disabled preemption */
+        cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
+	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+
+	if (atomic_read(&tlb->mm->mm_users) < 2 ||
+	    cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
+		pgtable_free(pgf);
+		return;
+	}
+
+	if (*batchp == NULL) {
+		*batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
+		if (*batchp == NULL) {
+			pgtable_free_now(pgf);
+			return;
+		}
+		(*batchp)->index = 0;
+	}
+	(*batchp)->tables[(*batchp)->index++] = pgf;
+	if ((*batchp)->index == PTE_FREELIST_SIZE) {
+		pte_free_submit(*batchp);
+		*batchp = NULL;
+	}
+}
+
+void pte_free_finish(void)
+{
+	/* This is safe since tlb_gather_mmu has disabled preemption */
+	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+
+	if (*batchp == NULL)
+		return;
+	pte_free_submit(*batchp);
+	*batchp = NULL;
+}
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 44fbc81c9b2c..c7b755cba26a 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -48,10 +48,6 @@ EXPORT_SYMBOL(ioremap_bot);	/* aka VMALLOC_END */
 
 extern char etext[], _stext[];
 
-#if defined(CONFIG_SMP) && defined(CONFIG_PPC_STD_MMU_32)
-extern void hash_page_sync(void);
-#endif
-
 #ifdef HAVE_BATS
 extern phys_addr_t v_mapped_by_bats(unsigned long va);
 extern unsigned long p_mapped_by_bats(phys_addr_t pa);
@@ -125,23 +121,6 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
 	return ptepage;
 }
 
-void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-#if defined(CONFIG_SMP) && defined(CONFIG_PPC_STD_MMU_32)
-	hash_page_sync();
-#endif
-	free_page((unsigned long)pte);
-}
-
-void pte_free(struct mm_struct *mm, pgtable_t ptepage)
-{
-#if defined(CONFIG_SMP) && defined(CONFIG_PPC_STD_MMU_32)
-	hash_page_sync();
-#endif
-	pgtable_page_dtor(ptepage);
-	__free_page(ptepage);
-}
-
 void __iomem *
 ioremap(phys_addr_t addr, unsigned long size)
 {
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c
index be7dd422c0fa..c931bc7d1079 100644
--- a/arch/powerpc/mm/tlb_64.c
+++ b/arch/powerpc/mm/tlb_64.c
@@ -37,81 +37,6 @@ DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
  * arch/powerpc/include/asm/tlb.h file -- tgall
  */
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-static DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
-static unsigned long pte_freelist_forced_free;
-
-struct pte_freelist_batch
-{
-	struct rcu_head	rcu;
-	unsigned int	index;
-	pgtable_free_t	tables[0];
-};
-
-#define PTE_FREELIST_SIZE \
-	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
-	  / sizeof(pgtable_free_t))
-
-static void pte_free_smp_sync(void *arg)
-{
-	/* Do nothing, just ensure we sync with all CPUs */
-}
-
-/* This is only called when we are critically out of memory
- * (and fail to get a page in pte_free_tlb).
- */
-static void pgtable_free_now(pgtable_free_t pgf)
-{
-	pte_freelist_forced_free++;
-
-	smp_call_function(pte_free_smp_sync, NULL, 1);
-
-	pgtable_free(pgf);
-}
-
-static void pte_free_rcu_callback(struct rcu_head *head)
-{
-	struct pte_freelist_batch *batch =
-		container_of(head, struct pte_freelist_batch, rcu);
-	unsigned int i;
-
-	for (i = 0; i < batch->index; i++)
-		pgtable_free(batch->tables[i]);
-
-	free_page((unsigned long)batch);
-}
-
-static void pte_free_submit(struct pte_freelist_batch *batch)
-{
-	INIT_RCU_HEAD(&batch->rcu);
-	call_rcu(&batch->rcu, pte_free_rcu_callback);
-}
-
-void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
-{
-	/* This is safe since tlb_gather_mmu has disabled preemption */
-        cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
-	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
-
-	if (atomic_read(&tlb->mm->mm_users) < 2 ||
-	    cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
-		pgtable_free(pgf);
-		return;
-	}
-
-	if (*batchp == NULL) {
-		*batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
-		if (*batchp == NULL) {
-			pgtable_free_now(pgf);
-			return;
-		}
-		(*batchp)->index = 0;
-	}
-	(*batchp)->tables[(*batchp)->index++] = pgf;
-	if ((*batchp)->index == PTE_FREELIST_SIZE) {
-		pte_free_submit(*batchp);
-		*batchp = NULL;
-	}
-}
 
 /*
  * A linux PTE was changed and the corresponding hash table entry
@@ -229,17 +154,6 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
 	batch->index = 0;
 }
 
-void pte_free_finish(void)
-{
-	/* This is safe since tlb_gather_mmu has disabled preemption */
-	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
-
-	if (*batchp == NULL)
-		return;
-	pte_free_submit(*batchp);
-	*batchp = NULL;
-}
-
 /**
  * __flush_hash_table_range - Flush all HPTEs for a given address range
  *                            from the hash table (and the TLB). But keeps
-- 
cgit v1.2.3


From e41e811a79a4e328005be2744c3076ebde455088 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sun, 14 Dec 2008 19:44:39 +0000
Subject: powerpc/mm: Rename tlb_32.c and tlb_64.c to tlb_hash32.c and
 tlb_hash64.c

This renames the files to clarify the fact that they are used by
the hash based family of CPUs (the 603 is an exception in that
family, but it is still handled by that code).

This paves the way for the new tlb_nohash.c coming via a subsequent
commit.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/Makefile     |   2 +-
 arch/powerpc/mm/tlb_32.c     | 190 --------------------------------------
 arch/powerpc/mm/tlb_64.c     | 211 -------------------------------------------
 arch/powerpc/mm/tlb_hash32.c | 190 ++++++++++++++++++++++++++++++++++++++
 arch/powerpc/mm/tlb_hash64.c | 211 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 402 insertions(+), 402 deletions(-)
 delete mode 100644 arch/powerpc/mm/tlb_32.c
 delete mode 100644 arch/powerpc/mm/tlb_64.c
 create mode 100644 arch/powerpc/mm/tlb_hash32.c
 create mode 100644 arch/powerpc/mm/tlb_hash64.c

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 86e657bcfa7e..148de35c9eef 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -16,7 +16,7 @@ obj-$(CONFIG_PPC64)		+= hash_utils_64.o \
 				   gup.o mmap.o $(hash-y)
 obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o
 obj-$(CONFIG_PPC_STD_MMU)	+= hash_low_$(CONFIG_WORD_SIZE).o \
-				   tlb_$(CONFIG_WORD_SIZE).o
+				   tlb_hash$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_40x)		+= 40x_mmu.o
 obj-$(CONFIG_44x)		+= 44x_mmu.o
 obj-$(CONFIG_FSL_BOOKE)		+= fsl_booke_mmu.o
diff --git a/arch/powerpc/mm/tlb_32.c b/arch/powerpc/mm/tlb_32.c
deleted file mode 100644
index f9a47fee3927..000000000000
--- a/arch/powerpc/mm/tlb_32.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * This file contains the routines for TLB flushing.
- * On machines where the MMU uses a hash table to store virtual to
- * physical translations, these routines flush entries from the
- * hash table also.
- *  -- paulus
- *
- *  Derived from arch/ppc/mm/init.c:
- *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
- *    Copyright (C) 1996 Paul Mackerras
- *
- *  Derived from "arch/i386/mm/init.c"
- *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-
-#include <asm/tlbflush.h>
-#include <asm/tlb.h>
-
-#include "mmu_decl.h"
-
-/*
- * Called when unmapping pages to flush entries from the TLB/hash table.
- */
-void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr)
-{
-	unsigned long ptephys;
-
-	if (Hash != 0) {
-		ptephys = __pa(ptep) & PAGE_MASK;
-		flush_hash_pages(mm->context.id, addr, ptephys, 1);
-	}
-}
-EXPORT_SYMBOL(flush_hash_entry);
-
-/*
- * Called by ptep_set_access_flags, must flush on CPUs for which the
- * DSI handler can't just "fixup" the TLB on a write fault
- */
-void flush_tlb_page_nohash(struct vm_area_struct *vma, unsigned long addr)
-{
-	if (Hash != 0)
-		return;
-	_tlbie(addr);
-}
-
-/*
- * Called at the end of a mmu_gather operation to make sure the
- * TLB flush is completely done.
- */
-void tlb_flush(struct mmu_gather *tlb)
-{
-	if (Hash == 0) {
-		/*
-		 * 603 needs to flush the whole TLB here since
-		 * it doesn't use a hash table.
-		 */
-		_tlbia();
-	}
-}
-
-/*
- * TLB flushing:
- *
- *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
- *  - flush_tlb_page(vma, vmaddr) flushes one page
- *  - flush_tlb_range(vma, start, end) flushes a range of pages
- *  - flush_tlb_kernel_range(start, end) flushes kernel pages
- *
- * since the hardware hash table functions as an extension of the
- * tlb as far as the linux tables are concerned, flush it too.
- *    -- Cort
- */
-
-/*
- * 750 SMP is a Bad Idea because the 750 doesn't broadcast all
- * the cache operations on the bus.  Hence we need to use an IPI
- * to get the other CPU(s) to invalidate their TLBs.
- */
-#ifdef CONFIG_SMP_750
-#define FINISH_FLUSH	smp_send_tlb_invalidate(0)
-#else
-#define FINISH_FLUSH	do { } while (0)
-#endif
-
-static void flush_range(struct mm_struct *mm, unsigned long start,
-			unsigned long end)
-{
-	pmd_t *pmd;
-	unsigned long pmd_end;
-	int count;
-	unsigned int ctx = mm->context.id;
-
-	if (Hash == 0) {
-		_tlbia();
-		return;
-	}
-	start &= PAGE_MASK;
-	if (start >= end)
-		return;
-	end = (end - 1) | ~PAGE_MASK;
-	pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start);
-	for (;;) {
-		pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1;
-		if (pmd_end > end)
-			pmd_end = end;
-		if (!pmd_none(*pmd)) {
-			count = ((pmd_end - start) >> PAGE_SHIFT) + 1;
-			flush_hash_pages(ctx, start, pmd_val(*pmd), count);
-		}
-		if (pmd_end == end)
-			break;
-		start = pmd_end + 1;
-		++pmd;
-	}
-}
-
-/*
- * Flush kernel TLB entries in the given range
- */
-void flush_tlb_kernel_range(unsigned long start, unsigned long end)
-{
-	flush_range(&init_mm, start, end);
-	FINISH_FLUSH;
-}
-
-/*
- * Flush all the (user) entries for the address space described by mm.
- */
-void flush_tlb_mm(struct mm_struct *mm)
-{
-	struct vm_area_struct *mp;
-
-	if (Hash == 0) {
-		_tlbia();
-		return;
-	}
-
-	/*
-	 * It is safe to go down the mm's list of vmas when called
-	 * from dup_mmap, holding mmap_sem.  It would also be safe from
-	 * unmap_region or exit_mmap, but not from vmtruncate on SMP -
-	 * but it seems dup_mmap is the only SMP case which gets here.
-	 */
-	for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
-		flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
-	FINISH_FLUSH;
-}
-
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
-{
-	struct mm_struct *mm;
-	pmd_t *pmd;
-
-	if (Hash == 0) {
-		_tlbie(vmaddr);
-		return;
-	}
-	mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm;
-	pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr);
-	if (!pmd_none(*pmd))
-		flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
-	FINISH_FLUSH;
-}
-
-/*
- * For each address in the range, find the pte for the address
- * and check _PAGE_HASHPTE bit; if it is set, find and destroy
- * the corresponding HPTE.
- */
-void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
-		     unsigned long end)
-{
-	flush_range(vma->vm_mm, start, end);
-	FINISH_FLUSH;
-}
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c
deleted file mode 100644
index c931bc7d1079..000000000000
--- a/arch/powerpc/mm/tlb_64.c
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * This file contains the routines for flushing entries from the
- * TLB and MMU hash table.
- *
- *  Derived from arch/ppc64/mm/init.c:
- *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
- *    Copyright (C) 1996 Paul Mackerras
- *
- *  Derived from "arch/i386/mm/init.c"
- *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
- *
- *  Dave Engebretsen <engebret@us.ibm.com>
- *      Rework for PPC64 port.
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/percpu.h>
-#include <linux/hardirq.h>
-#include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-#include <asm/tlb.h>
-#include <asm/bug.h>
-
-DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
-
-/* This is declared as we are using the more or less generic
- * arch/powerpc/include/asm/tlb.h file -- tgall
- */
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
-/*
- * A linux PTE was changed and the corresponding hash table entry
- * neesd to be flushed. This function will either perform the flush
- * immediately or will batch it up if the current CPU has an active
- * batch on it.
- *
- * Must be called from within some kind of spinlock/non-preempt region...
- */
-void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
-		     pte_t *ptep, unsigned long pte, int huge)
-{
-	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
-	unsigned long vsid, vaddr;
-	unsigned int psize;
-	int ssize;
-	real_pte_t rpte;
-	int i;
-
-	i = batch->index;
-
-	/* We mask the address for the base page size. Huge pages will
-	 * have applied their own masking already
-	 */
-	addr &= PAGE_MASK;
-
-	/* Get page size (maybe move back to caller).
-	 *
-	 * NOTE: when using special 64K mappings in 4K environment like
-	 * for SPEs, we obtain the page size from the slice, which thus
-	 * must still exist (and thus the VMA not reused) at the time
-	 * of this call
-	 */
-	if (huge) {
-#ifdef CONFIG_HUGETLB_PAGE
-		psize = get_slice_psize(mm, addr);;
-#else
-		BUG();
-		psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
-#endif
-	} else
-		psize = pte_pagesize_index(mm, addr, pte);
-
-	/* Build full vaddr */
-	if (!is_kernel_addr(addr)) {
-		ssize = user_segment_size(addr);
-		vsid = get_vsid(mm->context.id, addr, ssize);
-		WARN_ON(vsid == 0);
-	} else {
-		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
-		ssize = mmu_kernel_ssize;
-	}
-	vaddr = hpt_va(addr, vsid, ssize);
-	rpte = __real_pte(__pte(pte), ptep);
-
-	/*
-	 * Check if we have an active batch on this CPU. If not, just
-	 * flush now and return. For now, we don global invalidates
-	 * in that case, might be worth testing the mm cpu mask though
-	 * and decide to use local invalidates instead...
-	 */
-	if (!batch->active) {
-		flush_hash_page(vaddr, rpte, psize, ssize, 0);
-		return;
-	}
-
-	/*
-	 * This can happen when we are in the middle of a TLB batch and
-	 * we encounter memory pressure (eg copy_page_range when it tries
-	 * to allocate a new pte). If we have to reclaim memory and end
-	 * up scanning and resetting referenced bits then our batch context
-	 * will change mid stream.
-	 *
-	 * We also need to ensure only one page size is present in a given
-	 * batch
-	 */
-	if (i != 0 && (mm != batch->mm || batch->psize != psize ||
-		       batch->ssize != ssize)) {
-		__flush_tlb_pending(batch);
-		i = 0;
-	}
-	if (i == 0) {
-		batch->mm = mm;
-		batch->psize = psize;
-		batch->ssize = ssize;
-	}
-	batch->pte[i] = rpte;
-	batch->vaddr[i] = vaddr;
-	batch->index = ++i;
-	if (i >= PPC64_TLB_BATCH_NR)
-		__flush_tlb_pending(batch);
-}
-
-/*
- * This function is called when terminating an mmu batch or when a batch
- * is full. It will perform the flush of all the entries currently stored
- * in a batch.
- *
- * Must be called from within some kind of spinlock/non-preempt region...
- */
-void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
-{
-	cpumask_t tmp;
-	int i, local = 0;
-
-	i = batch->index;
-	tmp = cpumask_of_cpu(smp_processor_id());
-	if (cpus_equal(batch->mm->cpu_vm_mask, tmp))
-		local = 1;
-	if (i == 1)
-		flush_hash_page(batch->vaddr[0], batch->pte[0],
-				batch->psize, batch->ssize, local);
-	else
-		flush_hash_range(i, local);
-	batch->index = 0;
-}
-
-/**
- * __flush_hash_table_range - Flush all HPTEs for a given address range
- *                            from the hash table (and the TLB). But keeps
- *                            the linux PTEs intact.
- *
- * @mm		: mm_struct of the target address space (generally init_mm)
- * @start	: starting address
- * @end         : ending address (not included in the flush)
- *
- * This function is mostly to be used by some IO hotplug code in order
- * to remove all hash entries from a given address range used to map IO
- * space on a removed PCI-PCI bidge without tearing down the full mapping
- * since 64K pages may overlap with other bridges when using 64K pages
- * with 4K HW pages on IO space.
- *
- * Because of that usage pattern, it's only available with CONFIG_HOTPLUG
- * and is implemented for small size rather than speed.
- */
-#ifdef CONFIG_HOTPLUG
-
-void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
-			      unsigned long end)
-{
-	unsigned long flags;
-
-	start = _ALIGN_DOWN(start, PAGE_SIZE);
-	end = _ALIGN_UP(end, PAGE_SIZE);
-
-	BUG_ON(!mm->pgd);
-
-	/* Note: Normally, we should only ever use a batch within a
-	 * PTE locked section. This violates the rule, but will work
-	 * since we don't actually modify the PTEs, we just flush the
-	 * hash while leaving the PTEs intact (including their reference
-	 * to being hashed). This is not the most performance oriented
-	 * way to do things but is fine for our needs here.
-	 */
-	local_irq_save(flags);
-	arch_enter_lazy_mmu_mode();
-	for (; start < end; start += PAGE_SIZE) {
-		pte_t *ptep = find_linux_pte(mm->pgd, start);
-		unsigned long pte;
-
-		if (ptep == NULL)
-			continue;
-		pte = pte_val(*ptep);
-		if (!(pte & _PAGE_HASHPTE))
-			continue;
-		hpte_need_flush(mm, start, ptep, pte, 0);
-	}
-	arch_leave_lazy_mmu_mode();
-	local_irq_restore(flags);
-}
-
-#endif /* CONFIG_HOTPLUG */
diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c
new file mode 100644
index 000000000000..f9a47fee3927
--- /dev/null
+++ b/arch/powerpc/mm/tlb_hash32.c
@@ -0,0 +1,190 @@
+/*
+ * This file contains the routines for TLB flushing.
+ * On machines where the MMU uses a hash table to store virtual to
+ * physical translations, these routines flush entries from the
+ * hash table also.
+ *  -- paulus
+ *
+ *  Derived from arch/ppc/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+#include "mmu_decl.h"
+
+/*
+ * Called when unmapping pages to flush entries from the TLB/hash table.
+ */
+void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr)
+{
+	unsigned long ptephys;
+
+	if (Hash != 0) {
+		ptephys = __pa(ptep) & PAGE_MASK;
+		flush_hash_pages(mm->context.id, addr, ptephys, 1);
+	}
+}
+EXPORT_SYMBOL(flush_hash_entry);
+
+/*
+ * Called by ptep_set_access_flags, must flush on CPUs for which the
+ * DSI handler can't just "fixup" the TLB on a write fault
+ */
+void flush_tlb_page_nohash(struct vm_area_struct *vma, unsigned long addr)
+{
+	if (Hash != 0)
+		return;
+	_tlbie(addr);
+}
+
+/*
+ * Called at the end of a mmu_gather operation to make sure the
+ * TLB flush is completely done.
+ */
+void tlb_flush(struct mmu_gather *tlb)
+{
+	if (Hash == 0) {
+		/*
+		 * 603 needs to flush the whole TLB here since
+		 * it doesn't use a hash table.
+		 */
+		_tlbia();
+	}
+}
+
+/*
+ * TLB flushing:
+ *
+ *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ *  - flush_tlb_page(vma, vmaddr) flushes one page
+ *  - flush_tlb_range(vma, start, end) flushes a range of pages
+ *  - flush_tlb_kernel_range(start, end) flushes kernel pages
+ *
+ * since the hardware hash table functions as an extension of the
+ * tlb as far as the linux tables are concerned, flush it too.
+ *    -- Cort
+ */
+
+/*
+ * 750 SMP is a Bad Idea because the 750 doesn't broadcast all
+ * the cache operations on the bus.  Hence we need to use an IPI
+ * to get the other CPU(s) to invalidate their TLBs.
+ */
+#ifdef CONFIG_SMP_750
+#define FINISH_FLUSH	smp_send_tlb_invalidate(0)
+#else
+#define FINISH_FLUSH	do { } while (0)
+#endif
+
+static void flush_range(struct mm_struct *mm, unsigned long start,
+			unsigned long end)
+{
+	pmd_t *pmd;
+	unsigned long pmd_end;
+	int count;
+	unsigned int ctx = mm->context.id;
+
+	if (Hash == 0) {
+		_tlbia();
+		return;
+	}
+	start &= PAGE_MASK;
+	if (start >= end)
+		return;
+	end = (end - 1) | ~PAGE_MASK;
+	pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start);
+	for (;;) {
+		pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1;
+		if (pmd_end > end)
+			pmd_end = end;
+		if (!pmd_none(*pmd)) {
+			count = ((pmd_end - start) >> PAGE_SHIFT) + 1;
+			flush_hash_pages(ctx, start, pmd_val(*pmd), count);
+		}
+		if (pmd_end == end)
+			break;
+		start = pmd_end + 1;
+		++pmd;
+	}
+}
+
+/*
+ * Flush kernel TLB entries in the given range
+ */
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+	flush_range(&init_mm, start, end);
+	FINISH_FLUSH;
+}
+
+/*
+ * Flush all the (user) entries for the address space described by mm.
+ */
+void flush_tlb_mm(struct mm_struct *mm)
+{
+	struct vm_area_struct *mp;
+
+	if (Hash == 0) {
+		_tlbia();
+		return;
+	}
+
+	/*
+	 * It is safe to go down the mm's list of vmas when called
+	 * from dup_mmap, holding mmap_sem.  It would also be safe from
+	 * unmap_region or exit_mmap, but not from vmtruncate on SMP -
+	 * but it seems dup_mmap is the only SMP case which gets here.
+	 */
+	for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
+		flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
+	FINISH_FLUSH;
+}
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	struct mm_struct *mm;
+	pmd_t *pmd;
+
+	if (Hash == 0) {
+		_tlbie(vmaddr);
+		return;
+	}
+	mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm;
+	pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr);
+	if (!pmd_none(*pmd))
+		flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
+	FINISH_FLUSH;
+}
+
+/*
+ * For each address in the range, find the pte for the address
+ * and check _PAGE_HASHPTE bit; if it is set, find and destroy
+ * the corresponding HPTE.
+ */
+void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+		     unsigned long end)
+{
+	flush_range(vma->vm_mm, start, end);
+	FINISH_FLUSH;
+}
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
new file mode 100644
index 000000000000..c931bc7d1079
--- /dev/null
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -0,0 +1,211 @@
+/*
+ * This file contains the routines for flushing entries from the
+ * TLB and MMU hash table.
+ *
+ *  Derived from arch/ppc64/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  Dave Engebretsen <engebret@us.ibm.com>
+ *      Rework for PPC64 port.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/bug.h>
+
+DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
+
+/* This is declared as we are using the more or less generic
+ * arch/powerpc/include/asm/tlb.h file -- tgall
+ */
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+
+/*
+ * A linux PTE was changed and the corresponding hash table entry
+ * needs to be flushed. This function will either perform the flush
+ * immediately or will batch it up if the current CPU has an active
+ * batch on it.
+ *
+ * Must be called from within some kind of spinlock/non-preempt region...
+ */
+void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, unsigned long pte, int huge)
+{
+	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	unsigned long vsid, vaddr;
+	unsigned int psize;
+	int ssize;
+	real_pte_t rpte;
+	int i;
+
+	i = batch->index;
+
+	/* We mask the address for the base page size. Huge pages will
+	 * have applied their own masking already
+	 */
+	addr &= PAGE_MASK;
+
+	/* Get page size (maybe move back to caller).
+	 *
+	 * NOTE: when using special 64K mappings in 4K environment like
+	 * for SPEs, we obtain the page size from the slice, which thus
+	 * must still exist (and thus the VMA not reused) at the time
+	 * of this call
+	 */
+	if (huge) {
+#ifdef CONFIG_HUGETLB_PAGE
+		psize = get_slice_psize(mm, addr);;
+#else
+		BUG();
+		psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
+#endif
+	} else
+		psize = pte_pagesize_index(mm, addr, pte);
+
+	/* Build full vaddr */
+	if (!is_kernel_addr(addr)) {
+		ssize = user_segment_size(addr);
+		vsid = get_vsid(mm->context.id, addr, ssize);
+		WARN_ON(vsid == 0);
+	} else {
+		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+		ssize = mmu_kernel_ssize;
+	}
+	vaddr = hpt_va(addr, vsid, ssize);
+	rpte = __real_pte(__pte(pte), ptep);
+
+	/*
+	 * Check if we have an active batch on this CPU. If not, just
+	 * flush now and return. For now, we do global invalidates
+	 * in that case, might be worth testing the mm cpu mask though
+	 * and decide to use local invalidates instead...
+	 */
+	if (!batch->active) {
+		flush_hash_page(vaddr, rpte, psize, ssize, 0);
+		return;
+	}
+
+	/*
+	 * This can happen when we are in the middle of a TLB batch and
+	 * we encounter memory pressure (eg copy_page_range when it tries
+	 * to allocate a new pte). If we have to reclaim memory and end
+	 * up scanning and resetting referenced bits then our batch context
+	 * will change mid stream.
+	 *
+	 * We also need to ensure only one page size is present in a given
+	 * batch
+	 */
+	if (i != 0 && (mm != batch->mm || batch->psize != psize ||
+		       batch->ssize != ssize)) {
+		__flush_tlb_pending(batch);
+		i = 0;
+	}
+	if (i == 0) {
+		batch->mm = mm;
+		batch->psize = psize;
+		batch->ssize = ssize;
+	}
+	batch->pte[i] = rpte;
+	batch->vaddr[i] = vaddr;
+	batch->index = ++i;
+	if (i >= PPC64_TLB_BATCH_NR)
+		__flush_tlb_pending(batch);
+}
+
+/*
+ * This function is called when terminating an mmu batch or when a batch
+ * is full. It will perform the flush of all the entries currently stored
+ * in a batch.
+ *
+ * Must be called from within some kind of spinlock/non-preempt region...
+ */
+void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
+{
+	cpumask_t tmp;
+	int i, local = 0;
+
+	i = batch->index;
+	tmp = cpumask_of_cpu(smp_processor_id());
+	if (cpus_equal(batch->mm->cpu_vm_mask, tmp))
+		local = 1;
+	if (i == 1)
+		flush_hash_page(batch->vaddr[0], batch->pte[0],
+				batch->psize, batch->ssize, local);
+	else
+		flush_hash_range(i, local);
+	batch->index = 0;
+}
+
+/**
+ * __flush_hash_table_range - Flush all HPTEs for a given address range
+ *                            from the hash table (and the TLB). But keeps
+ *                            the linux PTEs intact.
+ *
+ * @mm		: mm_struct of the target address space (generally init_mm)
+ * @start	: starting address
+ * @end         : ending address (not included in the flush)
+ *
+ * This function is mostly to be used by some IO hotplug code in order
+ * to remove all hash entries from a given address range used to map IO
+ * space on a removed PCI-PCI bridge without tearing down the full mapping
+ * since 64K pages may overlap with other bridges when using 64K pages
+ * with 4K HW pages on IO space.
+ *
+ * Because of that usage pattern, it's only available with CONFIG_HOTPLUG
+ * and is implemented for small size rather than speed.
+ */
+#ifdef CONFIG_HOTPLUG
+
+void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
+			      unsigned long end)
+{
+	unsigned long flags;
+
+	start = _ALIGN_DOWN(start, PAGE_SIZE);
+	end = _ALIGN_UP(end, PAGE_SIZE);
+
+	BUG_ON(!mm->pgd);
+
+	/* Note: Normally, we should only ever use a batch within a
+	 * PTE locked section. This violates the rule, but will work
+	 * since we don't actually modify the PTEs, we just flush the
+	 * hash while leaving the PTEs intact (including their reference
+	 * to being hashed). This is not the most performance oriented
+	 * way to do things but is fine for our needs here.
+	 */
+	local_irq_save(flags);
+	arch_enter_lazy_mmu_mode();
+	for (; start < end; start += PAGE_SIZE) {
+		pte_t *ptep = find_linux_pte(mm->pgd, start);
+		unsigned long pte;
+
+		if (ptep == NULL)
+			continue;
+		pte = pte_val(*ptep);
+		if (!(pte & _PAGE_HASHPTE))
+			continue;
+		hpte_need_flush(mm, start, ptep, pte, 0);
+	}
+	arch_leave_lazy_mmu_mode();
+	local_irq_restore(flags);
+}
+
+#endif /* CONFIG_HOTPLUG */
-- 
cgit v1.2.3


From f63837f0581fe580168ae1a7d178ded935411747 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sun, 14 Dec 2008 19:44:51 +0000
Subject: powerpc/mm: Remove flush_HPTE()

The function flush_HPTE() is used in only one place, the implementation
of DEBUG_PAGEALLOC on ppc32.

It's actually a dup of flush_tlb_page() though it's -slightly- more
efficient on hash based processors.  We remove it and replace it by
a direct call to the hash flush code on those processors and to
flush_tlb_page() for everybody else.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/mmu_decl.h   | 17 -----------------
 arch/powerpc/mm/pgtable_32.c |  6 +++++-
 2 files changed, 5 insertions(+), 18 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index fab3cfad4099..b4344fd30f2a 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -58,17 +58,14 @@ extern phys_addr_t lowmem_end_addr;
  * architectures.  -- Dan
  */
 #if defined(CONFIG_8xx)
-#define flush_HPTE(X, va, pg)	_tlbie(va, 0 /* 8xx doesn't care about PID */)
 #define MMU_init_hw()		do { } while(0)
 #define mmu_mapin_ram()		(0UL)
 
 #elif defined(CONFIG_4xx)
-#define flush_HPTE(pid, va, pg)	_tlbie(va, pid)
 extern void MMU_init_hw(void);
 extern unsigned long mmu_mapin_ram(void);
 
 #elif defined(CONFIG_FSL_BOOKE)
-#define flush_HPTE(pid, va, pg)	_tlbie(va, pid)
 extern void MMU_init_hw(void);
 extern unsigned long mmu_mapin_ram(void);
 extern void adjust_total_lowmem(void);
@@ -77,18 +74,4 @@ extern void adjust_total_lowmem(void);
 /* anything 32-bit except 4xx or 8xx */
 extern void MMU_init_hw(void);
 extern unsigned long mmu_mapin_ram(void);
-
-/* Be careful....this needs to be updated if we ever encounter 603 SMPs,
- * which includes all new 82xx processors.  We need tlbie/tlbsync here
- * in that case (I think). -- Dan.
- */
-static inline void flush_HPTE(unsigned context, unsigned long va,
-			      unsigned long pdval)
-{
-	if ((Hash != 0) &&
-	    cpu_has_feature(CPU_FTR_HPTE_TABLE))
-		flush_hash_pages(0, va, pdval, 1);
-	else
-		_tlbie(va);
-}
 #endif
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index c7b755cba26a..341472440137 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -342,7 +342,11 @@ static int __change_page_attr(struct page *page, pgprot_t prot)
 		return -EINVAL;
 	set_pte_at(&init_mm, address, kpte, mk_pte(page, prot));
 	wmb();
-	flush_HPTE(0, address, pmd_val(*kpmd));
+#ifdef CONFIG_PPC_STD_MMU
+	flush_hash_pages(0, address, pmd_val(*kpmd), 1);
+#else
+	flush_tlb_page(NULL, address);
+#endif
 	pte_unmap(kpte);
 
 	return 0;
-- 
cgit v1.2.3


From 5e696617c425eb97bd943d781f3941fb1e8f0e5b Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 18 Dec 2008 19:13:24 +0000
Subject: powerpc/mm: Split mmu_context handling

This splits the mmu_context handling between 32-bit hash based
processors, 64-bit hash based processors and everybody else.  This is
preliminary work for adding SMP support for BookE processors.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/mmu_context.h       | 260 ++++-----------------------
 arch/powerpc/kernel/asm-offsets.c            |   1 +
 arch/powerpc/kernel/head_32.S                |  12 +-
 arch/powerpc/kernel/ppc_ksyms.c              |   3 +-
 arch/powerpc/kernel/swsusp.c                 |   2 +-
 arch/powerpc/mm/Makefile                     |   7 +-
 arch/powerpc/mm/mmu_context_32.c             |  84 ---------
 arch/powerpc/mm/mmu_context_64.c             |  70 --------
 arch/powerpc/mm/mmu_context_hash32.c         | 103 +++++++++++
 arch/powerpc/mm/mmu_context_hash64.c         |  78 ++++++++
 arch/powerpc/mm/mmu_context_nohash.c         | 162 +++++++++++++++++
 arch/powerpc/platforms/Kconfig.cputype       |  10 +-
 arch/powerpc/platforms/powermac/cpufreq_32.c |   2 +-
 drivers/macintosh/via-pmu.c                  |   4 +-
 14 files changed, 407 insertions(+), 391 deletions(-)
 delete mode 100644 arch/powerpc/mm/mmu_context_32.c
 delete mode 100644 arch/powerpc/mm/mmu_context_64.c
 create mode 100644 arch/powerpc/mm/mmu_context_hash32.c
 create mode 100644 arch/powerpc/mm/mmu_context_hash64.c
 create mode 100644 arch/powerpc/mm/mmu_context_nohash.c

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index b570209b71a8..ab4f19263c42 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -2,240 +2,26 @@
 #define __ASM_POWERPC_MMU_CONTEXT_H
 #ifdef __KERNEL__
 
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
 #include <asm/mmu.h>	
 #include <asm/cputable.h>
 #include <asm-generic/mm_hooks.h>
-
-#ifndef CONFIG_PPC64
-#include <asm/atomic.h>
-#include <linux/bitops.h>
-
-/*
- * On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs
- * (virtual segment identifiers) for each context.  Although the
- * hardware supports 24-bit VSIDs, and thus >1 million contexts,
- * we only use 32,768 of them.  That is ample, since there can be
- * at most around 30,000 tasks in the system anyway, and it means
- * that we can use a bitmap to indicate which contexts are in use.
- * Using a bitmap means that we entirely avoid all of the problems
- * that we used to have when the context number overflowed,
- * particularly on SMP systems.
- *  -- paulus.
- */
-
-/*
- * This function defines the mapping from contexts to VSIDs (virtual
- * segment IDs).  We use a skew on both the context and the high 4 bits
- * of the 32-bit virtual address (the "effective segment ID") in order
- * to spread out the entries in the MMU hash table.  Note, if this
- * function is changed then arch/ppc/mm/hashtable.S will have to be
- * changed to correspond.
- */
-#define CTX_TO_VSID(ctx, va)	(((ctx) * (897 * 16) + ((va) >> 28) * 0x111) \
-				 & 0xffffff)
-
-/*
-   The MPC8xx has only 16 contexts.  We rotate through them on each
-   task switch.  A better way would be to keep track of tasks that
-   own contexts, and implement an LRU usage.  That way very active
-   tasks don't always have to pay the TLB reload overhead.  The
-   kernel pages are mapped shared, so the kernel can run on behalf
-   of any task that makes a kernel entry.  Shared does not mean they
-   are not protected, just that the ASID comparison is not performed.
-        -- Dan
-
-   The IBM4xx has 256 contexts, so we can just rotate through these
-   as a way of "switching" contexts.  If the TID of the TLB is zero,
-   the PID/TID comparison is disabled, so we can use a TID of zero
-   to represent all kernel pages as shared among all contexts.
-   	-- Dan
- */
-
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-}
-
-#ifdef CONFIG_8xx
-#define NO_CONTEXT      	16
-#define LAST_CONTEXT    	15
-#define FIRST_CONTEXT    	0
-
-#elif defined(CONFIG_4xx)
-#define NO_CONTEXT      	256
-#define LAST_CONTEXT    	255
-#define FIRST_CONTEXT    	1
-
-#elif defined(CONFIG_E200) || defined(CONFIG_E500)
-#define NO_CONTEXT      	256
-#define LAST_CONTEXT    	255
-#define FIRST_CONTEXT    	1
-
-#else
-
-/* PPC 6xx, 7xx CPUs */
-#define NO_CONTEXT      	((unsigned long) -1)
-#define LAST_CONTEXT    	32767
-#define FIRST_CONTEXT    	1
-#endif
-
-/*
- * Set the current MMU context.
- * On 32-bit PowerPCs (other than the 8xx embedded chips), this is done by
- * loading up the segment registers for the user part of the address space.
- *
- * Since the PGD is immediately available, it is much faster to simply
- * pass this along as a second parameter, which is required for 8xx and
- * can be used for debugging on all processors (if you happen to have
- * an Abatron).
- */
-extern void set_context(unsigned long contextid, pgd_t *pgd);
-
-/*
- * Bitmap of contexts in use.
- * The size of this bitmap is LAST_CONTEXT + 1 bits.
- */
-extern unsigned long context_map[];
-
-/*
- * This caches the next context number that we expect to be free.
- * Its use is an optimization only, we can't rely on this context
- * number to be free, but it usually will be.
- */
-extern unsigned long next_mmu_context;
-
-/*
- * If we don't have sufficient contexts to give one to every task
- * that could be in the system, we need to be able to steal contexts.
- * These variables support that.
- */
-#if LAST_CONTEXT < 30000
-#define FEW_CONTEXTS	1
-extern atomic_t nr_free_contexts;
-extern struct mm_struct *context_mm[LAST_CONTEXT+1];
-extern void steal_context(void);
-#endif
-
-/*
- * Get a new mmu context for the address space described by `mm'.
- */
-static inline void get_mmu_context(struct mm_struct *mm)
-{
-	unsigned long ctx;
-
-	if (mm->context.id != NO_CONTEXT)
-		return;
-#ifdef FEW_CONTEXTS
-	while (atomic_dec_if_positive(&nr_free_contexts) < 0)
-		steal_context();
-#endif
-	ctx = next_mmu_context;
-	while (test_and_set_bit(ctx, context_map)) {
-		ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx);
-		if (ctx > LAST_CONTEXT)
-			ctx = 0;
-	}
-	next_mmu_context = (ctx + 1) & LAST_CONTEXT;
-	mm->context.id = ctx;
-#ifdef FEW_CONTEXTS
-	context_mm[ctx] = mm;
-#endif
-}
+#include <asm/cputhreads.h>
 
 /*
- * Set up the context for a new address space.
+ * Most of the context management is out of line
  */
-static inline int init_new_context(struct task_struct *t, struct mm_struct *mm)
-{
-	mm->context.id = NO_CONTEXT;
-	return 0;
-}
-
-/*
- * We're finished using the context for an address space.
- */
-static inline void destroy_context(struct mm_struct *mm)
-{
-	preempt_disable();
-	if (mm->context.id != NO_CONTEXT) {
-		clear_bit(mm->context.id, context_map);
-		mm->context.id = NO_CONTEXT;
-#ifdef FEW_CONTEXTS
-		atomic_inc(&nr_free_contexts);
-#endif
-	}
-	preempt_enable();
-}
-
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
-			     struct task_struct *tsk)
-{
-#ifdef CONFIG_ALTIVEC
-	if (cpu_has_feature(CPU_FTR_ALTIVEC))
-	asm volatile ("dssall;\n"
-#ifndef CONFIG_POWER4
-	 "sync;\n" /* G4 needs a sync here, G5 apparently not */
-#endif
-	 : : );
-#endif /* CONFIG_ALTIVEC */
-
-	tsk->thread.pgdir = next->pgd;
-
-	if (!cpu_isset(smp_processor_id(), next->cpu_vm_mask))
-		cpu_set(smp_processor_id(), next->cpu_vm_mask);
-
-	/* No need to flush userspace segments if the mm doesnt change */
-	if (prev == next)
-		return;
-
-	/* Setup new userspace context */
-	get_mmu_context(next);
-	set_context(next->context.id, next->pgd);
-}
-
-#define deactivate_mm(tsk,mm)	do { } while (0)
-
-/*
- * After we have set current->mm to a new value, this activates
- * the context for the new mm so we see the new mappings.
- */
-#define activate_mm(active_mm, mm)   switch_mm(active_mm, mm, current)
-
 extern void mmu_context_init(void);
-
-
-#else
-
-#include <linux/kernel.h>	
-#include <linux/mm.h>	
-#include <linux/sched.h>
-
-/*
- * Copyright (C) 2001 PPC 64 Team, IBM Corp
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-static inline void enter_lazy_tlb(struct mm_struct *mm,
-				  struct task_struct *tsk)
-{
-}
-
-/*
- * The proto-VSID space has 2^35 - 1 segments available for user mappings.
- * Each segment contains 2^28 bytes.  Each context maps 2^44 bytes,
- * so we can support 2^19-1 contexts (19 == 35 + 28 - 44).
- */
-#define NO_CONTEXT	0
-#define MAX_CONTEXT	((1UL << 19) - 1)
-
 extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
 extern void destroy_context(struct mm_struct *mm);
 
+extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next);
 extern void switch_stab(struct task_struct *tsk, struct mm_struct *mm);
 extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
+extern void set_context(unsigned long id, pgd_t *pgd);
 
 /*
  * switch_mm is the entry point called from the architecture independent
@@ -244,22 +30,39 @@ extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 			     struct task_struct *tsk)
 {
-	if (!cpu_isset(smp_processor_id(), next->cpu_vm_mask))
-		cpu_set(smp_processor_id(), next->cpu_vm_mask);
+	/* Mark that this context has been used on the new CPU */
+	cpu_set(smp_processor_id(), next->cpu_vm_mask);
+
+	/* 32-bit keeps track of the current PGDIR in the thread struct */
+#ifdef CONFIG_PPC32
+	tsk->thread.pgdir = next->pgd;
+#endif /* CONFIG_PPC32 */
 
-	/* No need to flush userspace segments if the mm doesnt change */
+	/* Nothing else to do if we aren't actually switching */
 	if (prev == next)
 		return;
 
+	/* We must stop all altivec streams before changing the HW
+	 * context
+	 */
 #ifdef CONFIG_ALTIVEC
 	if (cpu_has_feature(CPU_FTR_ALTIVEC))
 		asm volatile ("dssall");
 #endif /* CONFIG_ALTIVEC */
 
+	/* The actual HW switching method differs between the various
+	 * sub architectures.
+	 */
+#ifdef CONFIG_PPC_STD_MMU_64
 	if (cpu_has_feature(CPU_FTR_SLB))
 		switch_slb(tsk, next);
 	else
 		switch_stab(tsk, next);
+#else
+	/* Out of line for now */
+	switch_mmu_context(prev, next);
+#endif
+
 }
 
 #define deactivate_mm(tsk,mm)	do { } while (0)
@@ -277,6 +80,11 @@ static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
 	local_irq_restore(flags);
 }
 
-#endif /* CONFIG_PPC64 */
+/* We don't currently use enter_lazy_tlb() for anything */
+static inline void enter_lazy_tlb(struct mm_struct *mm,
+				  struct task_struct *tsk)
+{
+}
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_MMU_CONTEXT_H */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 050abfd5c17c..c05ab1d3e620 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -60,6 +60,7 @@ int main(void)
 {
 	DEFINE(THREAD, offsetof(struct task_struct, thread));
 	DEFINE(MM, offsetof(struct task_struct, mm));
+	DEFINE(MMCONTEXTID, offsetof(struct mm_struct, context.id));
 #ifdef CONFIG_PPC64
 	DEFINE(AUDITCONTEXT, offsetof(struct task_struct, audit_context));
 #else
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 0c326823c6d4..be9f9e5470e8 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -31,6 +31,7 @@
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
 #include <asm/ptrace.h>
+#include <asm/bug.h>
 
 /* 601 only have IBAT; cr0.eq is set on 601 when using this macro */
 #define LOAD_BAT(n, reg, RA, RB)	\
@@ -1070,9 +1071,14 @@ start_here:
 	RFI
 
 /*
+ * void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next);
+ *
  * Set up the segment registers for a new context.
  */
-_ENTRY(set_context)
+_ENTRY(switch_mmu_context)
+	lwz	r3,MMCONTEXTID(r4)
+	cmpwi	cr0,r3,0
+	blt-	4f
 	mulli	r3,r3,897	/* multiply context by skew factor */
 	rlwinm	r3,r3,4,8,27	/* VSID = (context & 0xfffff) << 4 */
 	addis	r3,r3,0x6000	/* Set Ks, Ku bits */
@@ -1083,6 +1089,7 @@ _ENTRY(set_context)
 	/* Context switch the PTE pointer for the Abatron BDI2000.
 	 * The PGDIR is passed as second argument.
 	 */
+	lwz	r4,MM_PGD(r4)
 	lis	r5, KERNELBASE@h
 	lwz	r5, 0xf0(r5)
 	stw	r4, 0x4(r5)
@@ -1098,6 +1105,9 @@ _ENTRY(set_context)
 	sync
 	isync
 	blr
+4:	trap
+	EMIT_BUG_ENTRY 4b,__FILE__,__LINE__,0
+	blr
 
 /*
  * An undocumented "feature" of 604e requires that the v bit
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 260089dccfb0..341b3d3048e0 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -174,8 +174,7 @@ EXPORT_SYMBOL(cacheable_memcpy);
 #endif
 
 #ifdef CONFIG_PPC32
-EXPORT_SYMBOL(next_mmu_context);
-EXPORT_SYMBOL(set_context);
+EXPORT_SYMBOL(switch_mmu_context);
 #endif
 
 #ifdef CONFIG_PPC_STD_MMU_32
diff --git a/arch/powerpc/kernel/swsusp.c b/arch/powerpc/kernel/swsusp.c
index 77b7b34b5955..560c96119501 100644
--- a/arch/powerpc/kernel/swsusp.c
+++ b/arch/powerpc/kernel/swsusp.c
@@ -34,6 +34,6 @@ void save_processor_state(void)
 void restore_processor_state(void)
 {
 #ifdef CONFIG_PPC32
-	set_context(current->active_mm->context.id, current->active_mm->pgd);
+	switch_mmu_context(NULL, current->active_mm);
 #endif
 }
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 148de35c9eef..923bd3fa7d64 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -8,15 +8,16 @@ endif
 
 obj-y				:= fault.o mem.o pgtable.o \
 				   init_$(CONFIG_WORD_SIZE).o \
-				   pgtable_$(CONFIG_WORD_SIZE).o \
-				   mmu_context_$(CONFIG_WORD_SIZE).o
+				   pgtable_$(CONFIG_WORD_SIZE).o
+obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o
 hash-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
 obj-$(CONFIG_PPC64)		+= hash_utils_64.o \
 				   slb_low.o slb.o stab.o \
 				   gup.o mmap.o $(hash-y)
 obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o
 obj-$(CONFIG_PPC_STD_MMU)	+= hash_low_$(CONFIG_WORD_SIZE).o \
-				   tlb_hash$(CONFIG_WORD_SIZE).o
+				   tlb_hash$(CONFIG_WORD_SIZE).o \
+				   mmu_context_hash$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_40x)		+= 40x_mmu.o
 obj-$(CONFIG_44x)		+= 44x_mmu.o
 obj-$(CONFIG_FSL_BOOKE)		+= fsl_booke_mmu.o
diff --git a/arch/powerpc/mm/mmu_context_32.c b/arch/powerpc/mm/mmu_context_32.c
deleted file mode 100644
index cc32ba41d900..000000000000
--- a/arch/powerpc/mm/mmu_context_32.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * This file contains the routines for handling the MMU on those
- * PowerPC implementations where the MMU substantially follows the
- * architecture specification.  This includes the 6xx, 7xx, 7xxx,
- * 8260, and POWER3 implementations but excludes the 8xx and 4xx.
- *  -- paulus
- *
- *  Derived from arch/ppc/mm/init.c:
- *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
- *    Copyright (C) 1996 Paul Mackerras
- *
- *  Derived from "arch/i386/mm/init.c"
- *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/mm.h>
-#include <linux/init.h>
-
-#include <asm/mmu_context.h>
-#include <asm/tlbflush.h>
-
-unsigned long next_mmu_context;
-unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
-#ifdef FEW_CONTEXTS
-atomic_t nr_free_contexts;
-struct mm_struct *context_mm[LAST_CONTEXT+1];
-void steal_context(void);
-#endif /* FEW_CONTEXTS */
-
-/*
- * Initialize the context management stuff.
- */
-void __init
-mmu_context_init(void)
-{
-	/*
-	 * Some processors have too few contexts to reserve one for
-	 * init_mm, and require using context 0 for a normal task.
-	 * Other processors reserve the use of context zero for the kernel.
-	 * This code assumes FIRST_CONTEXT < 32.
-	 */
-	context_map[0] = (1 << FIRST_CONTEXT) - 1;
-	next_mmu_context = FIRST_CONTEXT;
-#ifdef FEW_CONTEXTS
-	atomic_set(&nr_free_contexts, LAST_CONTEXT - FIRST_CONTEXT + 1);
-#endif /* FEW_CONTEXTS */
-}
-
-#ifdef FEW_CONTEXTS
-/*
- * Steal a context from a task that has one at the moment.
- * This is only used on 8xx and 4xx and we presently assume that
- * they don't do SMP.  If they do then this will have to check
- * whether the MM we steal is in use.
- * We also assume that this is only used on systems that don't
- * use an MMU hash table - this is true for 8xx and 4xx.
- * This isn't an LRU system, it just frees up each context in
- * turn (sort-of pseudo-random replacement :).  This would be the
- * place to implement an LRU scheme if anyone was motivated to do it.
- *  -- paulus
- */
-void
-steal_context(void)
-{
-	struct mm_struct *mm;
-
-	/* free up context `next_mmu_context' */
-	/* if we shouldn't free context 0, don't... */
-	if (next_mmu_context < FIRST_CONTEXT)
-		next_mmu_context = FIRST_CONTEXT;
-	mm = context_mm[next_mmu_context];
-	flush_tlb_mm(mm);
-	destroy_context(mm);
-}
-#endif /* FEW_CONTEXTS */
diff --git a/arch/powerpc/mm/mmu_context_64.c b/arch/powerpc/mm/mmu_context_64.c
deleted file mode 100644
index 1db38ba1f544..000000000000
--- a/arch/powerpc/mm/mmu_context_64.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- *  MMU context allocation for 64-bit kernels.
- *
- *  Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/idr.h>
-
-#include <asm/mmu_context.h>
-
-static DEFINE_SPINLOCK(mmu_context_lock);
-static DEFINE_IDR(mmu_context_idr);
-
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
-{
-	int index;
-	int err;
-
-again:
-	if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
-		return -ENOMEM;
-
-	spin_lock(&mmu_context_lock);
-	err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index);
-	spin_unlock(&mmu_context_lock);
-
-	if (err == -EAGAIN)
-		goto again;
-	else if (err)
-		return err;
-
-	if (index > MAX_CONTEXT) {
-		spin_lock(&mmu_context_lock);
-		idr_remove(&mmu_context_idr, index);
-		spin_unlock(&mmu_context_lock);
-		return -ENOMEM;
-	}
-
-	/* The old code would re-promote on fork, we don't do that
-	 * when using slices as it could cause problem promoting slices
-	 * that have been forced down to 4K
-	 */
-	if (slice_mm_new_context(mm))
-		slice_set_user_psize(mm, mmu_virtual_psize);
-	mm->context.id = index;
-
-	return 0;
-}
-
-void destroy_context(struct mm_struct *mm)
-{
-	spin_lock(&mmu_context_lock);
-	idr_remove(&mmu_context_idr, mm->context.id);
-	spin_unlock(&mmu_context_lock);
-
-	mm->context.id = NO_CONTEXT;
-}
diff --git a/arch/powerpc/mm/mmu_context_hash32.c b/arch/powerpc/mm/mmu_context_hash32.c
new file mode 100644
index 000000000000..0dfba2bf7f31
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_hash32.c
@@ -0,0 +1,103 @@
+/*
+ * This file contains the routines for handling the MMU on those
+ * PowerPC implementations where the MMU substantially follows the
+ * architecture specification.  This includes the 6xx, 7xx, 7xxx,
+ * 8260, and POWER3 implementations but excludes the 8xx and 4xx.
+ *  -- paulus
+ *
+ *  Derived from arch/ppc/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+
+/*
+ * On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs
+ * (virtual segment identifiers) for each context.  Although the
+ * hardware supports 24-bit VSIDs, and thus >1 million contexts,
+ * we only use 32,768 of them.  That is ample, since there can be
+ * at most around 30,000 tasks in the system anyway, and it means
+ * that we can use a bitmap to indicate which contexts are in use.
+ * Using a bitmap means that we entirely avoid all of the problems
+ * that we used to have when the context number overflowed,
+ * particularly on SMP systems.
+ *  -- paulus.
+ */
+#define NO_CONTEXT      	((unsigned long) -1)
+#define LAST_CONTEXT    	32767
+#define FIRST_CONTEXT    	1
+
+/*
+ * This function defines the mapping from contexts to VSIDs (virtual
+ * segment IDs).  We use a skew on both the context and the high 4 bits
+ * of the 32-bit virtual address (the "effective segment ID") in order
+ * to spread out the entries in the MMU hash table.  Note, if this
+ * function is changed then arch/ppc/mm/hashtable.S will have to be
+ * changed to correspond.
+ *
+ *
+ * CTX_TO_VSID(ctx, va)	(((ctx) * (897 * 16) + ((va) >> 28) * 0x111) \
+ *				 & 0xffffff)
+ */
+
+static unsigned long next_mmu_context;
+static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
+
+
+/*
+ * Set up the context for a new address space.
+ */
+int init_new_context(struct task_struct *t, struct mm_struct *mm)
+{
+	unsigned long ctx = next_mmu_context;
+
+	while (test_and_set_bit(ctx, context_map)) {
+		ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx);
+		if (ctx > LAST_CONTEXT)
+			ctx = 0;
+	}
+	next_mmu_context = (ctx + 1) & LAST_CONTEXT;
+	mm->context.id = ctx;
+
+	return 0;
+}
+
+/*
+ * We're finished using the context for an address space.
+ */
+void destroy_context(struct mm_struct *mm)
+{
+	preempt_disable();
+	if (mm->context.id != NO_CONTEXT) {
+		clear_bit(mm->context.id, context_map);
+		mm->context.id = NO_CONTEXT;
+	}
+	preempt_enable();
+}
+
+/*
+ * Initialize the context management stuff.
+ */
+void __init mmu_context_init(void)
+{
+	/* Reserve context 0 for kernel use */
+	context_map[0] = (1 << FIRST_CONTEXT) - 1;
+	next_mmu_context = FIRST_CONTEXT;
+}
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
new file mode 100644
index 000000000000..dbeb86ac90cd
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -0,0 +1,78 @@
+/*
+ *  MMU context allocation for 64-bit kernels.
+ *
+ *  Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+
+#include <asm/mmu_context.h>
+
+static DEFINE_SPINLOCK(mmu_context_lock);
+static DEFINE_IDR(mmu_context_idr);
+
+/*
+ * The proto-VSID space has 2^35 - 1 segments available for user mappings.
+ * Each segment contains 2^28 bytes.  Each context maps 2^44 bytes,
+ * so we can support 2^19-1 contexts (19 == 35 + 28 - 44).
+ */
+#define NO_CONTEXT	0
+#define MAX_CONTEXT	((1UL << 19) - 1)
+
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+	int index;
+	int err;
+
+again:
+	if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
+		return -ENOMEM;
+
+	spin_lock(&mmu_context_lock);
+	err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index);
+	spin_unlock(&mmu_context_lock);
+
+	if (err == -EAGAIN)
+		goto again;
+	else if (err)
+		return err;
+
+	if (index > MAX_CONTEXT) {
+		spin_lock(&mmu_context_lock);
+		idr_remove(&mmu_context_idr, index);
+		spin_unlock(&mmu_context_lock);
+		return -ENOMEM;
+	}
+
+	/* The old code would re-promote on fork, we don't do that
+	 * when using slices as it could cause problem promoting slices
+	 * that have been forced down to 4K
+	 */
+	if (slice_mm_new_context(mm))
+		slice_set_user_psize(mm, mmu_virtual_psize);
+	mm->context.id = index;
+
+	return 0;
+}
+
+void destroy_context(struct mm_struct *mm)
+{
+	spin_lock(&mmu_context_lock);
+	idr_remove(&mmu_context_idr, mm->context.id);
+	spin_unlock(&mmu_context_lock);
+
+	mm->context.id = NO_CONTEXT;
+}
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
new file mode 100644
index 000000000000..00e02150abef
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -0,0 +1,162 @@
+/*
+ * This file contains the routines for handling the MMU on those
+ * PowerPC implementations where the MMU is not using the hash
+ * table, such as 8xx, 4xx, BookE's etc...
+ *
+ * Copyright 2008 Ben Herrenschmidt <benh@kernel.crashing.org>
+ *                IBM Corp.
+ *
+ *  Derived from previous arch/powerpc/mm/mmu_context.c
+ *  and arch/powerpc/include/asm/mmu_context.h
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+
+/*
+ *   The MPC8xx has only 16 contexts.  We rotate through them on each
+ * task switch.  A better way would be to keep track of tasks that
+ * own contexts, and implement an LRU usage.  That way very active
+ * tasks don't always have to pay the TLB reload overhead.  The
+ * kernel pages are mapped shared, so the kernel can run on behalf
+ * of any task that makes a kernel entry.  Shared does not mean they
+ * are not protected, just that the ASID comparison is not performed.
+ *      -- Dan
+ *
+ * The IBM4xx has 256 contexts, so we can just rotate through these
+ * as a way of "switching" contexts.  If the TID of the TLB is zero,
+ * the PID/TID comparison is disabled, so we can use a TID of zero
+ * to represent all kernel pages as shared among all contexts.
+ * 	-- Dan
+ */
+
+#ifdef CONFIG_8xx
+#define NO_CONTEXT      	16
+#define LAST_CONTEXT    	15
+#define FIRST_CONTEXT    	0
+
+#elif defined(CONFIG_4xx)
+#define NO_CONTEXT      	256
+#define LAST_CONTEXT    	255
+#define FIRST_CONTEXT    	1
+
+#elif defined(CONFIG_E200) || defined(CONFIG_E500)
+#define NO_CONTEXT      	256
+#define LAST_CONTEXT    	255
+#define FIRST_CONTEXT    	1
+
+#else
+#error Unsupported processor type
+#endif
+
+static unsigned long next_mmu_context;
+static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
+static atomic_t nr_free_contexts;
+static struct mm_struct *context_mm[LAST_CONTEXT+1];
+static void steal_context(void);
+
+/* Steal a context from a task that has one at the moment.
+ * This is only used on 8xx and 4xx and we presently assume that
+ * they don't do SMP.  If they do then this will have to check
+ * whether the MM we steal is in use.
+ * We also assume that this is only used on systems that don't
+ * use an MMU hash table - this is true for 8xx and 4xx.
+ * This isn't an LRU system, it just frees up each context in
+ * turn (sort-of pseudo-random replacement :).  This would be the
+ * place to implement an LRU scheme if anyone was motivated to do it.
+ *  -- paulus
+ */
+static void steal_context(void)
+{
+	struct mm_struct *mm;
+
+	/* free up context `next_mmu_context' */
+	/* if we shouldn't free context 0, don't... */
+	if (next_mmu_context < FIRST_CONTEXT)
+		next_mmu_context = FIRST_CONTEXT;
+	mm = context_mm[next_mmu_context];
+	flush_tlb_mm(mm);
+	destroy_context(mm);
+}
+
+
+/*
+ * Get a new mmu context for the address space described by `mm'.
+ */
+static inline void get_mmu_context(struct mm_struct *mm)
+{
+	unsigned long ctx;
+
+	if (mm->context.id != NO_CONTEXT)
+		return;
+
+	while (atomic_dec_if_positive(&nr_free_contexts) < 0)
+		steal_context();
+
+	ctx = next_mmu_context;
+	while (test_and_set_bit(ctx, context_map)) {
+		ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx);
+		if (ctx > LAST_CONTEXT)
+			ctx = 0;
+	}
+	next_mmu_context = (ctx + 1) & LAST_CONTEXT;
+	mm->context.id = ctx;
+	context_mm[ctx] = mm;
+}
+
+void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
+{
+	get_mmu_context(next);
+
+	set_context(next->context.id, next->pgd);
+}
+
+/*
+ * Set up the context for a new address space.
+ */
+int init_new_context(struct task_struct *t, struct mm_struct *mm)
+{
+	mm->context.id = NO_CONTEXT;
+	return 0;
+}
+
+/*
+ * We're finished using the context for an address space.
+ */
+void destroy_context(struct mm_struct *mm)
+{
+	preempt_disable();
+	if (mm->context.id != NO_CONTEXT) {
+		clear_bit(mm->context.id, context_map);
+		mm->context.id = NO_CONTEXT;
+		atomic_inc(&nr_free_contexts);
+	}
+	preempt_enable();
+}
+
+
+/*
+ * Initialize the context management stuff.
+ */
+void __init mmu_context_init(void)
+{
+	/*
+	 * Some processors have too few contexts to reserve one for
+	 * init_mm, and require using context 0 for a normal task.
+	 * Other processors reserve the use of context zero for the kernel.
+	 * This code assumes FIRST_CONTEXT < 32.
+	 */
+	context_map[0] = (1 << FIRST_CONTEXT) - 1;
+	next_mmu_context = FIRST_CONTEXT;
+	atomic_set(&nr_free_contexts, LAST_CONTEXT - FIRST_CONTEXT + 1);
+}
+
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 548efa55c8fe..db61dafb924d 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -195,13 +195,21 @@ config SPE
 
 config PPC_STD_MMU
 	bool
-	depends on 6xx || POWER3 || POWER4 || PPC64
+	depends on 6xx || PPC64
 	default y
 
 config PPC_STD_MMU_32
 	def_bool y
 	depends on PPC_STD_MMU && PPC32
 
+config PPC_STD_MMU_64
+	def_bool y
+	depends on PPC_STD_MMU && PPC64
+
+config PPC_MMU_NOHASH
+	def_bool y
+	depends on !PPC_STD_MMU
+
 config PPC_MM_SLICES
 	bool
 	default y if HUGETLB_PAGE || PPC_64K_PAGES
diff --git a/arch/powerpc/platforms/powermac/cpufreq_32.c b/arch/powerpc/platforms/powermac/cpufreq_32.c
index 792d3ce8112e..65c585b8b00d 100644
--- a/arch/powerpc/platforms/powermac/cpufreq_32.c
+++ b/arch/powerpc/platforms/powermac/cpufreq_32.c
@@ -310,7 +310,7 @@ static int pmu_set_cpu_speed(int low_speed)
  		_set_L3CR(save_l3cr);
 
 	/* Restore userland MMU context */
-	set_context(current->active_mm->context.id, current->active_mm->pgd);
+	switch_mmu_context(NULL, current->active_mm);
 
 #ifdef DEBUG_FREQ
 	printk(KERN_DEBUG "HID1, after: %x\n", mfspr(SPRN_HID1));
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index d524dc245a2c..b40fb9b6c862 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -1814,7 +1814,7 @@ static int powerbook_sleep_grackle(void)
  		_set_L2CR(save_l2cr);
 	
 	/* Restore userland MMU context */
-	set_context(current->active_mm->context.id, current->active_mm->pgd);
+	switch_mmu_context(NULL, current->active_mm);
 
 	/* Power things up */
 	pmu_unlock();
@@ -1903,7 +1903,7 @@ powerbook_sleep_Core99(void)
  		_set_L3CR(save_l3cr);
 	
 	/* Restore userland MMU context */
-	set_context(current->active_mm->context.id, current->active_mm->pgd);
+	switch_mmu_context(NULL, current->active_mm);
 
 	/* Tell PMU we are ready */
 	pmu_unlock();
-- 
cgit v1.2.3


From 2ca8cf738907180e7fbda90f25f32b86feda609f Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 18 Dec 2008 19:13:29 +0000
Subject: powerpc/mm: Rework context management for CPUs with no hash table

This reworks the context management code used by 4xx, 8xx and
Freescale BookE.  It adds support for SMP by implementing the
concept of a stale context map to lazily flush the TLB on
processors where a context may have been invalidated.  This
also contains the groundwork for generalizing such lazy TLB
flushing by just picking up a new PID and marking the old one
stale.  That generalization will be implemented later.

This is a first implementation that uses a global spinlock.

Ideally, we should try to get at least the fast path (context ID
already assigned) lockless or limited to a per-context lock,
but for now this will do.

I tried to keep the UP case reasonably simple to avoid adding
too much overhead to 8xx, which does a lot of context stealing
since it effectively has only 16 PIDs available.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/mmu-40x.h       |   5 +-
 arch/powerpc/include/asm/mmu-44x.h       |   5 +-
 arch/powerpc/include/asm/mmu-8xx.h       |   3 +-
 arch/powerpc/include/asm/mmu-fsl-booke.h |   5 +-
 arch/powerpc/include/asm/tlbflush.h      |   2 +
 arch/powerpc/mm/mmu_context_nohash.c     | 268 +++++++++++++++++++++++++------
 6 files changed, 234 insertions(+), 54 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/include/asm/mmu-40x.h b/arch/powerpc/include/asm/mmu-40x.h
index 3d108676584c..776f415a36aa 100644
--- a/arch/powerpc/include/asm/mmu-40x.h
+++ b/arch/powerpc/include/asm/mmu-40x.h
@@ -54,8 +54,9 @@
 #ifndef __ASSEMBLY__
 
 typedef struct {
-	unsigned long id;
-	unsigned long vdso_base;
+	unsigned int	id;
+	unsigned int	active;
+	unsigned long	vdso_base;
 } mm_context_t;
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/mmu-44x.h b/arch/powerpc/include/asm/mmu-44x.h
index a825524c981a..b21af32ac6d6 100644
--- a/arch/powerpc/include/asm/mmu-44x.h
+++ b/arch/powerpc/include/asm/mmu-44x.h
@@ -56,8 +56,9 @@
 extern unsigned int tlb_44x_hwater;
 
 typedef struct {
-	unsigned long id;
-	unsigned long vdso_base;
+	unsigned int	id;
+	unsigned int	active;
+	unsigned long	vdso_base;
 } mm_context_t;
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/mmu-8xx.h b/arch/powerpc/include/asm/mmu-8xx.h
index 9db877eb88db..07865a357848 100644
--- a/arch/powerpc/include/asm/mmu-8xx.h
+++ b/arch/powerpc/include/asm/mmu-8xx.h
@@ -137,7 +137,8 @@
 
 #ifndef __ASSEMBLY__
 typedef struct {
-	unsigned long id;
+	unsigned int id;
+	unsigned int active;
 	unsigned long vdso_base;
 } mm_context_t;
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/mmu-fsl-booke.h b/arch/powerpc/include/asm/mmu-fsl-booke.h
index 5588a41f439c..3f941c0f7e8e 100644
--- a/arch/powerpc/include/asm/mmu-fsl-booke.h
+++ b/arch/powerpc/include/asm/mmu-fsl-booke.h
@@ -76,8 +76,9 @@
 #ifndef __ASSEMBLY__
 
 typedef struct {
-	unsigned long id;
-	unsigned long vdso_base;
+	unsigned int	id;
+	unsigned int	active;
+	unsigned long	vdso_base;
 } mm_context_t;
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h
index 333c24b54379..9ed363d3de44 100644
--- a/arch/powerpc/include/asm/tlbflush.h
+++ b/arch/powerpc/include/asm/tlbflush.h
@@ -29,6 +29,8 @@
 
 #include <linux/mm.h>
 
+#define MMU_NO_CONTEXT      	((unsigned int)-1)
+
 extern void _tlbie(unsigned long address, unsigned int pid);
 extern void _tlbil_all(void);
 extern void _tlbil_pid(unsigned int pid);
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index 00e02150abef..8b5de52de0ad 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -14,13 +14,28 @@
  *  as published by the Free Software Foundation; either version
  *  2 of the License, or (at your option) any later version.
  *
+ * TODO:
+ *
+ *   - The global context lock will not scale very well
+ *   - The maps should be dynamically allocated to allow for processors
+ *     that support more PID bits at runtime
+ *   - Implement flush_tlb_mm() by making the context stale and picking
+ *     a new one
+ *   - More aggressively clear stale map bits and maybe find some way to
+ *     also clear mm->cpu_vm_mask bits when processes are migrated
  */
 
+#undef DEBUG
+#define DEBUG_STEAL_ONLY
+#undef DEBUG_MAP_CONSISTENCY
+
+#include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/init.h>
 
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
+#include <linux/spinlock.h>
 
 /*
  *   The MPC8xx has only 16 contexts.  We rotate through them on each
@@ -40,17 +55,14 @@
  */
 
 #ifdef CONFIG_8xx
-#define NO_CONTEXT      	16
 #define LAST_CONTEXT    	15
 #define FIRST_CONTEXT    	0
 
 #elif defined(CONFIG_4xx)
-#define NO_CONTEXT      	256
 #define LAST_CONTEXT    	255
 #define FIRST_CONTEXT    	1
 
 #elif defined(CONFIG_E200) || defined(CONFIG_E500)
-#define NO_CONTEXT      	256
 #define LAST_CONTEXT    	255
 #define FIRST_CONTEXT    	1
 
@@ -58,66 +70,208 @@
 #error Unsupported processor type
 #endif
 
-static unsigned long next_mmu_context;
+static unsigned int next_context, nr_free_contexts;
 static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
-static atomic_t nr_free_contexts;
+static unsigned long stale_map[NR_CPUS][LAST_CONTEXT / BITS_PER_LONG + 1];
 static struct mm_struct *context_mm[LAST_CONTEXT+1];
-static void steal_context(void);
+static spinlock_t context_lock = SPIN_LOCK_UNLOCKED;
 
 /* Steal a context from a task that has one at the moment.
- * This is only used on 8xx and 4xx and we presently assume that
- * they don't do SMP.  If they do then this will have to check
- * whether the MM we steal is in use.
- * We also assume that this is only used on systems that don't
- * use an MMU hash table - this is true for 8xx and 4xx.
+ *
+ * This is used when we are running out of available PID numbers
+ * on the processors.
+ *
  * This isn't an LRU system, it just frees up each context in
  * turn (sort-of pseudo-random replacement :).  This would be the
  * place to implement an LRU scheme if anyone was motivated to do it.
  *  -- paulus
+ *
+ * For context stealing, we use a slightly different approach for
+ * SMP and UP. Basically, the UP one is simpler and doesn't use
+ * the stale map as we can just flush the local CPU
+ *  -- benh
  */
-static void steal_context(void)
+#ifdef CONFIG_SMP
+static unsigned int steal_context_smp(unsigned int id)
 {
 	struct mm_struct *mm;
+	unsigned int cpu, max;
 
-	/* free up context `next_mmu_context' */
-	/* if we shouldn't free context 0, don't... */
-	if (next_mmu_context < FIRST_CONTEXT)
-		next_mmu_context = FIRST_CONTEXT;
-	mm = context_mm[next_mmu_context];
-	flush_tlb_mm(mm);
-	destroy_context(mm);
-}
+ again:
+	max = LAST_CONTEXT - FIRST_CONTEXT;
 
+	/* Attempt to free next_context first and then loop until we manage */
+	while (max--) {
+		/* Pick up the victim mm */
+		mm = context_mm[id];
 
-/*
- * Get a new mmu context for the address space described by `mm'.
+		/* We have a candidate victim, check if it's active, on SMP
+		 * we cannot steal active contexts
+		 */
+		if (mm->context.active) {
+			id++;
+			if (id > LAST_CONTEXT)
+				id = FIRST_CONTEXT;
+			continue;
+		}
+		pr_debug("[%d] steal context %d from mm @%p\n",
+			 smp_processor_id(), id, mm);
+
+		/* Mark this mm has having no context anymore */
+		mm->context.id = MMU_NO_CONTEXT;
+
+		/* Mark it stale on all CPUs that used this mm */
+		for_each_cpu_mask_nr(cpu, mm->cpu_vm_mask)
+			__set_bit(id, stale_map[cpu]);
+		return id;
+	}
+
+	/* This will happen if you have more CPUs than available contexts,
+	 * all we can do here is wait a bit and try again
+	 */
+	spin_unlock(&context_lock);
+	cpu_relax();
+	spin_lock(&context_lock);
+	goto again;
+}
+#endif  /* CONFIG_SMP */
+
+/* Note that this will also be called on SMP if all other CPUs are
+ * offlined, which means that it may be called for cpu != 0. For
+ * this to work, we somewhat assume that CPUs that are onlined
+ * come up with a fully clean TLB (or are cleaned when offlined)
  */
-static inline void get_mmu_context(struct mm_struct *mm)
+static unsigned int steal_context_up(unsigned int id)
 {
-	unsigned long ctx;
+	struct mm_struct *mm;
+	int cpu = smp_processor_id();
 
-	if (mm->context.id != NO_CONTEXT)
-		return;
+	/* Pick up the victim mm */
+	mm = context_mm[id];
+
+	pr_debug("[%d] steal context %d from mm @%p\n", cpu, id, mm);
 
-	while (atomic_dec_if_positive(&nr_free_contexts) < 0)
-		steal_context();
+	/* Mark this mm has having no context anymore */
+	mm->context.id = MMU_NO_CONTEXT;
 
-	ctx = next_mmu_context;
-	while (test_and_set_bit(ctx, context_map)) {
-		ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx);
-		if (ctx > LAST_CONTEXT)
-			ctx = 0;
+	/* Flush the TLB for that context */
+	local_flush_tlb_mm(mm);
+
+	/* XXX This clear should ultimately be part of local_flush_tlb_mm */
+	__clear_bit(id, stale_map[cpu]);
+
+	return id;
+}
+
+#ifdef DEBUG_MAP_CONSISTENCY
+static void context_check_map(void)
+{
+	unsigned int id, nrf, nact;
+
+	nrf = nact = 0;
+	for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) {
+		int used = test_bit(id, context_map);
+		if (!used)
+			nrf++;
+		if (used != (context_mm[id] != NULL))
+			pr_err("MMU: Context %d is %s and MM is %p !\n",
+			       id, used ? "used" : "free", context_mm[id]);
+		if (context_mm[id] != NULL)
+			nact += context_mm[id]->context.active;
 	}
-	next_mmu_context = (ctx + 1) & LAST_CONTEXT;
-	mm->context.id = ctx;
-	context_mm[ctx] = mm;
+	if (nrf != nr_free_contexts) {
+		pr_err("MMU: Free context count out of sync ! (%d vs %d)\n",
+		       nr_free_contexts, nrf);
+		nr_free_contexts = nrf;
+	}
+	if (nact > num_online_cpus())
+		pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n",
+		       nact, num_online_cpus());
 }
+#else
+static void context_check_map(void) { }
+#endif
 
 void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 {
-	get_mmu_context(next);
+	unsigned int id, cpu = smp_processor_id();
+	unsigned long *map;
 
-	set_context(next->context.id, next->pgd);
+	/* No lockless fast path .. yet */
+	spin_lock(&context_lock);
+
+#ifndef DEBUG_STEAL_ONLY
+	pr_debug("[%d] activating context for mm @%p, active=%d, id=%d\n",
+		 cpu, next, next->context.active, next->context.id);
+#endif
+
+#ifdef CONFIG_SMP
+	/* Mark us active and the previous one not anymore */
+	next->context.active++;
+	if (prev) {
+		WARN_ON(prev->context.active < 1);
+		prev->context.active--;
+	}
+#endif /* CONFIG_SMP */
+
+	/* If we already have a valid assigned context, skip all that */
+	id = next->context.id;
+	if (likely(id != MMU_NO_CONTEXT))
+		goto ctxt_ok;
+
+	/* We really don't have a context, let's try to acquire one */
+	id = next_context;
+	if (id > LAST_CONTEXT)
+		id = FIRST_CONTEXT;
+	map = context_map;
+
+	/* No more free contexts, let's try to steal one */
+	if (nr_free_contexts == 0) {
+#ifdef CONFIG_SMP
+		if (num_online_cpus() > 1) {
+			id = steal_context_smp(id);
+			goto stolen;
+		}
+#endif /* CONFIG_SMP */
+		id = steal_context_up(id);
+		goto stolen;
+	}
+	nr_free_contexts--;
+
+	/* We know there's at least one free context, try to find it */
+	while (__test_and_set_bit(id, map)) {
+		id = find_next_zero_bit(map, LAST_CONTEXT+1, id);
+		if (id > LAST_CONTEXT)
+			id = FIRST_CONTEXT;
+	}
+ stolen:
+	next_context = id + 1;
+	context_mm[id] = next;
+	next->context.id = id;
+
+#ifndef DEBUG_STEAL_ONLY
+	pr_debug("[%d] picked up new id %d, nrf is now %d\n",
+		 cpu, id, nr_free_contexts);
+#endif
+
+	context_check_map();
+ ctxt_ok:
+
+	/* If that context got marked stale on this CPU, then flush the
+	 * local TLB for it and unmark it before we use it
+	 */
+	if (test_bit(id, stale_map[cpu])) {
+		pr_debug("[%d] flushing stale context %d for mm @%p !\n",
+			 cpu, id, next);
+		local_flush_tlb_mm(next);
+
+		/* XXX This clear should ultimately be part of local_flush_tlb_mm */
+		__clear_bit(id, stale_map[cpu]);
+	}
+
+	/* Flick the MMU and release lock */
+	set_context(id, next->pgd);
+	spin_unlock(&context_lock);
 }
 
 /*
@@ -125,7 +279,9 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
  */
 int init_new_context(struct task_struct *t, struct mm_struct *mm)
 {
-	mm->context.id = NO_CONTEXT;
+	mm->context.id = MMU_NO_CONTEXT;
+	mm->context.active = 0;
+
 	return 0;
 }
 
@@ -134,13 +290,25 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm)
  */
 void destroy_context(struct mm_struct *mm)
 {
-	preempt_disable();
-	if (mm->context.id != NO_CONTEXT) {
-		clear_bit(mm->context.id, context_map);
-		mm->context.id = NO_CONTEXT;
-		atomic_inc(&nr_free_contexts);
+	unsigned int id;
+
+	if (mm->context.id == MMU_NO_CONTEXT)
+		return;
+
+	WARN_ON(mm->context.active != 0);
+
+	spin_lock(&context_lock);
+	id = mm->context.id;
+	if (id != MMU_NO_CONTEXT) {
+		__clear_bit(id, context_map);
+		mm->context.id = MMU_NO_CONTEXT;
+#ifdef DEBUG_MAP_CONSISTENCY
+		mm->context.active = 0;
+		context_mm[id] = NULL;
+#endif
+		nr_free_contexts++;
 	}
-	preempt_enable();
+	spin_unlock(&context_lock);
 }
 
 
@@ -149,6 +317,12 @@ void destroy_context(struct mm_struct *mm)
  */
 void __init mmu_context_init(void)
 {
+	/* Mark init_mm as being active on all possible CPUs since
+	 * we'll get called with prev == init_mm the first time
+	 * we schedule on a given CPU
+	 */
+	init_mm.context.active = NR_CPUS;
+
 	/*
 	 * Some processors have too few contexts to reserve one for
 	 * init_mm, and require using context 0 for a normal task.
@@ -156,7 +330,7 @@ void __init mmu_context_init(void)
 	 * This code assumes FIRST_CONTEXT < 32.
 	 */
 	context_map[0] = (1 << FIRST_CONTEXT) - 1;
-	next_mmu_context = FIRST_CONTEXT;
-	atomic_set(&nr_free_contexts, LAST_CONTEXT - FIRST_CONTEXT + 1);
+	next_context = FIRST_CONTEXT;
+	nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT + 1;
 }
 
-- 
cgit v1.2.3


From 7c03d653cd257793dc40520c94e229b5fd0578e7 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 18 Dec 2008 19:13:32 +0000
Subject: powerpc/mm: Introduce MMU features

We're soon running out of CPU features and I need to add some new
ones for various MMU related bits, so this patch separates the MMU
features from the CPU features.  I moved over the 32-bit MMU related
ones, added base features for MMU type families, but didn't move
over any 64-bit only feature yet.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/cputable.h       |  85 ++++++++++------------
 arch/powerpc/include/asm/feature-fixups.h |  30 ++++++++
 arch/powerpc/include/asm/mmu.h            |  41 +++++++++++
 arch/powerpc/kernel/cputable.c            | 113 ++++++++++++++++++++++++++++++
 arch/powerpc/kernel/head_32.S             |   8 +--
 arch/powerpc/kernel/head_fsl_booke.S      |   4 +-
 arch/powerpc/kernel/module.c              |   6 ++
 arch/powerpc/kernel/setup_32.c            |   4 ++
 arch/powerpc/kernel/setup_64.c            |   2 +
 arch/powerpc/kernel/swsusp_32.S           |   6 +-
 arch/powerpc/kernel/vdso.c                |  10 +++
 arch/powerpc/kernel/vdso32/vdso32.lds.S   |   3 +
 arch/powerpc/kernel/vdso64/vdso64.lds.S   |   3 +
 arch/powerpc/kernel/vmlinux.lds.S         |   6 ++
 arch/powerpc/mm/ppc_mmu_32.c              |   2 +-
 arch/powerpc/platforms/powermac/sleep.S   |   5 +-
 16 files changed, 268 insertions(+), 60 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 31888322d76a..4911104791c3 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -82,6 +82,7 @@ struct cpu_spec {
 	char		*cpu_name;
 	unsigned long	cpu_features;		/* Kernel features */
 	unsigned int	cpu_user_features;	/* Userland features */
+	unsigned int	mmu_features;		/* MMU features */
 
 	/* cache line sizes */
 	unsigned int	icache_bsize;
@@ -144,17 +145,14 @@ extern const char *powerpc_base_platform;
 #define CPU_FTR_USE_TB			ASM_CONST(0x0000000000000040)
 #define CPU_FTR_L2CSR			ASM_CONST(0x0000000000000080)
 #define CPU_FTR_601			ASM_CONST(0x0000000000000100)
-#define CPU_FTR_HPTE_TABLE		ASM_CONST(0x0000000000000200)
 #define CPU_FTR_CAN_NAP			ASM_CONST(0x0000000000000400)
 #define CPU_FTR_L3CR			ASM_CONST(0x0000000000000800)
 #define CPU_FTR_L3_DISABLE_NAP		ASM_CONST(0x0000000000001000)
 #define CPU_FTR_NAP_DISABLE_L2_PR	ASM_CONST(0x0000000000002000)
 #define CPU_FTR_DUAL_PLL_750FX		ASM_CONST(0x0000000000004000)
 #define CPU_FTR_NO_DPM			ASM_CONST(0x0000000000008000)
-#define CPU_FTR_HAS_HIGH_BATS		ASM_CONST(0x0000000000010000)
 #define CPU_FTR_NEED_COHERENT		ASM_CONST(0x0000000000020000)
 #define CPU_FTR_NO_BTIC			ASM_CONST(0x0000000000040000)
-#define CPU_FTR_BIG_PHYS		ASM_CONST(0x0000000000080000)
 #define CPU_FTR_NODSISRALIGN		ASM_CONST(0x0000000000100000)
 #define CPU_FTR_PPC_LE			ASM_CONST(0x0000000000200000)
 #define CPU_FTR_REAL_LE			ASM_CONST(0x0000000000400000)
@@ -266,107 +264,99 @@ extern const char *powerpc_base_platform;
 		     !defined(CONFIG_POWER3) && !defined(CONFIG_POWER4) && \
 		     !defined(CONFIG_BOOKE))
 
-#define CPU_FTRS_PPC601	(CPU_FTR_COMMON | CPU_FTR_601 | CPU_FTR_HPTE_TABLE | \
+#define CPU_FTRS_PPC601	(CPU_FTR_COMMON | CPU_FTR_601 | \
 	CPU_FTR_COHERENT_ICACHE | CPU_FTR_UNIFIED_ID_CACHE)
 #define CPU_FTRS_603	(CPU_FTR_COMMON | \
 	    CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | \
 	    CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE)
 #define CPU_FTRS_604	(CPU_FTR_COMMON | \
-	    CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE | CPU_FTR_PPC_LE)
+	    CPU_FTR_USE_TB | CPU_FTR_PPC_LE)
 #define CPU_FTRS_740_NOTAU	(CPU_FTR_COMMON | \
 	    CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | CPU_FTR_L2CR | \
-	    CPU_FTR_HPTE_TABLE | CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE)
+	    CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE)
 #define CPU_FTRS_740	(CPU_FTR_COMMON | \
 	    CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | CPU_FTR_L2CR | \
-	    CPU_FTR_TAU | CPU_FTR_HPTE_TABLE | CPU_FTR_MAYBE_CAN_NAP | \
+	    CPU_FTR_TAU | CPU_FTR_MAYBE_CAN_NAP | \
 	    CPU_FTR_PPC_LE)
 #define CPU_FTRS_750	(CPU_FTR_COMMON | \
 	    CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | CPU_FTR_L2CR | \
-	    CPU_FTR_TAU | CPU_FTR_HPTE_TABLE | CPU_FTR_MAYBE_CAN_NAP | \
+	    CPU_FTR_TAU | CPU_FTR_MAYBE_CAN_NAP | \
 	    CPU_FTR_PPC_LE)
-#define CPU_FTRS_750CL	(CPU_FTRS_750 | CPU_FTR_HAS_HIGH_BATS)
+#define CPU_FTRS_750CL	(CPU_FTRS_750)
 #define CPU_FTRS_750FX1	(CPU_FTRS_750 | CPU_FTR_DUAL_PLL_750FX | CPU_FTR_NO_DPM)
 #define CPU_FTRS_750FX2	(CPU_FTRS_750 | CPU_FTR_NO_DPM)
-#define CPU_FTRS_750FX	(CPU_FTRS_750 | CPU_FTR_DUAL_PLL_750FX | \
-		CPU_FTR_HAS_HIGH_BATS)
+#define CPU_FTRS_750FX	(CPU_FTRS_750 | CPU_FTR_DUAL_PLL_750FX)
 #define CPU_FTRS_750GX	(CPU_FTRS_750FX)
 #define CPU_FTRS_7400_NOTAU	(CPU_FTR_COMMON | \
 	    CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | CPU_FTR_L2CR | \
-	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_HPTE_TABLE | \
+	    CPU_FTR_ALTIVEC_COMP | \
 	    CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE)
 #define CPU_FTRS_7400	(CPU_FTR_COMMON | \
 	    CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | CPU_FTR_L2CR | \
-	    CPU_FTR_TAU | CPU_FTR_ALTIVEC_COMP | CPU_FTR_HPTE_TABLE | \
+	    CPU_FTR_TAU | CPU_FTR_ALTIVEC_COMP | \
 	    CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE)
 #define CPU_FTRS_7450_20	(CPU_FTR_COMMON | \
 	    CPU_FTR_USE_TB | CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | \
-	    CPU_FTR_L3CR | CPU_FTR_HPTE_TABLE | CPU_FTR_SPEC7450 | \
+	    CPU_FTR_L3CR | CPU_FTR_SPEC7450 | \
 	    CPU_FTR_NEED_COHERENT | CPU_FTR_PPC_LE | CPU_FTR_NEED_PAIRED_STWCX)
 #define CPU_FTRS_7450_21	(CPU_FTR_COMMON | \
 	    CPU_FTR_USE_TB | \
 	    CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | \
-	    CPU_FTR_L3CR | CPU_FTR_HPTE_TABLE | CPU_FTR_SPEC7450 | \
+	    CPU_FTR_L3CR | CPU_FTR_SPEC7450 | \
 	    CPU_FTR_NAP_DISABLE_L2_PR | CPU_FTR_L3_DISABLE_NAP | \
 	    CPU_FTR_NEED_COHERENT | CPU_FTR_PPC_LE | CPU_FTR_NEED_PAIRED_STWCX)
 #define CPU_FTRS_7450_23	(CPU_FTR_COMMON | \
 	    CPU_FTR_USE_TB | CPU_FTR_NEED_PAIRED_STWCX | \
 	    CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | \
-	    CPU_FTR_L3CR | CPU_FTR_HPTE_TABLE | CPU_FTR_SPEC7450 | \
+	    CPU_FTR_L3CR | CPU_FTR_SPEC7450 | \
 	    CPU_FTR_NAP_DISABLE_L2_PR | CPU_FTR_NEED_COHERENT | CPU_FTR_PPC_LE)
 #define CPU_FTRS_7455_1	(CPU_FTR_COMMON | \
 	    CPU_FTR_USE_TB | CPU_FTR_NEED_PAIRED_STWCX | \
 	    CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | CPU_FTR_L3CR | \
-	    CPU_FTR_HPTE_TABLE | CPU_FTR_SPEC7450 | CPU_FTR_HAS_HIGH_BATS | \
-	    CPU_FTR_NEED_COHERENT | CPU_FTR_PPC_LE)
+	    CPU_FTR_SPEC7450 | CPU_FTR_NEED_COHERENT | CPU_FTR_PPC_LE)
 #define CPU_FTRS_7455_20	(CPU_FTR_COMMON | \
 	    CPU_FTR_USE_TB | CPU_FTR_NEED_PAIRED_STWCX | \
 	    CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | \
-	    CPU_FTR_L3CR | CPU_FTR_HPTE_TABLE | CPU_FTR_SPEC7450 | \
+	    CPU_FTR_L3CR | CPU_FTR_SPEC7450 | \
 	    CPU_FTR_NAP_DISABLE_L2_PR | CPU_FTR_L3_DISABLE_NAP | \
-	    CPU_FTR_NEED_COHERENT | CPU_FTR_HAS_HIGH_BATS | CPU_FTR_PPC_LE)
+	    CPU_FTR_NEED_COHERENT | CPU_FTR_PPC_LE)
 #define CPU_FTRS_7455	(CPU_FTR_COMMON | \
 	    CPU_FTR_USE_TB | \
 	    CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | \
-	    CPU_FTR_L3CR | CPU_FTR_HPTE_TABLE | CPU_FTR_SPEC7450 | \
-	    CPU_FTR_NAP_DISABLE_L2_PR | CPU_FTR_HAS_HIGH_BATS | \
+	    CPU_FTR_L3CR | CPU_FTR_SPEC7450 | CPU_FTR_NAP_DISABLE_L2_PR | \
 	    CPU_FTR_NEED_COHERENT | CPU_FTR_PPC_LE | CPU_FTR_NEED_PAIRED_STWCX)
 #define CPU_FTRS_7447_10	(CPU_FTR_COMMON | \
 	    CPU_FTR_USE_TB | \
 	    CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | \
-	    CPU_FTR_L3CR | CPU_FTR_HPTE_TABLE | CPU_FTR_SPEC7450 | \
-	    CPU_FTR_NAP_DISABLE_L2_PR | CPU_FTR_HAS_HIGH_BATS | \
+	    CPU_FTR_L3CR | CPU_FTR_SPEC7450 | CPU_FTR_NAP_DISABLE_L2_PR | \
 	    CPU_FTR_NEED_COHERENT | CPU_FTR_NO_BTIC | CPU_FTR_PPC_LE | \
 	    CPU_FTR_NEED_PAIRED_STWCX)
 #define CPU_FTRS_7447	(CPU_FTR_COMMON | \
 	    CPU_FTR_USE_TB | \
 	    CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | \
-	    CPU_FTR_L3CR | CPU_FTR_HPTE_TABLE | CPU_FTR_SPEC7450 | \
-	    CPU_FTR_NAP_DISABLE_L2_PR | CPU_FTR_HAS_HIGH_BATS | \
+	    CPU_FTR_L3CR | CPU_FTR_SPEC7450 | CPU_FTR_NAP_DISABLE_L2_PR | \
 	    CPU_FTR_NEED_COHERENT | CPU_FTR_PPC_LE | CPU_FTR_NEED_PAIRED_STWCX)
 #define CPU_FTRS_7447A	(CPU_FTR_COMMON | \
 	    CPU_FTR_USE_TB | \
 	    CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | \
-	    CPU_FTR_HPTE_TABLE | CPU_FTR_SPEC7450 | \
-	    CPU_FTR_NAP_DISABLE_L2_PR | CPU_FTR_HAS_HIGH_BATS | \
+	    CPU_FTR_SPEC7450 | CPU_FTR_NAP_DISABLE_L2_PR | \
 	    CPU_FTR_NEED_COHERENT | CPU_FTR_PPC_LE | CPU_FTR_NEED_PAIRED_STWCX)
 #define CPU_FTRS_7448	(CPU_FTR_COMMON | \
 	    CPU_FTR_USE_TB | \
 	    CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | \
-	    CPU_FTR_HPTE_TABLE | CPU_FTR_SPEC7450 | \
-	    CPU_FTR_NAP_DISABLE_L2_PR | CPU_FTR_HAS_HIGH_BATS | \
+	    CPU_FTR_SPEC7450 | CPU_FTR_NAP_DISABLE_L2_PR | \
 	    CPU_FTR_PPC_LE | CPU_FTR_NEED_PAIRED_STWCX)
 #define CPU_FTRS_82XX	(CPU_FTR_COMMON | \
 	    CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB)
 #define CPU_FTRS_G2_LE	(CPU_FTR_COMMON | CPU_FTR_MAYBE_CAN_DOZE | \
-	    CPU_FTR_USE_TB | CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_HAS_HIGH_BATS)
+	    CPU_FTR_USE_TB | CPU_FTR_MAYBE_CAN_NAP)
 #define CPU_FTRS_E300	(CPU_FTR_MAYBE_CAN_DOZE | \
-	    CPU_FTR_USE_TB | CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_HAS_HIGH_BATS | \
+	    CPU_FTR_USE_TB | CPU_FTR_MAYBE_CAN_NAP | \
 	    CPU_FTR_COMMON)
 #define CPU_FTRS_E300C2	(CPU_FTR_MAYBE_CAN_DOZE | \
-	    CPU_FTR_USE_TB | CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_HAS_HIGH_BATS | \
+	    CPU_FTR_USE_TB | CPU_FTR_MAYBE_CAN_NAP | \
 	    CPU_FTR_COMMON | CPU_FTR_FPU_UNAVAILABLE)
-#define CPU_FTRS_CLASSIC32	(CPU_FTR_COMMON | \
-	    CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE)
+#define CPU_FTRS_CLASSIC32	(CPU_FTR_COMMON | CPU_FTR_USE_TB)
 #define CPU_FTRS_8XX	(CPU_FTR_USE_TB)
 #define CPU_FTRS_40X	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE)
 #define CPU_FTRS_44X	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE)
@@ -379,55 +369,54 @@ extern const char *powerpc_base_platform;
 	    CPU_FTR_SPE_COMP | CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_NODSISRALIGN | \
 	    CPU_FTR_NOEXECUTE)
 #define CPU_FTRS_E500_2	(CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | \
-	    CPU_FTR_SPE_COMP | CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_BIG_PHYS | \
+	    CPU_FTR_SPE_COMP | CPU_FTR_MAYBE_CAN_NAP | \
 	    CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE)
 #define CPU_FTRS_E500MC	(CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | \
-	    CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_BIG_PHYS | CPU_FTR_NODSISRALIGN | \
+	    CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_NODSISRALIGN | \
 	    CPU_FTR_L2CSR | CPU_FTR_LWSYNC | CPU_FTR_NOEXECUTE)
 #define CPU_FTRS_GENERIC_32	(CPU_FTR_COMMON | CPU_FTR_NODSISRALIGN)
 
 /* 64-bit CPUs */
 #define CPU_FTRS_POWER3	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
-	    CPU_FTR_HPTE_TABLE | CPU_FTR_IABR | CPU_FTR_PPC_LE)
+	    CPU_FTR_IABR | CPU_FTR_PPC_LE)
 #define CPU_FTRS_RS64	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
-	    CPU_FTR_HPTE_TABLE | CPU_FTR_IABR | \
+	    CPU_FTR_IABR | \
 	    CPU_FTR_MMCRA | CPU_FTR_CTRL)
 #define CPU_FTRS_POWER4	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
-	    CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
+	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	    CPU_FTR_MMCRA | CPU_FTR_CP_USE_DCBTZ)
 #define CPU_FTRS_PPC970	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
-	    CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
+	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_CAN_NAP | CPU_FTR_MMCRA | \
 	    CPU_FTR_CP_USE_DCBTZ)
 #define CPU_FTRS_POWER5	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
-	    CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
+	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	    CPU_FTR_MMCRA | CPU_FTR_SMT | \
 	    CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \
 	    CPU_FTR_PURR)
 #define CPU_FTRS_POWER6 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
-	    CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
+	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	    CPU_FTR_MMCRA | CPU_FTR_SMT | \
 	    CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \
 	    CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
 	    CPU_FTR_DSCR | CPU_FTR_UNALIGNED_LD_STD)
 #define CPU_FTRS_POWER7 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
-	    CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
+	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	    CPU_FTR_MMCRA | CPU_FTR_SMT | \
 	    CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \
 	    CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
 	    CPU_FTR_DSCR | CPU_FTR_SAO)
 #define CPU_FTRS_CELL	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
-	    CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
+	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
 	    CPU_FTR_PAUSE_ZERO | CPU_FTR_CI_LARGE_PAGE | \
 	    CPU_FTR_CELL_TB_BUG | CPU_FTR_CP_USE_DCBTZ | \
 	    CPU_FTR_UNALIGNED_LD_STD)
 #define CPU_FTRS_PA6T (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
-	    CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | \
+	    CPU_FTR_PPCAS_ARCH_V2 | \
 	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_CI_LARGE_PAGE | \
 	    CPU_FTR_PURR | CPU_FTR_REAL_LE | CPU_FTR_NO_SLBIE_B)
-#define CPU_FTRS_COMPATIBLE	(CPU_FTR_USE_TB | \
-	    CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2)
+#define CPU_FTRS_COMPATIBLE	(CPU_FTR_USE_TB | CPU_FTR_PPCAS_ARCH_V2)
 
 #ifdef __powerpc64__
 #define CPU_FTRS_POSSIBLE	\
diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h
index a1029967620b..e4094a5cb05b 100644
--- a/arch/powerpc/include/asm/feature-fixups.h
+++ b/arch/powerpc/include/asm/feature-fixups.h
@@ -81,6 +81,36 @@ label##5:					       	\
 #define ALT_FTR_SECTION_END_IFCLR(msk)	\
 	ALT_FTR_SECTION_END_NESTED_IFCLR(msk, 97)
 
+/* MMU feature dependent sections */
+#define BEGIN_MMU_FTR_SECTION_NESTED(label)	START_FTR_SECTION(label)
+#define BEGIN_MMU_FTR_SECTION			START_FTR_SECTION(97)
+
+#define END_MMU_FTR_SECTION_NESTED(msk, val, label) 		\
+	FTR_SECTION_ELSE_NESTED(label)				\
+	MAKE_FTR_SECTION_ENTRY(msk, val, label, __mmu_ftr_fixup)
+
+#define END_MMU_FTR_SECTION(msk, val)		\
+	END_MMU_FTR_SECTION_NESTED(msk, val, 97)
+
+#define END_MMU_FTR_SECTION_IFSET(msk)	END_MMU_FTR_SECTION((msk), (msk))
+#define END_MMU_FTR_SECTION_IFCLR(msk)	END_MMU_FTR_SECTION((msk), 0)
+
+/* MMU feature sections with alternatives, use BEGIN_FTR_SECTION to start */
+#define MMU_FTR_SECTION_ELSE_NESTED(label)	FTR_SECTION_ELSE_NESTED(label)
+#define MMU_FTR_SECTION_ELSE	MMU_FTR_SECTION_ELSE_NESTED(97)
+#define ALT_MMU_FTR_SECTION_END_NESTED(msk, val, label)	\
+	MAKE_FTR_SECTION_ENTRY(msk, val, label, __mmu_ftr_fixup)
+#define ALT_MMU_FTR_SECTION_END_NESTED_IFSET(msk, label)	\
+	ALT_MMU_FTR_SECTION_END_NESTED(msk, msk, label)
+#define ALT_MMU_FTR_SECTION_END_NESTED_IFCLR(msk, label)	\
+	ALT_MMU_FTR_SECTION_END_NESTED(msk, 0, label)
+#define ALT_MMU_FTR_SECTION_END(msk, val)	\
+	ALT_MMU_FTR_SECTION_END_NESTED(msk, val, 97)
+#define ALT_MMU_FTR_SECTION_END_IFSET(msk)	\
+	ALT_MMU_FTR_SECTION_END_NESTED_IFSET(msk, 97)
+#define ALT_MMU_FTR_SECTION_END_IFCLR(msk)	\
+	ALT_MMU_FTR_SECTION_END_NESTED_IFCLR(msk, 97)
+
 /* Firmware feature dependent sections */
 #define BEGIN_FW_FTR_SECTION_NESTED(label)	START_FTR_SECTION(label)
 #define BEGIN_FW_FTR_SECTION			START_FTR_SECTION(97)
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 4c0e1b4f975c..dc8c0aef5e6c 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -2,6 +2,47 @@
 #define _ASM_POWERPC_MMU_H_
 #ifdef __KERNEL__
 
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
+
+/*
+ * MMU features bit definitions
+ */
+
+/*
+ * First half is MMU families
+ */
+#define MMU_FTR_HPTE_TABLE		ASM_CONST(0x00000001)
+#define MMU_FTR_TYPE_8xx		ASM_CONST(0x00000002)
+#define MMU_FTR_TYPE_40x		ASM_CONST(0x00000004)
+#define MMU_FTR_TYPE_44x		ASM_CONST(0x00000008)
+#define MMU_FTR_TYPE_FSL_E		ASM_CONST(0x00000010)
+
+/*
+ * These are individual features
+ */
+
+/* Enable use of high BAT registers */
+#define MMU_FTR_USE_HIGH_BATS		ASM_CONST(0x00010000)
+
+/* Enable >32-bit physical addresses on 32-bit processors; currently only
+ * used by CONFIG_6xx, as BookE has supported that from day 1
+ */
+#define MMU_FTR_BIG_PHYS		ASM_CONST(0x00020000)
+
+#ifndef __ASSEMBLY__
+#include <asm/cputable.h>
+
+static inline int mmu_has_feature(unsigned long feature)
+{
+	return (cur_cpu_spec->mmu_features & feature);
+}
+
+extern unsigned int __start___mmu_ftr_fixup, __stop___mmu_ftr_fixup;
+
+#endif /* !__ASSEMBLY__ */
+
+
 #ifdef CONFIG_PPC64
 /* 64-bit classic hash table MMU */
 #  include <asm/mmu-hash64.h>
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 921a2298d8e3..923f87aff20a 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -19,6 +19,7 @@
 #include <asm/oprofile_impl.h>
 #include <asm/cputable.h>
 #include <asm/prom.h>		/* for PTRRELOC on ARCH=ppc */
+#include <asm/mmu.h>
 
 struct cpu_spec* cur_cpu_spec = NULL;
 EXPORT_SYMBOL(cur_cpu_spec);
@@ -94,6 +95,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "POWER3 (630)",
 		.cpu_features		= CPU_FTRS_POWER3,
 		.cpu_user_features	= COMMON_USER_PPC64|PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 8,
@@ -109,6 +111,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "POWER3 (630+)",
 		.cpu_features		= CPU_FTRS_POWER3,
 		.cpu_user_features	= COMMON_USER_PPC64|PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 8,
@@ -124,6 +127,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "RS64-II (northstar)",
 		.cpu_features		= CPU_FTRS_RS64,
 		.cpu_user_features	= COMMON_USER_PPC64,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 8,
@@ -139,6 +143,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "RS64-III (pulsar)",
 		.cpu_features		= CPU_FTRS_RS64,
 		.cpu_user_features	= COMMON_USER_PPC64,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 8,
@@ -154,6 +159,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "RS64-III (icestar)",
 		.cpu_features		= CPU_FTRS_RS64,
 		.cpu_user_features	= COMMON_USER_PPC64,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 8,
@@ -169,6 +175,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "RS64-IV (sstar)",
 		.cpu_features		= CPU_FTRS_RS64,
 		.cpu_user_features	= COMMON_USER_PPC64,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 8,
@@ -184,6 +191,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "POWER4 (gp)",
 		.cpu_features		= CPU_FTRS_POWER4,
 		.cpu_user_features	= COMMON_USER_POWER4,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 8,
@@ -199,6 +207,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "POWER4+ (gq)",
 		.cpu_features		= CPU_FTRS_POWER4,
 		.cpu_user_features	= COMMON_USER_POWER4,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 8,
@@ -215,6 +224,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_PPC970,
 		.cpu_user_features	= COMMON_USER_POWER4 |
 			PPC_FEATURE_HAS_ALTIVEC_COMP,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 8,
@@ -233,6 +243,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_PPC970,
 		.cpu_user_features	= COMMON_USER_POWER4 |
 			PPC_FEATURE_HAS_ALTIVEC_COMP,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 8,
@@ -251,6 +262,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_PPC970,
 		.cpu_user_features	= COMMON_USER_POWER4 |
 			PPC_FEATURE_HAS_ALTIVEC_COMP,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 8,
@@ -269,6 +281,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_PPC970,
 		.cpu_user_features	= COMMON_USER_POWER4 |
 			PPC_FEATURE_HAS_ALTIVEC_COMP,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 8,
@@ -287,6 +300,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_PPC970,
 		.cpu_user_features	= COMMON_USER_POWER4 |
 			PPC_FEATURE_HAS_ALTIVEC_COMP,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 8,
@@ -303,6 +317,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "POWER5 (gr)",
 		.cpu_features		= CPU_FTRS_POWER5,
 		.cpu_user_features	= COMMON_USER_POWER5,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 6,
@@ -323,6 +338,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "POWER5+ (gs)",
 		.cpu_features		= CPU_FTRS_POWER5,
 		.cpu_user_features	= COMMON_USER_POWER5_PLUS,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 6,
@@ -339,6 +355,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "POWER5+ (gs)",
 		.cpu_features		= CPU_FTRS_POWER5,
 		.cpu_user_features	= COMMON_USER_POWER5_PLUS,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 6,
@@ -356,6 +373,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "POWER5+",
 		.cpu_features		= CPU_FTRS_POWER5,
 		.cpu_user_features	= COMMON_USER_POWER5_PLUS,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.machine_check		= machine_check_generic,
@@ -369,6 +387,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_POWER6,
 		.cpu_user_features	= COMMON_USER_POWER6 |
 			PPC_FEATURE_POWER6_EXT,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 6,
@@ -388,6 +407,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "POWER6 (architected)",
 		.cpu_features		= CPU_FTRS_POWER6,
 		.cpu_user_features	= COMMON_USER_POWER6,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.machine_check		= machine_check_generic,
@@ -400,6 +420,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "POWER7 (architected)",
 		.cpu_features		= CPU_FTRS_POWER7,
 		.cpu_user_features	= COMMON_USER_POWER7,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.machine_check		= machine_check_generic,
@@ -412,6 +433,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "POWER7 (raw)",
 		.cpu_features		= CPU_FTRS_POWER7,
 		.cpu_user_features	= COMMON_USER_POWER7,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 6,
@@ -434,6 +456,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_user_features	= COMMON_USER_PPC64 |
 			PPC_FEATURE_CELL | PPC_FEATURE_HAS_ALTIVEC_COMP |
 			PPC_FEATURE_SMT,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 4,
@@ -449,6 +472,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "PA6T",
 		.cpu_features		= CPU_FTRS_PA6T,
 		.cpu_user_features	= COMMON_USER_PA6T,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 64,
 		.dcache_bsize		= 64,
 		.num_pmcs		= 6,
@@ -466,6 +490,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "POWER4 (compatible)",
 		.cpu_features		= CPU_FTRS_COMPATIBLE,
 		.cpu_user_features	= COMMON_USER_PPC64,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.num_pmcs		= 6,
@@ -483,6 +508,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_PPC601,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_601_INSTR |
 			PPC_FEATURE_UNIFIED_CACHE | PPC_FEATURE_NO_TB,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_generic,
@@ -494,6 +520,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "603",
 		.cpu_features		= CPU_FTRS_603,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= 0,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_603,
@@ -506,6 +533,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "603e",
 		.cpu_features		= CPU_FTRS_603,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= 0,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_603,
@@ -518,6 +546,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "603ev",
 		.cpu_features		= CPU_FTRS_603,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= 0,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_603,
@@ -530,6 +559,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "604",
 		.cpu_features		= CPU_FTRS_604,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 2,
@@ -543,6 +573,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "604e",
 		.cpu_features		= CPU_FTRS_604,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -556,6 +587,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "604r",
 		.cpu_features		= CPU_FTRS_604,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -569,6 +601,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "604ev",
 		.cpu_features		= CPU_FTRS_604,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -582,6 +615,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "740/750",
 		.cpu_features		= CPU_FTRS_740_NOTAU,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -595,6 +629,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "750CX",
 		.cpu_features		= CPU_FTRS_750,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -608,6 +643,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "750CX",
 		.cpu_features		= CPU_FTRS_750,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -622,6 +658,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "750CXe",
 		.cpu_features		= CPU_FTRS_750,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -636,6 +673,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "750CXe",
 		.cpu_features		= CPU_FTRS_750,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -650,6 +688,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "750CL",
 		.cpu_features		= CPU_FTRS_750CL,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -664,6 +703,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "745/755",
 		.cpu_features		= CPU_FTRS_750,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -678,6 +718,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "750FX",
 		.cpu_features		= CPU_FTRS_750FX1,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -692,6 +733,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "750FX",
 		.cpu_features		= CPU_FTRS_750FX2,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -706,6 +748,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "750FX",
 		.cpu_features		= CPU_FTRS_750FX,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -720,6 +763,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "750GX",
 		.cpu_features		= CPU_FTRS_750GX,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -734,6 +778,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "740/750",
 		.cpu_features		= CPU_FTRS_740,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -749,6 +794,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_7400_NOTAU,
 		.cpu_user_features	= COMMON_USER |
 			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -764,6 +810,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_7400,
 		.cpu_user_features	= COMMON_USER |
 			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -779,6 +826,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_7400,
 		.cpu_user_features	= COMMON_USER |
 			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -794,6 +842,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_7450_20,
 		.cpu_user_features	= COMMON_USER |
 			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 6,
@@ -811,6 +860,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_7450_21,
 		.cpu_user_features	= COMMON_USER |
 			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 6,
@@ -828,6 +878,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_7450_23,
 		.cpu_user_features	= COMMON_USER |
 			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 6,
@@ -845,6 +896,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_7455_1,
 		.cpu_user_features	= COMMON_USER |
 			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 6,
@@ -862,6 +914,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_7455_20,
 		.cpu_user_features	= COMMON_USER |
 			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 6,
@@ -879,6 +932,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_7455,
 		.cpu_user_features	= COMMON_USER |
 			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 6,
@@ -896,6 +950,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_7447_10,
 		.cpu_user_features	= COMMON_USER |
 			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 6,
@@ -913,6 +968,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_7447_10,
 		.cpu_user_features	= COMMON_USER |
 			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 6,
@@ -929,6 +985,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "7447/7457",
 		.cpu_features		= CPU_FTRS_7447,
 		.cpu_user_features	= COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 6,
@@ -946,6 +1003,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_7447A,
 		.cpu_user_features	= COMMON_USER |
 			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 6,
@@ -963,6 +1021,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_7448,
 		.cpu_user_features	= COMMON_USER |
 			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 6,
@@ -979,6 +1038,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "82xx",
 		.cpu_features		= CPU_FTRS_82XX,
 		.cpu_user_features	= COMMON_USER,
+		.mmu_features		= 0,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_603,
@@ -991,6 +1051,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "G2_LE",
 		.cpu_features		= CPU_FTRS_G2_LE,
 		.cpu_user_features	= COMMON_USER,
+		.mmu_features		= MMU_FTR_USE_HIGH_BATS,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_603,
@@ -1003,6 +1064,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "e300c1",
 		.cpu_features		= CPU_FTRS_E300,
 		.cpu_user_features	= COMMON_USER,
+		.mmu_features		= MMU_FTR_USE_HIGH_BATS,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_603,
@@ -1015,6 +1077,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "e300c2",
 		.cpu_features		= CPU_FTRS_E300C2,
 		.cpu_user_features	= PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU,
+		.mmu_features		= MMU_FTR_USE_HIGH_BATS,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_603,
@@ -1027,6 +1090,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "e300c3",
 		.cpu_features		= CPU_FTRS_E300,
 		.cpu_user_features	= COMMON_USER,
+		.mmu_features		= MMU_FTR_USE_HIGH_BATS,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_603,
@@ -1041,6 +1105,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "e300c4",
 		.cpu_features		= CPU_FTRS_E300,
 		.cpu_user_features	= COMMON_USER,
+		.mmu_features		= MMU_FTR_USE_HIGH_BATS,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_603,
@@ -1056,6 +1121,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "(generic PPC)",
 		.cpu_features		= CPU_FTRS_CLASSIC32,
 		.cpu_user_features	= COMMON_USER,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_generic,
@@ -1071,6 +1137,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		 * if the 8xx code is there.... */
 		.cpu_features		= CPU_FTRS_8XX,
 		.cpu_user_features	= PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU,
+		.mmu_features		= MMU_FTR_TYPE_8xx,
 		.icache_bsize		= 16,
 		.dcache_bsize		= 16,
 		.platform		= "ppc823",
@@ -1083,6 +1150,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "403GC",
 		.cpu_features		= CPU_FTRS_40X,
 		.cpu_user_features	= PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU,
+		.mmu_features		= MMU_FTR_TYPE_40x,
 		.icache_bsize		= 16,
 		.dcache_bsize		= 16,
 		.machine_check		= machine_check_4xx,
@@ -1095,6 +1163,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_40X,
 		.cpu_user_features	= PPC_FEATURE_32 |
 		 	PPC_FEATURE_HAS_MMU | PPC_FEATURE_NO_TB,
+		.mmu_features		= MMU_FTR_TYPE_40x,
 		.icache_bsize		= 16,
 		.dcache_bsize		= 16,
 		.machine_check		= machine_check_4xx,
@@ -1106,6 +1175,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "403G ??",
 		.cpu_features		= CPU_FTRS_40X,
 		.cpu_user_features	= PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU,
+		.mmu_features		= MMU_FTR_TYPE_40x,
 		.icache_bsize		= 16,
 		.dcache_bsize		= 16,
 		.machine_check		= machine_check_4xx,
@@ -1118,6 +1188,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_40X,
 		.cpu_user_features	= PPC_FEATURE_32 |
 			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1130,6 +1201,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_40X,
 		.cpu_user_features	= PPC_FEATURE_32 |
 			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1142,6 +1214,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_40X,
 		.cpu_user_features	= PPC_FEATURE_32 |
 			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1154,6 +1227,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_40X,
 		.cpu_user_features	= PPC_FEATURE_32 |
 			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1166,6 +1240,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_40X,
 		.cpu_user_features	= PPC_FEATURE_32 |
 			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1178,6 +1253,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_40X,
 		.cpu_user_features	= PPC_FEATURE_32 |
 			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1190,6 +1266,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_40X,
 		.cpu_user_features	= PPC_FEATURE_32 |
 			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1202,6 +1279,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_40X,
 		.cpu_user_features	= PPC_FEATURE_32 |
 			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1213,6 +1291,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "405LP",
 		.cpu_features		= CPU_FTRS_40X,
 		.cpu_user_features	= PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU,
+		.mmu_features		= MMU_FTR_TYPE_40x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1225,6 +1304,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_40X,
 		.cpu_user_features	= PPC_FEATURE_32 |
 			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1237,6 +1317,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_40X,
 		.cpu_user_features	= PPC_FEATURE_32 |
 			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1249,6 +1330,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_40X,
 		.cpu_user_features	= PPC_FEATURE_32 |
 			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1261,6 +1343,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_40X,
 		.cpu_user_features	= PPC_FEATURE_32 |
 			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1273,6 +1356,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_40X,
 		.cpu_user_features	= PPC_FEATURE_32 |
 			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1286,6 +1370,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_40X,
 		.cpu_user_features	= PPC_FEATURE_32 |
 			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1298,6 +1383,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_40X,
 		.cpu_user_features	= PPC_FEATURE_32 |
 			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1312,6 +1398,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "440GR Rev. A",
 		.cpu_features		= CPU_FTRS_44X,
 		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1323,6 +1410,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "440EP Rev. A",
 		.cpu_features		= CPU_FTRS_44X,
 		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_440ep,
@@ -1335,6 +1423,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "440GR Rev. B",
 		.cpu_features		= CPU_FTRS_44X,
 		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1346,6 +1435,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "440EP Rev. C",
 		.cpu_features		= CPU_FTRS_44X,
 		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_440ep,
@@ -1358,6 +1448,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "440EP Rev. B",
 		.cpu_features		= CPU_FTRS_44X,
 		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_440ep,
@@ -1370,6 +1461,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "440GRX",
 		.cpu_features		= CPU_FTRS_44X,
 		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_440grx,
@@ -1382,6 +1474,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "440EPX",
 		.cpu_features		= CPU_FTRS_44X,
 		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_440epx,
@@ -1394,6 +1487,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "440GP Rev. B",
 		.cpu_features		= CPU_FTRS_44X,
 		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1405,6 +1499,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "440GP Rev. C",
 		.cpu_features		= CPU_FTRS_44X,
 		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1416,6 +1511,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "440GX Rev. A",
 		.cpu_features		= CPU_FTRS_44X,
 		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_440gx,
@@ -1428,6 +1524,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "440GX Rev. B",
 		.cpu_features		= CPU_FTRS_44X,
 		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_440gx,
@@ -1440,6 +1537,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "440GX Rev. C",
 		.cpu_features		= CPU_FTRS_44X,
 		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_440gx,
@@ -1452,6 +1550,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "440GX Rev. F",
 		.cpu_features		= CPU_FTRS_44X,
 		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_440gx,
@@ -1464,6 +1563,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "440SP Rev. A",
 		.cpu_features		= CPU_FTRS_44X,
 		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1475,6 +1575,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name               = "440SPe Rev. A",
 		.cpu_features		= CPU_FTRS_44X,
 		.cpu_user_features      = COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize           = 32,
 		.dcache_bsize           = 32,
 		.cpu_setup		= __setup_cpu_440spe,
@@ -1487,6 +1588,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "440SPe Rev. B",
 		.cpu_features		= CPU_FTRS_44X,
 		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_440spe,
@@ -1499,6 +1601,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "440 in Virtex-5 FXT",
 		.cpu_features		= CPU_FTRS_44X,
 		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_440x5,
@@ -1511,6 +1614,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "460EX",
 		.cpu_features		= CPU_FTRS_440x6,
 		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_460ex,
@@ -1523,6 +1627,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "460GT",
 		.cpu_features		= CPU_FTRS_440x6,
 		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.cpu_setup		= __setup_cpu_460gt,
@@ -1535,6 +1640,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "(generic 44x PPC)",
 		.cpu_features		= CPU_FTRS_44X,
 		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_4xx,
@@ -1551,6 +1657,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_user_features	= COMMON_USER_BOOKE |
 			PPC_FEATURE_HAS_EFP_SINGLE |
 			PPC_FEATURE_UNIFIED_CACHE,
+		.mmu_features		= MMU_FTR_TYPE_FSL_E,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_e200,
 		.platform		= "ppc5554",
@@ -1565,6 +1672,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 			PPC_FEATURE_HAS_SPE_COMP |
 			PPC_FEATURE_HAS_EFP_SINGLE_COMP |
 			PPC_FEATURE_UNIFIED_CACHE,
+		.mmu_features		= MMU_FTR_TYPE_FSL_E,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_e200,
 		.platform		= "ppc5554",
@@ -1577,6 +1685,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_user_features	= COMMON_USER_BOOKE |
 			PPC_FEATURE_HAS_EFP_SINGLE |
 			PPC_FEATURE_UNIFIED_CACHE,
+		.mmu_features		= MMU_FTR_TYPE_FSL_E,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_e200,
 		.platform		= "ppc5554",
@@ -1591,6 +1700,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_user_features	= COMMON_USER_BOOKE |
 			PPC_FEATURE_HAS_SPE_COMP |
 			PPC_FEATURE_HAS_EFP_SINGLE_COMP,
+		.mmu_features		= MMU_FTR_TYPE_FSL_E,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -1608,6 +1718,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 			PPC_FEATURE_HAS_SPE_COMP |
 			PPC_FEATURE_HAS_EFP_SINGLE_COMP |
 			PPC_FEATURE_HAS_EFP_DOUBLE_COMP,
+		.mmu_features		= MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.num_pmcs		= 4,
@@ -1622,6 +1733,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "e500mc",
 		.cpu_features		= CPU_FTRS_E500MC,
 		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS,
 		.icache_bsize		= 64,
 		.dcache_bsize		= 64,
 		.num_pmcs		= 4,
@@ -1638,6 +1750,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_user_features	= COMMON_USER_BOOKE |
 			PPC_FEATURE_HAS_SPE_COMP |
 			PPC_FEATURE_HAS_EFP_SINGLE_COMP,
+		.mmu_features		= MMU_FTR_TYPE_FSL_E,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
 		.machine_check		= machine_check_e500,
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index be9f9e5470e8..266061924654 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -990,12 +990,12 @@ load_up_mmu:
 	LOAD_BAT(1,r3,r4,r5)
 	LOAD_BAT(2,r3,r4,r5)
 	LOAD_BAT(3,r3,r4,r5)
-BEGIN_FTR_SECTION
+BEGIN_MMU_FTR_SECTION
 	LOAD_BAT(4,r3,r4,r5)
 	LOAD_BAT(5,r3,r4,r5)
 	LOAD_BAT(6,r3,r4,r5)
 	LOAD_BAT(7,r3,r4,r5)
-END_FTR_SECTION_IFSET(CPU_FTR_HAS_HIGH_BATS)
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS)
 	blr
 
 /*
@@ -1141,7 +1141,7 @@ clear_bats:
 	mtspr	SPRN_IBAT2L,r10
 	mtspr	SPRN_IBAT3U,r10
 	mtspr	SPRN_IBAT3L,r10
-BEGIN_FTR_SECTION
+BEGIN_MMU_FTR_SECTION
 	/* Here's a tweak: at this point, CPU setup have
 	 * not been called yet, so HIGH_BAT_EN may not be
 	 * set in HID0 for the 745x processors. However, it
@@ -1164,7 +1164,7 @@ BEGIN_FTR_SECTION
 	mtspr	SPRN_IBAT6L,r10
 	mtspr	SPRN_IBAT7U,r10
 	mtspr	SPRN_IBAT7L,r10
-END_FTR_SECTION_IFSET(CPU_FTR_HAS_HIGH_BATS)
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS)
 	blr
 
 flush_tlbs:
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index 9a4639c459e6..11b549acc034 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -767,10 +767,10 @@ finish_tlb_load:
 	rlwimi	r12, r13, 24, 0, 7	/* grab RPN[32:39] */
 	rlwimi	r12, r11, 24, 8, 19	/* grab RPN[40:51] */
 	mtspr	SPRN_MAS3, r12
-BEGIN_FTR_SECTION
+BEGIN_MMU_FTR_SECTION
 	srwi	r10, r13, 8		/* grab RPN[8:31] */
 	mtspr	SPRN_MAS7, r10
-END_FTR_SECTION_IFSET(CPU_FTR_BIG_PHYS)
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
 #else
 	rlwimi	r11, r12, 0, 20, 31	/* Extract RPN from PTE and merge with perms */
 	mtspr	SPRN_MAS3, r11
diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index 7ff292475269..43e7e3a7f130 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -78,6 +78,12 @@ int module_finalize(const Elf_Ehdr *hdr,
 				  (void *)sect->sh_addr,
 				  (void *)sect->sh_addr + sect->sh_size);
 
+	sect = find_section(hdr, sechdrs, "__mmu_ftr_fixup");
+	if (sect != NULL)
+		do_feature_fixups(cur_cpu_spec->mmu_features,
+				  (void *)sect->sh_addr,
+				  (void *)sect->sh_addr + sect->sh_size);
+
 #ifdef CONFIG_PPC64
 	sect = find_section(hdr, sechdrs, "__fw_ftr_fixup");
 	if (sect != NULL)
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 086c23c1ee5e..b14c2a3e2185 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -97,6 +97,10 @@ notrace unsigned long __init early_init(unsigned long dt_ptr)
 			  PTRRELOC(&__start___ftr_fixup),
 			  PTRRELOC(&__stop___ftr_fixup));
 
+	do_feature_fixups(spec->mmu_features,
+			  PTRRELOC(&__start___mmu_ftr_fixup),
+			  PTRRELOC(&__stop___mmu_ftr_fixup));
+
 	do_lwsync_fixups(spec->cpu_features,
 			 PTRRELOC(&__start___lwsync_fixup),
 			 PTRRELOC(&__stop___lwsync_fixup));
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index ce48f5c5c542..d8bd2161e738 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -361,6 +361,8 @@ void __init setup_system(void)
 	 */
 	do_feature_fixups(cur_cpu_spec->cpu_features,
 			  &__start___ftr_fixup, &__stop___ftr_fixup);
+	do_feature_fixups(cur_cpu_spec->mmu_features,
+			  &__start___mmu_ftr_fixup, &__stop___mmu_ftr_fixup);
 	do_feature_fixups(powerpc_firmware_features,
 			  &__start___fw_ftr_fixup, &__stop___fw_ftr_fixup);
 	do_lwsync_fixups(cur_cpu_spec->cpu_features,
diff --git a/arch/powerpc/kernel/swsusp_32.S b/arch/powerpc/kernel/swsusp_32.S
index 77fc76607ab2..b47d8ceffb52 100644
--- a/arch/powerpc/kernel/swsusp_32.S
+++ b/arch/powerpc/kernel/swsusp_32.S
@@ -5,7 +5,7 @@
 #include <asm/thread_info.h>
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
-
+#include <asm/mmu.h>
 
 /*
  * Structure for storing CPU registers on the save area.
@@ -279,7 +279,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	mtibatl	3,r4
 #endif
 
-BEGIN_FTR_SECTION
+BEGIN_MMU_FTR_SECTION
 	li	r4,0
 	mtspr	SPRN_DBAT4U,r4
 	mtspr	SPRN_DBAT4L,r4
@@ -297,7 +297,7 @@ BEGIN_FTR_SECTION
 	mtspr	SPRN_IBAT6L,r4
 	mtspr	SPRN_IBAT7U,r4
 	mtspr	SPRN_IBAT7L,r4
-END_FTR_SECTION_IFSET(CPU_FTR_HAS_HIGH_BATS)
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS)
 
 	/* Flush all TLBs */
 	lis	r4,0x1000
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 65639a43e644..e619d424f73d 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -567,6 +567,11 @@ static __init int vdso_fixup_features(struct lib32_elfinfo *v32,
 		do_feature_fixups(cur_cpu_spec->cpu_features,
 				  start64, start64 + size64);
 
+	start64 = find_section64(v64->hdr, "__mmu_ftr_fixup", &size64);
+	if (start64)
+		do_feature_fixups(cur_cpu_spec->mmu_features,
+				  start64, start64 + size64);
+
 	start64 = find_section64(v64->hdr, "__fw_ftr_fixup", &size64);
 	if (start64)
 		do_feature_fixups(powerpc_firmware_features,
@@ -583,6 +588,11 @@ static __init int vdso_fixup_features(struct lib32_elfinfo *v32,
 		do_feature_fixups(cur_cpu_spec->cpu_features,
 				  start32, start32 + size32);
 
+	start32 = find_section32(v32->hdr, "__mmu_ftr_fixup", &size32);
+	if (start32)
+		do_feature_fixups(cur_cpu_spec->mmu_features,
+				  start32, start32 + size32);
+
 #ifdef CONFIG_PPC64
 	start32 = find_section32(v32->hdr, "__fw_ftr_fixup", &size32);
 	if (start32)
diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S
index be3b6a41dc09..904ef1360dd7 100644
--- a/arch/powerpc/kernel/vdso32/vdso32.lds.S
+++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S
@@ -33,6 +33,9 @@ SECTIONS
 	. = ALIGN(8);
 	__ftr_fixup	: { *(__ftr_fixup) }
 
+	. = ALIGN(8);
+	__mmu_ftr_fixup	: { *(__mmu_ftr_fixup) }
+
 	. = ALIGN(8);
 	__lwsync_fixup	: { *(__lwsync_fixup) }
 
diff --git a/arch/powerpc/kernel/vdso64/vdso64.lds.S b/arch/powerpc/kernel/vdso64/vdso64.lds.S
index d0b2526dd38d..0e615404e247 100644
--- a/arch/powerpc/kernel/vdso64/vdso64.lds.S
+++ b/arch/powerpc/kernel/vdso64/vdso64.lds.S
@@ -34,6 +34,9 @@ SECTIONS
 	. = ALIGN(8);
 	__ftr_fixup	: { *(__ftr_fixup) }
 
+	. = ALIGN(8);
+	__mmu_ftr_fixup	: { *(__mmu_ftr_fixup) }
+
 	. = ALIGN(8);
 	__lwsync_fixup	: { *(__lwsync_fixup) }
 
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 2412c056baa4..47bf15cd2c9e 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -152,6 +152,12 @@ SECTIONS
 		__stop___ftr_fixup = .;
 	}
 	. = ALIGN(8);
+	__mmu_ftr_fixup : AT(ADDR(__mmu_ftr_fixup) - LOAD_OFFSET) {
+		__start___mmu_ftr_fixup = .;
+		*(__mmu_ftr_fixup)
+		__stop___mmu_ftr_fixup = .;
+	}
+	. = ALIGN(8);
 	__lwsync_fixup : AT(ADDR(__lwsync_fixup) - LOAD_OFFSET) {
 		__start___lwsync_fixup = .;
 		*(__lwsync_fixup)
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index 6aa120813775..9d97db7b7cf7 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -192,7 +192,7 @@ void __init MMU_init_hw(void)
 	extern unsigned int hash_page[];
 	extern unsigned int flush_hash_patch_A[], flush_hash_patch_B[];
 
-	if (!cpu_has_feature(CPU_FTR_HPTE_TABLE)) {
+	if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) {
 		/*
 		 * Put a blr (procedure return) instruction at the
 		 * start of hash_page, since we can still get DSI
diff --git a/arch/powerpc/platforms/powermac/sleep.S b/arch/powerpc/platforms/powermac/sleep.S
index adee28da353f..1c2802fabd57 100644
--- a/arch/powerpc/platforms/powermac/sleep.S
+++ b/arch/powerpc/platforms/powermac/sleep.S
@@ -17,6 +17,7 @@
 #include <asm/cache.h>
 #include <asm/thread_info.h>
 #include <asm/asm-offsets.h>
+#include <asm/mmu.h>
 
 #define MAGIC	0x4c617273	/* 'Lars' */
 
@@ -323,7 +324,7 @@ grackle_wake_up:
 	lwz	r4,SL_IBAT3+4(r1)
 	mtibatl	3,r4
 
-BEGIN_FTR_SECTION
+BEGIN_MMU_FTR_SECTION
 	li	r4,0
 	mtspr	SPRN_DBAT4U,r4
 	mtspr	SPRN_DBAT4L,r4
@@ -341,7 +342,7 @@ BEGIN_FTR_SECTION
 	mtspr	SPRN_IBAT6L,r4
 	mtspr	SPRN_IBAT7U,r4
 	mtspr	SPRN_IBAT7L,r4
-END_FTR_SECTION_IFSET(CPU_FTR_HAS_HIGH_BATS)
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS)
 
 	/* Flush all TLBs */
 	lis	r4,0x1000
-- 
cgit v1.2.3


From f048aace29e007f2b642097e2da8231e0e9cce2d Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 18 Dec 2008 19:13:38 +0000
Subject: powerpc/mm: Add SMP support to no-hash TLB handling

This commit moves the whole no-hash TLB handling out of line into a
new tlb_nohash.c file, and implements some basic SMP support using
IPIs and/or broadcast tlbivax instructions.

Note that I'm using local invalidations for D->I cache coherency.

At worst, if another processor is trying to execute from the same page
and has the old entry in its TLB, it will just take a fault and re-do
the TLB flush locally (it won't re-do the cache flush in any case).

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/highmem.h  |   4 +-
 arch/powerpc/include/asm/mmu.h      |  16 +++
 arch/powerpc/include/asm/tlbflush.h |  84 +++++++--------
 arch/powerpc/kernel/misc_32.S       |   9 ++
 arch/powerpc/kernel/ppc_ksyms.c     |   6 --
 arch/powerpc/mm/Makefile            |   2 +-
 arch/powerpc/mm/fault.c             |   2 +-
 arch/powerpc/mm/mem.c               |   2 +-
 arch/powerpc/mm/tlb_hash32.c        |   4 +
 arch/powerpc/mm/tlb_nohash.c        | 209 ++++++++++++++++++++++++++++++++++++
 10 files changed, 281 insertions(+), 57 deletions(-)
 create mode 100644 arch/powerpc/mm/tlb_nohash.c

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h
index 7dc52eca8b67..fd97e501aa6a 100644
--- a/arch/powerpc/include/asm/highmem.h
+++ b/arch/powerpc/include/asm/highmem.h
@@ -85,7 +85,7 @@ static inline void *kmap_atomic_prot(struct page *page, enum km_type type, pgpro
 	BUG_ON(!pte_none(*(kmap_pte-idx)));
 #endif
 	__set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
-	local_flush_tlb_page(vaddr);
+	local_flush_tlb_page(NULL, vaddr);
 
 	return (void*) vaddr;
 }
@@ -113,7 +113,7 @@ static inline void kunmap_atomic(void *kvaddr, enum km_type type)
 	 * this pte without first remap it
 	 */
 	pte_clear(&init_mm, vaddr, kmap_pte-idx);
-	local_flush_tlb_page(vaddr);
+	local_flush_tlb_page(NULL, vaddr);
 #endif
 	pagefault_enable();
 }
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index dc8c0aef5e6c..6e7639911318 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -30,6 +30,22 @@
  */
 #define MMU_FTR_BIG_PHYS		ASM_CONST(0x00020000)
 
+/* Enable use of broadcast TLB invalidations. We don't always set it
+ * on processors that support it due to other constraints with the
+ * use of such invalidations
+ */
+#define MMU_FTR_USE_TLBIVAX_BCAST	ASM_CONST(0x00040000)
+
+/* Enable use of tlbilx invalidate-by-PID variant.
+ */
+#define MMU_FTR_USE_TLBILX_PID		ASM_CONST(0x00080000)
+
+/* This indicates that the processor cannot handle multiple outstanding
+ * broadcast tlbivax or tlbsync. This makes the code use a spinlock
+ * around such invalidate forms.
+ */
+#define MMU_FTR_LOCK_BCAST_INVAL	ASM_CONST(0x00100000)
+
 #ifndef __ASSEMBLY__
 #include <asm/cputable.h>
 
diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h
index 9ed363d3de44..8c39b27c1ed7 100644
--- a/arch/powerpc/include/asm/tlbflush.h
+++ b/arch/powerpc/include/asm/tlbflush.h
@@ -6,7 +6,9 @@
  *
  *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
  *  - flush_tlb_page(vma, vmaddr) flushes one page
- *  - local_flush_tlb_page(vmaddr) flushes one page on the local processor
+ *  - local_flush_tlb_mm(mm) flushes the specified mm context on
+ *                           the local processor
+ *  - local_flush_tlb_page(vma, vmaddr) flushes one page on the local processor
  *  - flush_tlb_page_nohash(vma, vmaddr) flushes one page if SW loaded TLB
  *  - flush_tlb_range(vma, start, end) flushes a range of pages
  *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
@@ -18,7 +20,7 @@
  */
 #ifdef __KERNEL__
 
-#if defined(CONFIG_4xx) || defined(CONFIG_8xx) || defined(CONFIG_FSL_BOOKE)
+#ifdef CONFIG_PPC_MMU_NOHASH
 /*
  * TLB flushing for software loaded TLB chips
  *
@@ -31,10 +33,10 @@
 
 #define MMU_NO_CONTEXT      	((unsigned int)-1)
 
-extern void _tlbie(unsigned long address, unsigned int pid);
 extern void _tlbil_all(void);
 extern void _tlbil_pid(unsigned int pid);
 extern void _tlbil_va(unsigned long address, unsigned int pid);
+extern void _tlbivax_bcast(unsigned long address, unsigned int pid);
 
 #if defined(CONFIG_40x) || defined(CONFIG_8xx)
 #define _tlbia()	asm volatile ("tlbia; sync" : : : "memory")
@@ -42,48 +44,26 @@ extern void _tlbil_va(unsigned long address, unsigned int pid);
 extern void _tlbia(void);
 #endif
 
-static inline void local_flush_tlb_mm(struct mm_struct *mm)
-{
-	_tlbil_pid(mm->context.id);
-}
-
-static inline void flush_tlb_mm(struct mm_struct *mm)
-{
-	_tlbil_pid(mm->context.id);
-}
-
-static inline void local_flush_tlb_page(unsigned long vmaddr)
-{
-	_tlbil_va(vmaddr, 0);
-}
-
-static inline void flush_tlb_page(struct vm_area_struct *vma,
-				  unsigned long vmaddr)
-{
-	_tlbil_va(vmaddr, vma ? vma->vm_mm->context.id : 0);
-}
+extern void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+			    unsigned long end);
+extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
 
-static inline void flush_tlb_page_nohash(struct vm_area_struct *vma,
-					 unsigned long vmaddr)
-{
-	flush_tlb_page(vma, vmaddr);
-}
+extern void local_flush_tlb_mm(struct mm_struct *mm);
+extern void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
 
-static inline void flush_tlb_range(struct vm_area_struct *vma,
-				   unsigned long start, unsigned long end)
-{
-	_tlbil_pid(vma->vm_mm->context.id);
-}
+#ifdef CONFIG_SMP
+extern void flush_tlb_mm(struct mm_struct *mm);
+extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
+#else
+#define flush_tlb_mm(mm)		local_flush_tlb_mm(mm)
+#define flush_tlb_page(vma,addr)	local_flush_tlb_page(vma,addr)
+#endif
+#define flush_tlb_page_nohash(vma,addr)	flush_tlb_page(vma,addr)
 
-static inline void flush_tlb_kernel_range(unsigned long start,
-					  unsigned long end)
-{
-	_tlbil_pid(0);
-}
+#elif defined(CONFIG_PPC_STD_MMU_32)
 
-#elif defined(CONFIG_PPC32)
 /*
- * TLB flushing for "classic" hash-MMMU 32-bit CPUs, 6xx, 7xx, 7xxx
+ * TLB flushing for "classic" hash-MMU 32-bit CPUs, 6xx, 7xx, 7xxx
  */
 extern void _tlbie(unsigned long address);
 extern void _tlbia(void);
@@ -94,14 +74,20 @@ extern void flush_tlb_page_nohash(struct vm_area_struct *vma, unsigned long addr
 extern void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 			    unsigned long end);
 extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
-static inline void local_flush_tlb_page(unsigned long vmaddr)
+static inline void local_flush_tlb_page(struct vm_area_struct *vma,
+					unsigned long vmaddr)
 {
-	flush_tlb_page(NULL, vmaddr);
+	flush_tlb_page(vma, vmaddr);
+}
+static inline void local_flush_tlb_mm(struct mm_struct *mm)
+{
+	flush_tlb_mm(mm);
 }
 
-#else
+#elif defined(CONFIG_PPC_STD_MMU_64)
+
 /*
- * TLB flushing for 64-bit has-MMU CPUs
+ * TLB flushing for 64-bit hash-MMU CPUs
  */
 
 #include <linux/percpu.h>
@@ -151,11 +137,16 @@ extern void flush_hash_page(unsigned long va, real_pte_t pte, int psize,
 extern void flush_hash_range(unsigned long number, int local);
 
 
+static inline void local_flush_tlb_mm(struct mm_struct *mm)
+{
+}
+
 static inline void flush_tlb_mm(struct mm_struct *mm)
 {
 }
 
-static inline void local_flush_tlb_page(unsigned long vmaddr)
+static inline void local_flush_tlb_page(struct vm_area_struct *vma,
+					unsigned long vmaddr)
 {
 }
 
@@ -183,7 +174,8 @@ static inline void flush_tlb_kernel_range(unsigned long start,
 extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
 				     unsigned long end);
 
-
+#else
+#error Unsupported MMU type
 #endif
 
 #endif /*__KERNEL__ */
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index 5c33bc14bd9f..2c2ab89f0b64 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -29,6 +29,7 @@
 #include <asm/asm-offsets.h>
 #include <asm/processor.h>
 #include <asm/kexec.h>
+#include <asm/bug.h>
 
 	.text
 
@@ -496,6 +497,14 @@ _GLOBAL(_tlbil_va)
 	blr
 #endif /* CONFIG_FSL_BOOKE */
 
+/*
+ * Nobody implements this yet
+ */
+_GLOBAL(_tlbivax_bcast)
+1:	trap
+	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0;
+	blr
+
 
 /*
  * Flush instruction cache.
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 341b3d3048e0..dcec1325d340 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -116,12 +116,6 @@ EXPORT_SYMBOL(giveup_spe);
 
 #ifndef CONFIG_PPC64
 EXPORT_SYMBOL(flush_instruction_cache);
-EXPORT_SYMBOL(flush_tlb_kernel_range);
-EXPORT_SYMBOL(flush_tlb_page);
-EXPORT_SYMBOL(_tlbie);
-#if defined(CONFIG_4xx) || defined(CONFIG_8xx) || defined(CONFIG_FSL_BOOKE)
-EXPORT_SYMBOL(_tlbil_va);
-#endif
 #endif
 EXPORT_SYMBOL(__flush_icache_range);
 EXPORT_SYMBOL(flush_dcache_range);
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 923bd3fa7d64..af987df8d5a3 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -9,7 +9,7 @@ endif
 obj-y				:= fault.o mem.o pgtable.o \
 				   init_$(CONFIG_WORD_SIZE).o \
 				   pgtable_$(CONFIG_WORD_SIZE).o
-obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o
+obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o
 hash-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
 obj-$(CONFIG_PPC64)		+= hash_utils_64.o \
 				   slb_low.o slb.o stab.o \
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 7df0409107ad..87f1f955dea4 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -284,7 +284,7 @@ good_area:
 				}
 				pte_update(ptep, 0, _PAGE_HWEXEC |
 					   _PAGE_ACCESSED);
-				_tlbie(address, mm->context.id);
+				local_flush_tlb_page(vma, address);
 				pte_unmap_unlock(ptep, ptl);
 				up_read(&mm->mmap_sem);
 				return 0;
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index b9e1a1da6e52..8fee696fb795 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -488,7 +488,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 		 * we invalidate the TLB here, thus avoiding dcbst
 		 * misbehaviour.
 		 */
-		_tlbie(address, 0 /* 8xx doesn't care about PID */);
+		_tlbil_va(address, 0 /* 8xx doesn't care about PID */);
 #endif
 		/* The _PAGE_USER test should really be _PAGE_EXEC, but
 		 * older glibc versions execute some code from no-exec
diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c
index f9a47fee3927..65190587a365 100644
--- a/arch/powerpc/mm/tlb_hash32.c
+++ b/arch/powerpc/mm/tlb_hash32.c
@@ -137,6 +137,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 	flush_range(&init_mm, start, end);
 	FINISH_FLUSH;
 }
+EXPORT_SYMBOL(flush_tlb_kernel_range);
 
 /*
  * Flush all the (user) entries for the address space described by mm.
@@ -160,6 +161,7 @@ void flush_tlb_mm(struct mm_struct *mm)
 		flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
 	FINISH_FLUSH;
 }
+EXPORT_SYMBOL(flush_tlb_mm);
 
 void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
 {
@@ -176,6 +178,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
 		flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
 	FINISH_FLUSH;
 }
+EXPORT_SYMBOL(flush_tlb_page);
 
 /*
  * For each address in the range, find the pte for the address
@@ -188,3 +191,4 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 	flush_range(vma->vm_mm, start, end);
 	FINISH_FLUSH;
 }
+EXPORT_SYMBOL(flush_tlb_range);
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
new file mode 100644
index 000000000000..803a64c02b06
--- /dev/null
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -0,0 +1,209 @@
+/*
+ * This file contains the routines for TLB flushing.
+ * On machines where the MMU does not use a hash table to store virtual to
+ * physical translations (ie, SW loaded TLBs or Book3E compliant processors,
+ * this does -not- include 603 however which shares the implementation with
+ * hash based processors)
+ *
+ *  -- BenH
+ *
+ * Copyright 2008 Ben Herrenschmidt <benh@kernel.crashing.org>
+ *                IBM Corp.
+ *
+ *  Derived from arch/ppc/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/preempt.h>
+#include <linux/spinlock.h>
+
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+#include "mmu_decl.h"
+
+/*
+ * Base TLB flushing operations:
+ *
+ *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ *  - flush_tlb_page(vma, vmaddr) flushes one page
+ *  - flush_tlb_range(vma, start, end) flushes a range of pages
+ *  - flush_tlb_kernel_range(start, end) flushes kernel pages
+ *
+ *  - local_* variants of page and mm only apply to the current
+ *    processor
+ */
+
+/*
+ * These are the base non-SMP variants of page and mm flushing
+ */
+void local_flush_tlb_mm(struct mm_struct *mm)
+{
+	unsigned int pid;
+
+	preempt_disable();
+	pid = mm->context.id;
+	if (pid != MMU_NO_CONTEXT)
+		_tlbil_pid(pid);
+	preempt_enable();
+}
+EXPORT_SYMBOL(local_flush_tlb_mm);
+
+void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	unsigned int pid;
+
+	preempt_disable();
+	pid = vma ? vma->vm_mm->context.id : 0;
+	if (pid != MMU_NO_CONTEXT)
+		_tlbil_va(vmaddr, pid);
+	preempt_enable();
+}
+EXPORT_SYMBOL(local_flush_tlb_page);
+
+
+/*
+ * And here are the SMP non-local implementations
+ */
+#ifdef CONFIG_SMP
+
+static DEFINE_SPINLOCK(tlbivax_lock);
+
+struct tlb_flush_param {
+	unsigned long addr;
+	unsigned int pid;
+};
+
+static void do_flush_tlb_mm_ipi(void *param)
+{
+	struct tlb_flush_param *p = param;
+
+	_tlbil_pid(p ? p->pid : 0);
+}
+
+static void do_flush_tlb_page_ipi(void *param)
+{
+	struct tlb_flush_param *p = param;
+
+	_tlbil_va(p->addr, p->pid);
+}
+
+
+/* Note on invalidations and PID:
+ *
+ * We snapshot the PID with preempt disabled. At this point, it can still
+ * change either because:
+ * - our context is being stolen (PID -> NO_CONTEXT) on another CPU
+ * - we are invalidating some target that isn't currently running here
+ *   and is concurrently acquiring a new PID on another CPU
+ * - some other CPU is re-acquiring a lost PID for this mm
+ * etc...
+ *
+ * However, this shouldn't be a problem as we only guarantee
+ * invalidation of TLB entries present prior to this call, so we
+ * don't care about the PID changing, and invalidating a stale PID
+ * is generally harmless.
+ */
+
+void flush_tlb_mm(struct mm_struct *mm)
+{
+	cpumask_t cpu_mask;
+	unsigned int pid;
+
+	preempt_disable();
+	pid = mm->context.id;
+	if (unlikely(pid == MMU_NO_CONTEXT))
+		goto no_context;
+	cpu_mask = mm->cpu_vm_mask;
+	cpu_clear(smp_processor_id(), cpu_mask);
+	if (!cpus_empty(cpu_mask)) {
+		struct tlb_flush_param p = { .pid = pid };
+		smp_call_function_mask(cpu_mask, do_flush_tlb_mm_ipi, &p, 1);
+	}
+	_tlbil_pid(pid);
+ no_context:
+	preempt_enable();
+}
+EXPORT_SYMBOL(flush_tlb_mm);
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	cpumask_t cpu_mask;
+	unsigned int pid;
+
+	preempt_disable();
+	pid = vma ? vma->vm_mm->context.id : 0;
+	if (unlikely(pid == MMU_NO_CONTEXT))
+		goto bail;
+	cpu_mask = vma->vm_mm->cpu_vm_mask;
+	cpu_clear(smp_processor_id(), cpu_mask);
+	if (!cpus_empty(cpu_mask)) {
+		/* If broadcast tlbivax is supported, use it */
+		if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) {
+			int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL);
+			if (lock)
+				spin_lock(&tlbivax_lock);
+			_tlbivax_bcast(vmaddr, pid);
+			if (lock)
+				spin_unlock(&tlbivax_lock);
+			goto bail;
+		} else {
+			struct tlb_flush_param p = { .pid = pid, .addr = vmaddr };
+			smp_call_function_mask(cpu_mask,
+					       do_flush_tlb_page_ipi, &p, 1);
+		}
+	}
+	_tlbil_va(vmaddr, pid);
+ bail:
+	preempt_enable();
+}
+EXPORT_SYMBOL(flush_tlb_page);
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Flush kernel TLB entries in the given range
+ */
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+	/* Kick remote CPUs on SMP, then flush the local TLB exactly once */
+	preempt_disable();
+#ifdef CONFIG_SMP
+	smp_call_function(do_flush_tlb_mm_ipi, NULL, 1);
+#endif
+	_tlbil_pid(0);
+	preempt_enable();
+}
+EXPORT_SYMBOL(flush_tlb_kernel_range);
+
+/*
+ * Currently, for range flushing, we just do a full mm flush. This should
+ * be optimized based on a threshold on the size of the range, since
+ * some implementation can stack multiple tlbivax before a tlbsync but
+ * for now, we keep it that way
+ */
+void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+		     unsigned long end)
+
+{
+	flush_tlb_mm(vma->vm_mm);
+}
+EXPORT_SYMBOL(flush_tlb_range);
-- 
cgit v1.2.3


From 2a4aca1144394653269720ffbb5a325a77abd5fa Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 18 Dec 2008 19:13:42 +0000
Subject: powerpc/mm: Split low level tlb invalidate for nohash processors

Currently, the various forms of low level TLB invalidations are all
implemented in misc_32.S for 32-bit processors, in a fairly scary
mess of #ifdef's and with interesting duplication such as a whole
bunch of code for FSL _tlbie and _tlbia which are no longer used.

This moves things around such that _tlbie is now defined in
hash_low_32.S and is only used by the 32-bit hash code, and all
nohash CPUs use the various _tlbil_* forms that are now moved to
a new file, tlb_nohash_low.S.

I moved all the definitions for that stuff out of
include/asm/tlbflush.h as they are really internal mm stuff, into
mm/mmu_decl.h

The code should have no functional changes.  I kept some variants
inline for trivial forms on things like 40x and 8xx.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/tlbflush.h |  14 ---
 arch/powerpc/kernel/misc_32.S       | 233 ------------------------------------
 arch/powerpc/kvm/powerpc.c          |   2 +-
 arch/powerpc/mm/Makefile            |   3 +-
 arch/powerpc/mm/hash_low_32.S       |  76 ++++++++++++
 arch/powerpc/mm/mmu_decl.h          |  48 ++++++++
 arch/powerpc/mm/tlb_nohash_low.S    | 165 +++++++++++++++++++++++++
 7 files changed, 292 insertions(+), 249 deletions(-)
 create mode 100644 arch/powerpc/mm/tlb_nohash_low.S

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h
index 8c39b27c1ed7..abbe3419d1dd 100644
--- a/arch/powerpc/include/asm/tlbflush.h
+++ b/arch/powerpc/include/asm/tlbflush.h
@@ -33,17 +33,6 @@
 
 #define MMU_NO_CONTEXT      	((unsigned int)-1)
 
-extern void _tlbil_all(void);
-extern void _tlbil_pid(unsigned int pid);
-extern void _tlbil_va(unsigned long address, unsigned int pid);
-extern void _tlbivax_bcast(unsigned long address, unsigned int pid);
-
-#if defined(CONFIG_40x) || defined(CONFIG_8xx)
-#define _tlbia()	asm volatile ("tlbia; sync" : : : "memory")
-#else /* CONFIG_44x || CONFIG_FSL_BOOKE */
-extern void _tlbia(void);
-#endif
-
 extern void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 			    unsigned long end);
 extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
@@ -65,9 +54,6 @@ extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
 /*
  * TLB flushing for "classic" hash-MMU 32-bit CPUs, 6xx, 7xx, 7xxx
  */
-extern void _tlbie(unsigned long address);
-extern void _tlbia(void);
-
 extern void flush_tlb_mm(struct mm_struct *mm);
 extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
 extern void flush_tlb_page_nohash(struct vm_area_struct *vma, unsigned long addr);
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index 2c2ab89f0b64..ae0d084b6a24 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -272,239 +272,6 @@ _GLOBAL(real_writeb)
 
 #endif /* CONFIG_40x */
 
-/*
- * Flush MMU TLB
- */
-#ifndef CONFIG_FSL_BOOKE
-_GLOBAL(_tlbil_all)
-_GLOBAL(_tlbil_pid)
-#endif
-_GLOBAL(_tlbia)
-#if defined(CONFIG_40x)
-	sync			/* Flush to memory before changing mapping */
-	tlbia
-	isync			/* Flush shadow TLB */
-#elif defined(CONFIG_44x)
-	li	r3,0
-	sync
-
-	/* Load high watermark */
-	lis	r4,tlb_44x_hwater@ha
-	lwz	r5,tlb_44x_hwater@l(r4)
-
-1:	tlbwe	r3,r3,PPC44x_TLB_PAGEID
-	addi	r3,r3,1
-	cmpw	0,r3,r5
-	ble	1b
-
-	isync
-#elif defined(CONFIG_FSL_BOOKE)
-	/* Invalidate all entries in TLB0 */
-	li	r3, 0x04
-	tlbivax	0,3
-	/* Invalidate all entries in TLB1 */
-	li	r3, 0x0c
-	tlbivax	0,3
-	msync
-#ifdef CONFIG_SMP
-	tlbsync
-#endif /* CONFIG_SMP */
-#else /* !(CONFIG_40x || CONFIG_44x || CONFIG_FSL_BOOKE) */
-#if defined(CONFIG_SMP)
-	rlwinm	r8,r1,0,0,(31-THREAD_SHIFT)
-	lwz	r8,TI_CPU(r8)
-	oris	r8,r8,10
-	mfmsr	r10
-	SYNC
-	rlwinm	r0,r10,0,17,15		/* clear bit 16 (MSR_EE) */
-	rlwinm	r0,r0,0,28,26		/* clear DR */
-	mtmsr	r0
-	SYNC_601
-	isync
-	lis	r9,mmu_hash_lock@h
-	ori	r9,r9,mmu_hash_lock@l
-	tophys(r9,r9)
-10:	lwarx	r7,0,r9
-	cmpwi	0,r7,0
-	bne-	10b
-	stwcx.	r8,0,r9
-	bne-	10b
-	sync
-	tlbia
-	sync
-	TLBSYNC
-	li	r0,0
-	stw	r0,0(r9)		/* clear mmu_hash_lock */
-	mtmsr	r10
-	SYNC_601
-	isync
-#else /* CONFIG_SMP */
-	sync
-	tlbia
-	sync
-#endif /* CONFIG_SMP */
-#endif /* ! defined(CONFIG_40x) */
-	blr
-
-/*
- * Flush MMU TLB for a particular address
- */
-#ifndef CONFIG_FSL_BOOKE
-_GLOBAL(_tlbil_va)
-#endif
-_GLOBAL(_tlbie)
-#if defined(CONFIG_40x)
-	/* We run the search with interrupts disabled because we have to change
-	 * the PID and I don't want to preempt when that happens.
-	 */
-	mfmsr	r5
-	mfspr	r6,SPRN_PID
-	wrteei	0
-	mtspr	SPRN_PID,r4
-	tlbsx.	r3, 0, r3
-	mtspr	SPRN_PID,r6
-	wrtee	r5
-	bne	10f
-	sync
-	/* There are only 64 TLB entries, so r3 < 64, which means bit 25 is clear.
-	 * Since 25 is the V bit in the TLB_TAG, loading this value will invalidate
-	 * the TLB entry. */
-	tlbwe	r3, r3, TLB_TAG
-	isync
-10:
-
-#elif defined(CONFIG_44x)
-	mfspr	r5,SPRN_MMUCR
-	rlwimi	r5,r4,0,24,31			/* Set TID */
-
-	/* We have to run the search with interrupts disabled, even critical
-	 * and debug interrupts (in fact the only critical exceptions we have
-	 * are debug and machine check).  Otherwise  an interrupt which causes
-	 * a TLB miss can clobber the MMUCR between the mtspr and the tlbsx. */
-	mfmsr	r4
-	lis	r6,(MSR_EE|MSR_CE|MSR_ME|MSR_DE)@ha
-	addi	r6,r6,(MSR_EE|MSR_CE|MSR_ME|MSR_DE)@l
-	andc	r6,r4,r6
-	mtmsr	r6
-	mtspr	SPRN_MMUCR,r5
-	tlbsx.	r3, 0, r3
-	mtmsr	r4
-	bne	10f
-	sync
-	/* There are only 64 TLB entries, so r3 < 64,
-	 * which means bit 22, is clear.  Since 22 is
-	 * the V bit in the TLB_PAGEID, loading this
-	 * value will invalidate the TLB entry.
-	 */
-	tlbwe	r3, r3, PPC44x_TLB_PAGEID
-	isync
-10:
-#elif defined(CONFIG_FSL_BOOKE)
-	rlwinm	r4, r3, 0, 0, 19
-	ori	r5, r4, 0x08	/* TLBSEL = 1 */
-	tlbivax	0, r4
-	tlbivax	0, r5
-	msync
-#if defined(CONFIG_SMP)
-	tlbsync
-#endif /* CONFIG_SMP */
-#else /* !(CONFIG_40x || CONFIG_44x || CONFIG_FSL_BOOKE) */
-#if defined(CONFIG_SMP)
-	rlwinm	r8,r1,0,0,(31-THREAD_SHIFT)
-	lwz	r8,TI_CPU(r8)
-	oris	r8,r8,11
-	mfmsr	r10
-	SYNC
-	rlwinm	r0,r10,0,17,15		/* clear bit 16 (MSR_EE) */
-	rlwinm	r0,r0,0,28,26		/* clear DR */
-	mtmsr	r0
-	SYNC_601
-	isync
-	lis	r9,mmu_hash_lock@h
-	ori	r9,r9,mmu_hash_lock@l
-	tophys(r9,r9)
-10:	lwarx	r7,0,r9
-	cmpwi	0,r7,0
-	bne-	10b
-	stwcx.	r8,0,r9
-	bne-	10b
-	eieio
-	tlbie	r3
-	sync
-	TLBSYNC
-	li	r0,0
-	stw	r0,0(r9)		/* clear mmu_hash_lock */
-	mtmsr	r10
-	SYNC_601
-	isync
-#else /* CONFIG_SMP */
-	tlbie	r3
-	sync
-#endif /* CONFIG_SMP */
-#endif /* ! CONFIG_40x */
-	blr
-
-#if defined(CONFIG_FSL_BOOKE)
-/*
- * Flush MMU TLB, but only on the local processor (no broadcast)
- */
-_GLOBAL(_tlbil_all)
-#define MMUCSR0_TLBFI	(MMUCSR0_TLB0FI | MMUCSR0_TLB1FI | \
-			 MMUCSR0_TLB2FI | MMUCSR0_TLB3FI)
-	li	r3,(MMUCSR0_TLBFI)@l
-	mtspr	SPRN_MMUCSR0, r3
-1:
-	mfspr	r3,SPRN_MMUCSR0
-	andi.	r3,r3,MMUCSR0_TLBFI@l
-	bne	1b
-	blr
-
-/*
- * Flush MMU TLB for a particular process id, but only on the local processor
- * (no broadcast)
- */
-_GLOBAL(_tlbil_pid)
-/* we currently do an invalidate all since we don't have per pid invalidate */
-	li	r3,(MMUCSR0_TLBFI)@l
-	mtspr	SPRN_MMUCSR0, r3
-1:
-	mfspr	r3,SPRN_MMUCSR0
-	andi.	r3,r3,MMUCSR0_TLBFI@l
-	bne	1b
-	msync
-	isync
-	blr
-
-/*
- * Flush MMU TLB for a particular address, but only on the local processor
- * (no broadcast)
- */
-_GLOBAL(_tlbil_va)
-	mfmsr	r10
-	wrteei	0
-	slwi	r4,r4,16
-	mtspr	SPRN_MAS6,r4		/* assume AS=0 for now */
-	tlbsx	0,r3
-	mfspr	r4,SPRN_MAS1		/* check valid */
-	andis.	r3,r4,MAS1_VALID@h
-	beq	1f
-	rlwinm	r4,r4,0,1,31
-	mtspr	SPRN_MAS1,r4
-	tlbwe
-	msync
-	isync
-1:	wrtee	r10
-	blr
-#endif /* CONFIG_FSL_BOOKE */
-
-/*
- * Nobody implements this yet
- */
-_GLOBAL(_tlbivax_bcast)
-1:	trap
-	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0;
-	blr
-
 
 /*
  * Flush instruction cache.
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index fda9baada132..eb955d755c9a 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -330,7 +330,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	/* XXX It would be nice to differentiate between heavyweight exit and
 	 * sched_out here, since we could avoid the TLB flush for heavyweight
 	 * exits. */
-	_tlbia();
+	_tlbil_all();
 }
 
 int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index af987df8d5a3..953cc4a1cde5 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -9,7 +9,8 @@ endif
 obj-y				:= fault.o mem.o pgtable.o \
 				   init_$(CONFIG_WORD_SIZE).o \
 				   pgtable_$(CONFIG_WORD_SIZE).o
-obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o
+obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
+				   tlb_nohash_low.o
 hash-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
 obj-$(CONFIG_PPC64)		+= hash_utils_64.o \
 				   slb_low.o slb.o stab.o \
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index c5536b8b37a9..c8eac22a8f00 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -633,3 +633,79 @@ _GLOBAL(flush_hash_patch_B)
 	SYNC_601
 	isync
 	blr
+
+/*
+ * Flush an entry from the TLB
+ */
+_GLOBAL(_tlbie)
+#ifdef CONFIG_SMP
+	rlwinm	r8,r1,0,0,(31-THREAD_SHIFT)
+	lwz	r8,TI_CPU(r8)
+	oris	r8,r8,11
+	mfmsr	r10
+	SYNC
+	rlwinm	r0,r10,0,17,15		/* clear bit 16 (MSR_EE) */
+	rlwinm	r0,r0,0,28,26		/* clear DR */
+	mtmsr	r0
+	SYNC_601
+	isync
+	lis	r9,mmu_hash_lock@h
+	ori	r9,r9,mmu_hash_lock@l
+	tophys(r9,r9)
+10:	lwarx	r7,0,r9
+	cmpwi	0,r7,0
+	bne-	10b
+	stwcx.	r8,0,r9
+	bne-	10b
+	eieio
+	tlbie	r3
+	sync
+	TLBSYNC
+	li	r0,0
+	stw	r0,0(r9)		/* clear mmu_hash_lock */
+	mtmsr	r10
+	SYNC_601
+	isync
+#else /* CONFIG_SMP */
+	tlbie	r3
+	sync
+#endif /* CONFIG_SMP */
+	blr
+
+/* Flush the entire TLB. 603/603e only; must end in blr like _tlbie above.
+ */
+_GLOBAL(_tlbia)
+#if defined(CONFIG_SMP)
+	rlwinm	r8,r1,0,0,(31-THREAD_SHIFT)
+	lwz	r8,TI_CPU(r8)
+	oris	r8,r8,10
+	mfmsr	r10
+	SYNC
+	rlwinm	r0,r10,0,17,15		/* clear bit 16 (MSR_EE) */
+	rlwinm	r0,r0,0,28,26		/* clear DR */
+	mtmsr	r0
+	SYNC_601
+	isync
+	lis	r9,mmu_hash_lock@h
+	ori	r9,r9,mmu_hash_lock@l
+	tophys(r9,r9)
+10:	lwarx	r7,0,r9
+	cmpwi	0,r7,0
+	bne-	10b
+	stwcx.	r8,0,r9
+	bne-	10b
+	sync
+	tlbia
+	sync
+	TLBSYNC
+	li	r0,0
+	stw	r0,0(r9)		/* clear mmu_hash_lock */
+	mtmsr	r10
+	SYNC_601
+	isync
+#else /* CONFIG_SMP */
+	sync
+	tlbia
+	sync
+#endif /* CONFIG_SMP */
+	blr
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index b4344fd30f2a..4314b39b6faf 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -22,10 +22,58 @@
 #include <asm/tlbflush.h>
 #include <asm/mmu.h>
 
+#ifdef CONFIG_PPC_MMU_NOHASH
+
+/*
+ * On 40x and 8xx, we directly inline tlbia and tlbivax
+ */
+#if defined(CONFIG_40x) || defined(CONFIG_8xx)
+static inline void _tlbil_all(void)
+{
+	asm volatile ("sync; tlbia; isync" : : : "memory");
+}
+static inline void _tlbil_pid(unsigned int pid)
+{
+	asm volatile ("sync; tlbia; isync" : : : "memory");	/* no per-PID form: flush all */
+}
+#else /* CONFIG_40x || CONFIG_8xx */
+extern void _tlbil_all(void);
+extern void _tlbil_pid(unsigned int pid);
+#endif /* !(CONFIG_40x || CONFIG_8xx) */
+
+/*
+ * On 8xx, we directly inline tlbie, on others, it's extern
+ */
+#ifdef CONFIG_8xx
+static inline void _tlbil_va(unsigned long address, unsigned int pid)
+{
+	asm volatile ("tlbie %0; sync" : : "r" (address) : "memory");	/* pid unused here */
+}
+#else /* CONFIG_8xx */
+extern void _tlbil_va(unsigned long address, unsigned int pid);
+#endif /* CONFIG_8xx */
+
+/*
+ * As of today, we don't support tlbivax broadcast on any
+ * implementation. When that becomes the case, this will be
+ * an extern.
+ */
+static inline void _tlbivax_bcast(unsigned long address, unsigned int pid)
+{
+	BUG();
+}
+
+#else /* CONFIG_PPC_MMU_NOHASH */
+
 extern void hash_preload(struct mm_struct *mm, unsigned long ea,
 			 unsigned long access, unsigned long trap);
 
 
+extern void _tlbie(unsigned long address);
+extern void _tlbia(void);
+
+#endif /* CONFIG_PPC_MMU_NOHASH */
+
 #ifdef CONFIG_PPC32
 extern void mapin_ram(void);
 extern int map_page(unsigned long va, phys_addr_t pa, int flags);
diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S
new file mode 100644
index 000000000000..763c59fe0076
--- /dev/null
+++ b/arch/powerpc/mm/tlb_nohash_low.S
@@ -0,0 +1,165 @@
+/*
+ * This file contains low-level functions for performing various
+ * types of TLB invalidations on various processors with no hash
+ * table.
+ *
+ * This file implements the following functions for all no-hash
+ * processors. Some aren't implemented for some variants. Some
+ * are inline in mmu_decl.h
+ *
+ *	- tlbil_va
+ *	- tlbil_pid
+ *	- tlbil_all
+ *	- tlbivax_bcast (not yet)
+ *
+ * Code mostly moved over from misc_32.S
+ *
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Partially rewritten by Cort Dougan (cort@cs.nmt.edu)
+ * Paul Mackerras, Kumar Gala and Benjamin Herrenschmidt.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/mmu.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor.h>
+
+#if defined(CONFIG_40x)
+
+/*
+ * 40x implementation needs only tlbil_va
+ */
+_GLOBAL(_tlbil_va)
+	/* We run the search with interrupts disabled because we have to change
+	 * the PID and I don't want to preempt when that happens.
+	 */
+	mfmsr	r5
+	mfspr	r6,SPRN_PID
+	wrteei	0
+	mtspr	SPRN_PID,r4
+	tlbsx.	r3, 0, r3
+	mtspr	SPRN_PID,r6
+	wrtee	r5
+	bne	1f
+	sync
+	/* There are only 64 TLB entries, so r3 < 64, which means bit 25 is
+	 * clear. Since 25 is the V bit in the TLB_TAG, loading this value
+	 * will invalidate the TLB entry. */
+	tlbwe	r3, r3, TLB_TAG
+	isync
+1:	blr
+
+#elif defined(CONFIG_8xx)
+
+/*
+ * Nothing to do for 8xx, everything is inline
+ */
+
+#elif defined(CONFIG_44x)
+
+/*
+ * 440 implementation uses tlbsx/we for tlbil_va and a full sweep
+ * of the TLB for everything else.
+ */
+_GLOBAL(_tlbil_va)
+	mfspr	r5,SPRN_MMUCR
+	rlwimi	r5,r4,0,24,31			/* Set TID */
+
+	/* We have to run the search with interrupts disabled, even critical
+	 * and debug interrupts (in fact the only critical exceptions we have
+	 * are debug and machine check).  Otherwise  an interrupt which causes
+	 * a TLB miss can clobber the MMUCR between the mtspr and the tlbsx. */
+	mfmsr	r4
+	lis	r6,(MSR_EE|MSR_CE|MSR_ME|MSR_DE)@ha
+	addi	r6,r6,(MSR_EE|MSR_CE|MSR_ME|MSR_DE)@l
+	andc	r6,r4,r6
+	mtmsr	r6
+	mtspr	SPRN_MMUCR,r5
+	tlbsx.	r3, 0, r3
+	mtmsr	r4
+	bne	1f
+	sync
+	/* There are only 64 TLB entries, so r3 < 64,
+	 * which means bit 22, is clear.  Since 22 is
+	 * the V bit in the TLB_PAGEID, loading this
+	 * value will invalidate the TLB entry.
+	 */
+	tlbwe	r3, r3, PPC44x_TLB_PAGEID
+	isync
+1:	blr
+
+_GLOBAL(_tlbil_all)
+_GLOBAL(_tlbil_pid)
+	li	r3,0
+	sync
+
+	/* Load high watermark */
+	lis	r4,tlb_44x_hwater@ha
+	lwz	r5,tlb_44x_hwater@l(r4)
+
+1:	tlbwe	r3,r3,PPC44x_TLB_PAGEID
+	addi	r3,r3,1
+	cmpw	0,r3,r5
+	ble	1b
+
+	isync
+	blr
+
+#elif defined(CONFIG_FSL_BOOKE)
+/*
+ * FSL BookE implementations. Currently _pid and _all are the
+ * same. This will change when tlbilx is actually supported and
+ * performs invalidate-by-PID. This change will be driven by
+ * mmu_features conditional
+ */
+
+/*
+ * Flush MMU TLB on the local processor
+ */
+_GLOBAL(_tlbil_pid)
+_GLOBAL(_tlbil_all)
+#define MMUCSR0_TLBFI	(MMUCSR0_TLB0FI | MMUCSR0_TLB1FI | \
+			 MMUCSR0_TLB2FI | MMUCSR0_TLB3FI)
+	li	r3,(MMUCSR0_TLBFI)@l
+	mtspr	SPRN_MMUCSR0, r3
+1:
+	mfspr	r3,SPRN_MMUCSR0
+	andi.	r3,r3,MMUCSR0_TLBFI@l
+	bne	1b
+	msync
+	isync
+	blr
+
+/*
+ * Flush MMU TLB for a particular address, but only on the local processor
+ * (no broadcast)
+ */
+_GLOBAL(_tlbil_va)
+	mfmsr	r10
+	wrteei	0
+	slwi	r4,r4,16
+	mtspr	SPRN_MAS6,r4		/* assume AS=0 for now */
+	tlbsx	0,r3
+	mfspr	r4,SPRN_MAS1		/* check valid */
+	andis.	r3,r4,MAS1_VALID@h
+	beq	1f
+	rlwinm	r4,r4,0,1,31
+	mtspr	SPRN_MAS1,r4
+	tlbwe
+	msync
+	isync
+1:	wrtee	r10
+	blr
+#else
+#error Unsupported processor type !
+#endif
-- 
cgit v1.2.3


From 760ec0e02d8a13d0ed60d99f47879d4aa8ef1910 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 18 Dec 2008 19:13:46 +0000
Subject: powerpc/44x: No need to mask MSR:CE, ME or DE in _tlbil_va on 440

The handlers for Critical, Machine Check or Debug interrupts
will save and restore MMUCR nowadays, thus we only need to
disable normal interrupts when invalidating TLB entries.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Acked-by: Josh Boyer <jwboyer@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/tlb_nohash_low.S | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S
index 763c59fe0076..f900a39e6ec4 100644
--- a/arch/powerpc/mm/tlb_nohash_low.S
+++ b/arch/powerpc/mm/tlb_nohash_low.S
@@ -75,18 +75,19 @@ _GLOBAL(_tlbil_va)
 	mfspr	r5,SPRN_MMUCR
 	rlwimi	r5,r4,0,24,31			/* Set TID */
 
-	/* We have to run the search with interrupts disabled, even critical
-	 * and debug interrupts (in fact the only critical exceptions we have
-	 * are debug and machine check).  Otherwise  an interrupt which causes
-	 * a TLB miss can clobber the MMUCR between the mtspr and the tlbsx. */
+	/* We have to run the search with interrupts disabled, otherwise
+	 * an interrupt which causes a TLB miss can clobber the MMUCR
+	 * between the mtspr and the tlbsx.
+	 *
+	 * Critical and Machine Check interrupts take care of saving
+	 * and restoring MMUCR, so only normal interrupts have to be
+	 * taken care of.
+	 */
 	mfmsr	r4
-	lis	r6,(MSR_EE|MSR_CE|MSR_ME|MSR_DE)@ha
-	addi	r6,r6,(MSR_EE|MSR_CE|MSR_ME|MSR_DE)@l
-	andc	r6,r4,r6
-	mtmsr	r6
+	wrteei	0
 	mtspr	SPRN_MMUCR,r5
 	tlbsx.	r3, 0, r3
-	mtmsr	r4
+	wrtee	r4
 	bne	1f
 	sync
 	/* There are only 64 TLB entries, so r3 < 64,
-- 
cgit v1.2.3


From 77520351805cc19ba37394ae33f862ef6d3c2a23 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 18 Dec 2008 19:13:48 +0000
Subject: powerpc/mm: Runtime allocation of mmu context maps for nohash CPUs

This makes the MMU context code used for CPUs with no hash table
(except 603) dynamically allocate the various maps used to track
the state of contexts.

Only the main free map and CPU 0 stale map are allocated at boot
time.  Other CPU maps are allocated when those CPUs are brought up
and freed if they are unplugged.

This also moves the initialization of the MMU context management
slightly later during the boot process, which should be fine as
it's really only needed when userland is first started anyway.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/setup_32.c       |   5 ++
 arch/powerpc/mm/init_32.c            |   4 -
 arch/powerpc/mm/mmu_context_nohash.c | 161 ++++++++++++++++++++++++-----------
 3 files changed, 116 insertions(+), 54 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index b14c2a3e2185..d72ef39f2b37 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -38,6 +38,7 @@
 #include <asm/time.h>
 #include <asm/serial.h>
 #include <asm/udbg.h>
+#include <asm/mmu_context.h>
 
 #include "setup.h"
 
@@ -330,4 +331,8 @@ void __init setup_arch(char **cmdline_p)
 	if ( ppc_md.progress ) ppc_md.progress("arch: exit", 0x3eab);
 
 	paging_init();
+
+	/* Initialize the MMU context management stuff */
+	mmu_context_init();
+
 }
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 388ceda632f3..578294c3b1ce 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -35,7 +35,6 @@
 #include <asm/pgalloc.h>
 #include <asm/prom.h>
 #include <asm/io.h>
-#include <asm/mmu_context.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
 #include <asm/smp.h>
@@ -180,9 +179,6 @@ void __init MMU_init(void)
 	if (ppc_md.progress)
 		ppc_md.progress("MMU:setio", 0x302);
 
-	/* Initialize the context management stuff */
-	mmu_context_init();
-
 	if (ppc_md.progress)
 		ppc_md.progress("MMU:exit", 0x211);
 
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index 8b5de52de0ad..52a0cfc38b64 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -28,54 +28,30 @@
 #undef DEBUG
 #define DEBUG_STEAL_ONLY
 #undef DEBUG_MAP_CONSISTENCY
+/*#define DEBUG_CLAMP_LAST_CONTEXT   15 */
 
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/bootmem.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
 
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
-#include <linux/spinlock.h>
-
-/*
- *   The MPC8xx has only 16 contexts.  We rotate through them on each
- * task switch.  A better way would be to keep track of tasks that
- * own contexts, and implement an LRU usage.  That way very active
- * tasks don't always have to pay the TLB reload overhead.  The
- * kernel pages are mapped shared, so the kernel can run on behalf
- * of any task that makes a kernel entry.  Shared does not mean they
- * are not protected, just that the ASID comparison is not performed.
- *      -- Dan
- *
- * The IBM4xx has 256 contexts, so we can just rotate through these
- * as a way of "switching" contexts.  If the TID of the TLB is zero,
- * the PID/TID comparison is disabled, so we can use a TID of zero
- * to represent all kernel pages as shared among all contexts.
- * 	-- Dan
- */
-
-#ifdef CONFIG_8xx
-#define LAST_CONTEXT    	15
-#define FIRST_CONTEXT    	0
-
-#elif defined(CONFIG_4xx)
-#define LAST_CONTEXT    	255
-#define FIRST_CONTEXT    	1
-
-#elif defined(CONFIG_E200) || defined(CONFIG_E500)
-#define LAST_CONTEXT    	255
-#define FIRST_CONTEXT    	1
-
-#else
-#error Unsupported processor type
-#endif
 
+static unsigned int first_context, last_context;
 static unsigned int next_context, nr_free_contexts;
-static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
-static unsigned long stale_map[NR_CPUS][LAST_CONTEXT / BITS_PER_LONG + 1];
-static struct mm_struct *context_mm[LAST_CONTEXT+1];
+static unsigned long *context_map;
+static unsigned long *stale_map[NR_CPUS];
+static struct mm_struct **context_mm;
 static spinlock_t context_lock = SPIN_LOCK_UNLOCKED;
 
+#define CTX_MAP_SIZE	\
+	(sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1))
+
+
 /* Steal a context from a task that has one at the moment.
  *
  * This is used when we are running out of available PID numbers
@@ -98,7 +74,7 @@ static unsigned int steal_context_smp(unsigned int id)
 	unsigned int cpu, max;
 
  again:
-	max = LAST_CONTEXT - FIRST_CONTEXT;
+	max = last_context - first_context;
 
 	/* Attempt to free next_context first and then loop until we manage */
 	while (max--) {
@@ -110,8 +86,8 @@ static unsigned int steal_context_smp(unsigned int id)
 		 */
 		if (mm->context.active) {
 			id++;
-			if (id > LAST_CONTEXT)
-				id = FIRST_CONTEXT;
+			if (id > last_context)
+				id = first_context;
 			continue;
 		}
 		pr_debug("[%d] steal context %d from mm @%p\n",
@@ -169,7 +145,7 @@ static void context_check_map(void)
 	unsigned int id, nrf, nact;
 
 	nrf = nact = 0;
-	for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) {
+	for (id = first_context; id <= last_context; id++) {
 		int used = test_bit(id, context_map);
 		if (!used)
 			nrf++;
@@ -187,6 +163,8 @@ static void context_check_map(void)
 	if (nact > num_online_cpus())
 		pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n",
 		       nact, num_online_cpus());
+	if (first_context > 0 && !test_bit(0, context_map))
+		pr_err("MMU: Context 0 has been freed !!!\n");
 }
 #else
 static void context_check_map(void) { }
@@ -209,6 +187,10 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 	/* Mark us active and the previous one not anymore */
 	next->context.active++;
 	if (prev) {
+#ifndef DEBUG_STEAL_ONLY
+		pr_debug(" old context %p active was: %d\n",
+			 prev, prev->context.active);
+#endif
 		WARN_ON(prev->context.active < 1);
 		prev->context.active--;
 	}
@@ -221,8 +203,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 
 	/* We really don't have a context, let's try to acquire one */
 	id = next_context;
-	if (id > LAST_CONTEXT)
-		id = FIRST_CONTEXT;
+	if (id > last_context)
+		id = first_context;
 	map = context_map;
 
 	/* No more free contexts, let's try to steal one */
@@ -240,9 +222,9 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 
 	/* We know there's at least one free context, try to find it */
 	while (__test_and_set_bit(id, map)) {
-		id = find_next_zero_bit(map, LAST_CONTEXT+1, id);
-		if (id > LAST_CONTEXT)
-			id = FIRST_CONTEXT;
+		id = find_next_zero_bit(map, last_context+1, id);
+		if (id > last_context)
+			id = first_context;
 	}
  stolen:
 	next_context = id + 1;
@@ -311,6 +293,42 @@ void destroy_context(struct mm_struct *mm)
 	spin_unlock(&context_lock);
 }
 
+#ifdef CONFIG_SMP
+
+static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
+					    unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned int)(long)hcpu;
+
+	/* We don't touch CPU 0 map, it's allocated at boot and kept
+	 * around forever
+	 */
+	if (cpu == 0)
+		return NOTIFY_OK;
+
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		pr_debug("MMU: Allocating stale context map for CPU %d\n", cpu);
+		stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL);
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		pr_debug("MMU: Freeing stale context map for CPU %d\n", cpu);
+		kfree(stale_map[cpu]);
+		stale_map[cpu] = NULL;
+		break;
+#endif
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata mmu_context_cpu_nb = {
+	.notifier_call	= mmu_context_cpu_notify,
+};
+
+#endif /* CONFIG_SMP */
 
 /*
  * Initialize the context management stuff.
@@ -323,14 +341,57 @@ void __init mmu_context_init(void)
 	 */
 	init_mm.context.active = NR_CPUS;
 
+	/*
+	 *   The MPC8xx has only 16 contexts.  We rotate through them on each
+	 * task switch.  A better way would be to keep track of tasks that
+	 * own contexts, and implement an LRU usage.  That way very active
+	 * tasks don't always have to pay the TLB reload overhead.  The
+	 * kernel pages are mapped shared, so the kernel can run on behalf
+	 * of any task that makes a kernel entry.  Shared does not mean they
+	 * are not protected, just that the ASID comparison is not performed.
+	 *      -- Dan
+	 *
+	 * The IBM4xx has 256 contexts, so we can just rotate through these
+	 * as a way of "switching" contexts.  If the TID of the TLB is zero,
+	 * the PID/TID comparison is disabled, so we can use a TID of zero
+	 * to represent all kernel pages as shared among all contexts.
+	 * 	-- Dan
+	 */
+	if (mmu_has_feature(MMU_FTR_TYPE_8xx)) {
+		first_context = 0;
+		last_context = 15;
+	} else {
+		first_context = 1;
+		last_context = 255;
+	}
+
+#ifdef DEBUG_CLAMP_LAST_CONTEXT
+	last_context = DEBUG_CLAMP_LAST_CONTEXT;
+#endif
+	/*
+	 * Allocate the maps used by context management
+	 */
+	context_map = alloc_bootmem(CTX_MAP_SIZE);
+	context_mm = alloc_bootmem(sizeof(void *) * (last_context + 1));
+	stale_map[0] = alloc_bootmem(CTX_MAP_SIZE);
+
+#ifdef CONFIG_SMP
+	register_cpu_notifier(&mmu_context_cpu_nb);
+#endif
+
+	printk(KERN_INFO
+	       "MMU: Allocated %d bytes of context maps for %d contexts\n",
+	       2 * CTX_MAP_SIZE + (sizeof(void *) * (last_context + 1)),
+	       last_context - first_context + 1);
+
 	/*
 	 * Some processors have too few contexts to reserve one for
 	 * init_mm, and require using context 0 for a normal task.
 	 * Other processors reserve the use of context zero for the kernel.
-	 * This code assumes FIRST_CONTEXT < 32.
+	 * This code assumes first_context < 32.
 	 */
-	context_map[0] = (1 << FIRST_CONTEXT) - 1;
-	next_context = FIRST_CONTEXT;
-	nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT + 1;
+	context_map[0] = (1 << first_context) - 1;
+	next_context = first_context;
+	nr_free_contexts = last_context - first_context + 1;
 }
 
-- 
cgit v1.2.3


From 64b3d0e8122b422e879b23d42f9e0e8efbbf9744 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 18 Dec 2008 19:13:51 +0000
Subject: powerpc/mm: Rework usage of _PAGE_COHERENT/NO_CACHE/GUARDED

Currently, we never set _PAGE_COHERENT in the PTEs, we just OR it in
in the hash code based on some CPU feature bit.  We also manipulate
_PAGE_NO_CACHE and _PAGE_GUARDED by hand in all sorts of places.

This changes the logic so that instead, the PTE now contains
_PAGE_COHERENT for all normal RAM pages that have I = 0 on platforms
that need it.  The hash code clears it if the feature bit is not set.

It also adds some clean accessors to setup various valid combinations
of access flags and change various bits of code to use them instead.

This should help having the PTE actually containing the bit
combinations that we really want.

I also removed _PAGE_GUARDED from _PAGE_BASE on 44x and instead
set it explicitly from the TLB miss.  I will ultimately remove it
completely as it appears that it might not be needed after all
but in the meantime, having it in the TLB miss makes things a
lot easier.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/pgtable-ppc32.h | 42 +++++++++++++-------------------
 arch/powerpc/include/asm/pgtable-ppc64.h | 13 ----------
 arch/powerpc/include/asm/pgtable.h       | 26 ++++++++++++++++++++
 arch/powerpc/kernel/head_44x.S           |  1 +
 arch/powerpc/kernel/pci-common.c         | 24 +++++++-----------
 arch/powerpc/mm/hash_low_32.S            |  4 +--
 arch/powerpc/mm/mem.c                    |  4 +--
 arch/powerpc/platforms/cell/spufs/file.c | 27 +++++++-------------
 drivers/video/controlfb.c                |  4 +--
 9 files changed, 68 insertions(+), 77 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h
index 6ab7c67cb5ab..f69a4d977729 100644
--- a/arch/powerpc/include/asm/pgtable-ppc32.h
+++ b/arch/powerpc/include/asm/pgtable-ppc32.h
@@ -228,9 +228,10 @@ extern int icache_44x_need_flush;
  *   - FILE *must* be in the bottom three bits because swap cache
  *     entries use the top 29 bits for TLB2.
  *
- *   - CACHE COHERENT bit (M) has no effect on PPC440 core, because it
- *     doesn't support SMP. So we can use this as software bit, like
- *     DIRTY.
+ *   - CACHE COHERENT bit (M) has no effect on original PPC440 cores,
+ *     because it doesn't support SMP. However, some later 460 variants
+ *     have -some- form of SMP support and so I keep the bit there for
+ *     future use
  *
  * With the PPC 44x Linux implementation, the 0-11th LSBs of the PTE are used
  * for memory protection related functions (see PTE structure in
@@ -436,20 +437,23 @@ extern int icache_44x_need_flush;
 			 _PAGE_USER | _PAGE_ACCESSED | \
 			 _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | \
 			 _PAGE_EXEC | _PAGE_HWEXEC)
+
 /*
- * Note: the _PAGE_COHERENT bit automatically gets set in the hardware
- * PTE if CONFIG_SMP is defined (hash_page does this); there is no need
- * to have it in the Linux PTE, and in fact the bit could be reused for
- * another purpose.  -- paulus.
+ * We define 2 sets of base prot bits, one for basic pages (ie,
+ * cacheable kernel and user pages) and one for non cacheable
+ * pages. We always set _PAGE_COHERENT when SMP is enabled or
+ * the processor might need it for DMA coherency.
  */
-
-#ifdef CONFIG_44x
-#define _PAGE_BASE	(_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_GUARDED)
+#if defined(CONFIG_SMP) || defined(CONFIG_PPC_STD_MMU)
+#define _PAGE_BASE	(_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_COHERENT)
 #else
 #define _PAGE_BASE	(_PAGE_PRESENT | _PAGE_ACCESSED)
 #endif
+#define _PAGE_BASE_NC	(_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_NO_CACHE)
+
 #define _PAGE_WRENABLE	(_PAGE_RW | _PAGE_DIRTY | _PAGE_HWWRITE)
 #define _PAGE_KERNEL	(_PAGE_BASE | _PAGE_SHARED | _PAGE_WRENABLE)
+#define _PAGE_KERNEL_NC	(_PAGE_BASE_NC | _PAGE_SHARED | _PAGE_WRENABLE)
 
 #ifdef CONFIG_PPC_STD_MMU
 /* On standard PPC MMU, no user access implies kernel read/write access,
@@ -459,7 +463,7 @@ extern int icache_44x_need_flush;
 #define _PAGE_KERNEL_RO	(_PAGE_BASE | _PAGE_SHARED)
 #endif
 
-#define _PAGE_IO	(_PAGE_KERNEL | _PAGE_NO_CACHE | _PAGE_GUARDED)
+#define _PAGE_IO	(_PAGE_KERNEL_NC | _PAGE_GUARDED)
 #define _PAGE_RAM	(_PAGE_KERNEL | _PAGE_HWEXEC)
 
 #if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) ||\
@@ -552,9 +556,6 @@ static inline int pte_young(pte_t pte)		{ return pte_val(pte) & _PAGE_ACCESSED;
 static inline int pte_file(pte_t pte)		{ return pte_val(pte) & _PAGE_FILE; }
 static inline int pte_special(pte_t pte)	{ return pte_val(pte) & _PAGE_SPECIAL; }
 
-static inline void pte_uncache(pte_t pte)       { pte_val(pte) |= _PAGE_NO_CACHE; }
-static inline void pte_cache(pte_t pte)         { pte_val(pte) &= ~_PAGE_NO_CACHE; }
-
 static inline pte_t pte_wrprotect(pte_t pte) {
 	pte_val(pte) &= ~(_PAGE_RW | _PAGE_HWWRITE); return pte; }
 static inline pte_t pte_mkclean(pte_t pte) {
@@ -693,10 +694,11 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
 #endif
 }
 
+
 static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep, pte_t pte)
 {
-#if defined(CONFIG_PTE_64BIT) && defined(CONFIG_SMP)
+#if defined(CONFIG_PTE_64BIT) && defined(CONFIG_SMP) && defined(CONFIG_DEBUG_VM)
 	WARN_ON(pte_present(*ptep));
 #endif
 	__set_pte_at(mm, addr, ptep, pte);
@@ -760,16 +762,6 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry, int dirty)
 	__changed;							   \
 })
 
-/*
- * Macro to mark a page protection value as "uncacheable".
- */
-#define pgprot_noncached(prot)	(__pgprot(pgprot_val(prot) | _PAGE_NO_CACHE | _PAGE_GUARDED))
-
-struct file;
-extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
-				     unsigned long size, pgprot_t vma_prot);
-#define __HAVE_PHYS_MEM_ACCESS_PROT
-
 #define __HAVE_ARCH_PTE_SAME
 #define pte_same(A,B)	(((pte_val(A) ^ pte_val(B)) & ~_PAGE_HASHPTE) == 0)
 
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 1f0a330f03f4..b0f18be81d9f 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -245,9 +245,6 @@ static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED;}
 static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE;}
 static inline int pte_special(pte_t pte) { return pte_val(pte) & _PAGE_SPECIAL; }
 
-static inline void pte_uncache(pte_t pte) { pte_val(pte) |= _PAGE_NO_CACHE; }
-static inline void pte_cache(pte_t pte)   { pte_val(pte) &= ~_PAGE_NO_CACHE; }
-
 static inline pte_t pte_wrprotect(pte_t pte) {
 	pte_val(pte) &= ~(_PAGE_RW); return pte; }
 static inline pte_t pte_mkclean(pte_t pte) {
@@ -405,16 +402,6 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry, int dirty)
 	__changed;							   \
 })
 
-/*
- * Macro to mark a page protection value as "uncacheable".
- */
-#define pgprot_noncached(prot)	(__pgprot(pgprot_val(prot) | _PAGE_NO_CACHE | _PAGE_GUARDED))
-
-struct file;
-extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
-				     unsigned long size, pgprot_t vma_prot);
-#define __HAVE_PHYS_MEM_ACCESS_PROT
-
 #define __HAVE_ARCH_PTE_SAME
 #define pte_same(A,B)	(((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0)
 
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index dbb8ca172e44..07f55e601696 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -16,6 +16,32 @@ struct mm_struct;
 #endif
 
 #ifndef __ASSEMBLY__
+
+/*
+ * Macro to mark a page protection value as "uncacheable".
+ */
+
+#define _PAGE_CACHE_CTL	(_PAGE_COHERENT | _PAGE_GUARDED | _PAGE_NO_CACHE | \
+			 _PAGE_WRITETHRU)
+
+#define pgprot_noncached(prot)	  (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
+				            _PAGE_NO_CACHE | _PAGE_GUARDED))
+
+#define pgprot_noncached_wc(prot) (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
+				            _PAGE_NO_CACHE))
+
+#define pgprot_cached(prot)       (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
+				            _PAGE_COHERENT))
+
+#define pgprot_cached_wthru(prot) (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \
+				            _PAGE_COHERENT | _PAGE_WRITETHRU))
+
+
+struct file;
+extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+				     unsigned long size, pgprot_t vma_prot);
+#define __HAVE_PHYS_MEM_ACCESS_PROT
+
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
  * for zero-mapped memory areas etc..
diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S
index f3a1ea9d7fe4..26237357a88c 100644
--- a/arch/powerpc/kernel/head_44x.S
+++ b/arch/powerpc/kernel/head_44x.S
@@ -570,6 +570,7 @@ finish_tlb_load:
 	rlwimi	r10,r12,29,30,30		/* DIRTY -> SW position */
 	and	r11,r12,r10			/* Mask PTE bits to keep */
 	andi.	r10,r12,_PAGE_USER		/* User page ? */
+	ori	r11,r11,_PAGE_GUARDED		/* 440 errata, needs G set */
 	beq	1f				/* nope, leave U bits empty */
 	rlwimi	r11,r11,3,26,28			/* yes, copy S bits to U */
 1:	tlbwe	r11,r13,PPC44x_TLB_ATTRIB	/* Write ATTRIB */
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 1a32db331a5c..2538030954d8 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -370,13 +370,10 @@ static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp,
 	}
 
 	/* XXX would be nice to have a way to ask for write-through */
-	prot |= _PAGE_NO_CACHE;
 	if (write_combine)
-		prot &= ~_PAGE_GUARDED;
+		return pgprot_noncached_wc(prot);
 	else
-		prot |= _PAGE_GUARDED;
-
-	return __pgprot(prot);
+		return pgprot_noncached(prot);
 }
 
 /*
@@ -387,19 +384,17 @@ static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp,
 pgprot_t pci_phys_mem_access_prot(struct file *file,
 				  unsigned long pfn,
 				  unsigned long size,
-				  pgprot_t protection)
+				  pgprot_t prot)
 {
 	struct pci_dev *pdev = NULL;
 	struct resource *found = NULL;
-	unsigned long prot = pgprot_val(protection);
 	resource_size_t offset = ((resource_size_t)pfn) << PAGE_SHIFT;
 	int i;
 
 	if (page_is_ram(pfn))
-		return __pgprot(prot);
-
-	prot |= _PAGE_NO_CACHE | _PAGE_GUARDED;
+		return prot;
 
+	prot = pgprot_noncached(prot);
 	for_each_pci_dev(pdev) {
 		for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
 			struct resource *rp = &pdev->resource[i];
@@ -420,14 +415,14 @@ pgprot_t pci_phys_mem_access_prot(struct file *file,
 	}
 	if (found) {
 		if (found->flags & IORESOURCE_PREFETCH)
-			prot &= ~_PAGE_GUARDED;
+			prot = pgprot_noncached_wc(prot);
 		pci_dev_put(pdev);
 	}
 
 	pr_debug("PCI: Non-PCI map for %llx, prot: %lx\n",
-		 (unsigned long long)offset, prot);
+		 (unsigned long long)offset, pgprot_val(prot));
 
-	return __pgprot(prot);
+	return prot;
 }
 
 
@@ -583,8 +578,7 @@ int pci_mmap_legacy_page_range(struct pci_bus *bus,
 	pr_debug(" -> mapping phys %llx\n", (unsigned long long)offset);
 
 	vma->vm_pgoff = offset >> PAGE_SHIFT;
-	vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
-				     | _PAGE_NO_CACHE | _PAGE_GUARDED);
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
 			       vma->vm_end - vma->vm_start,
 			       vma->vm_page_prot);
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index c8eac22a8f00..28845604a10c 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -323,8 +323,8 @@ _GLOBAL(create_hpte)
 	ori	r8,r8,0xe14		/* clear out reserved bits and M */
 	andc	r8,r5,r8		/* PP = user? (rw&dirty? 2: 3): 0 */
 BEGIN_FTR_SECTION
-	ori	r8,r8,_PAGE_COHERENT	/* set M (coherence required) */
-END_FTR_SECTION_IFSET(CPU_FTR_NEED_COHERENT)
+	rlwinm	r8,r8,0,~_PAGE_COHERENT	/* clear M (coherence not required) */
+END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
 #ifdef CONFIG_PTE_64BIT
 	/* Put the XPN bits into the PTE */
 	rlwimi	r8,r10,8,20,22
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 8fee696fb795..53b06ebb3f2f 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -102,8 +102,8 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 		return ppc_md.phys_mem_access_prot(file, pfn, size, vma_prot);
 
 	if (!page_is_ram(pfn))
-		vma_prot = __pgprot(pgprot_val(vma_prot)
-				    | _PAGE_GUARDED | _PAGE_NO_CACHE);
+		vma_prot = pgprot_noncached(vma_prot);
+
 	return vma_prot;
 }
 EXPORT_SYMBOL(phys_mem_access_prot);
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index 1b26071a86ca..7106b63d401b 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -273,12 +273,10 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		return VM_FAULT_NOPAGE;
 
 	if (ctx->state == SPU_STATE_SAVED) {
-		vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
-							& ~_PAGE_NO_CACHE);
+		vma->vm_page_prot = pgprot_cached(vma->vm_page_prot);
 		pfn = vmalloc_to_pfn(ctx->csa.lscsa->ls + offset);
 	} else {
-		vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
-					     | _PAGE_NO_CACHE);
+		vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
 		pfn = (ctx->spu->local_store_phys + offset) >> PAGE_SHIFT;
 	}
 	vm_insert_pfn(vma, address, pfn);
@@ -338,8 +336,7 @@ static int spufs_mem_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	vma->vm_flags |= VM_IO | VM_PFNMAP;
-	vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
-				     | _PAGE_NO_CACHE);
+	vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
 
 	vma->vm_ops = &spufs_mem_mmap_vmops;
 	return 0;
@@ -452,8 +449,7 @@ static int spufs_cntl_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	vma->vm_flags |= VM_IO | VM_PFNMAP;
-	vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
-				     | _PAGE_NO_CACHE | _PAGE_GUARDED);
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
 	vma->vm_ops = &spufs_cntl_mmap_vmops;
 	return 0;
@@ -1155,8 +1151,7 @@ static int spufs_signal1_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	vma->vm_flags |= VM_IO | VM_PFNMAP;
-	vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
-				     | _PAGE_NO_CACHE | _PAGE_GUARDED);
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
 	vma->vm_ops = &spufs_signal1_mmap_vmops;
 	return 0;
@@ -1292,8 +1287,7 @@ static int spufs_signal2_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	vma->vm_flags |= VM_IO | VM_PFNMAP;
-	vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
-				     | _PAGE_NO_CACHE | _PAGE_GUARDED);
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
 	vma->vm_ops = &spufs_signal2_mmap_vmops;
 	return 0;
@@ -1414,8 +1408,7 @@ static int spufs_mss_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	vma->vm_flags |= VM_IO | VM_PFNMAP;
-	vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
-				     | _PAGE_NO_CACHE | _PAGE_GUARDED);
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
 	vma->vm_ops = &spufs_mss_mmap_vmops;
 	return 0;
@@ -1476,8 +1469,7 @@ static int spufs_psmap_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	vma->vm_flags |= VM_IO | VM_PFNMAP;
-	vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
-				     | _PAGE_NO_CACHE | _PAGE_GUARDED);
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
 	vma->vm_ops = &spufs_psmap_mmap_vmops;
 	return 0;
@@ -1536,8 +1528,7 @@ static int spufs_mfc_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	vma->vm_flags |= VM_IO | VM_PFNMAP;
-	vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
-				     | _PAGE_NO_CACHE | _PAGE_GUARDED);
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
 	vma->vm_ops = &spufs_mfc_mmap_vmops;
 	return 0;
diff --git a/drivers/video/controlfb.c b/drivers/video/controlfb.c
index b0be7eac32d8..49fcbe8f18ac 100644
--- a/drivers/video/controlfb.c
+++ b/drivers/video/controlfb.c
@@ -298,10 +298,10 @@ static int controlfb_mmap(struct fb_info *info,
                        return -EINVAL;
                start = info->fix.mmio_start;
                len = PAGE_ALIGN((start & ~PAGE_MASK)+info->fix.mmio_len);
-               pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE|_PAGE_GUARDED;
+	       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
        } else {
                /* framebuffer */
-               pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU;
+	       vma->vm_page_prot = pgprot_cached_wthru(vma->vm_page_prot);
        }
        start &= PAGE_MASK;
        if ((vma->vm_end - vma->vm_start + off) > len)
-- 
cgit v1.2.3


From a14953597b771f793ce32529d7b8b04fdedca3ef Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sun, 21 Dec 2008 02:54:25 -0700
Subject: powerpc: Fix missing 'blr' in _tlbia()

Rework to MMU code dropped a much missed 'blr' instruction.

Brown-Paper-Bag-Worn-By: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 arch/powerpc/mm/hash_low_32.S | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index 28845604a10c..67850ec9feb3 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -709,3 +709,4 @@ _GLOBAL(_tlbia)
 	tlbia
 	sync
 #endif /* CONFIG_SMP */
+	blr
-- 
cgit v1.2.3


From 01695a9687e5a8d78589605037cc7828a5b67ac9 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Wed, 17 Dec 2008 10:09:10 +0000
Subject: powerpc/32: Allow __ioremap on RAM addresses for kdump kernel

While for debugging it is good to catch bogus users of ioremap, for
kdump support it is more convenient to use __ioremap for
copy_oldmem_page() (exactly as we do for PPC64 currently).

Note that copy_oldmem_page() calls __ioremap with flags set to '0',
so it should be safe with regard to the caches.

The other option is to use kmap_atomic_pfn()[1], but it will not work
for kernels compiled without HIGHMEM.

That is, on a board with 256MB RAM and crashkernel=64M@32M case, the
!HIGHMEM capturing kernel maps 0-96M range, which does not include all
the memory needed to capture the dump. And, obviously, accessing
anything above 96M will cause faults.

[1] http://ozlabs.org/pipermail/linuxppc-dev/2007-November/046747.html

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/pgtable_32.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 341472440137..cd5609759d44 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -173,6 +173,7 @@ __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
 	if (p < 16*1024*1024)
 		p += _ISA_MEM_BASE;
 
+#ifndef CONFIG_CRASH_DUMP
 	/*
 	 * Don't allow anybody to remap normal RAM that we're using.
 	 * mem_init() sets high_memory so only do the check after that.
@@ -182,6 +183,7 @@ __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
 		       (unsigned long long)p, __builtin_return_address(0));
 		return NULL;
 	}
+#endif
 
 	if (size == 0)
 		return NULL;
-- 
cgit v1.2.3


From ccdcef72c249c289898b164eada89a61855b9287 Mon Sep 17 00:00:00 2001
From: Dale Farnsworth <dale@farnsworth.org>
Date: Wed, 17 Dec 2008 10:09:13 +0000
Subject: powerpc/32: Add the ability for a classic ppc kernel to be loaded at
 32M

Add the ability for a classic ppc kernel to be loaded at an address
of 32MB.  This is done by fixing a few places that assume we are loaded
at address 0, and by changing several uses of KERNELBASE to use
PAGE_OFFSET, instead.

Signed-off-by: Dale Farnsworth <dale@farnsworth.org>
Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/ppc_asm.h |  4 ++--
 arch/powerpc/kernel/head_32.S      | 11 ++++++-----
 arch/powerpc/mm/init_32.c          |  2 +-
 arch/powerpc/mm/pgtable_32.c       |  4 ++--
 arch/powerpc/mm/ppc_mmu_32.c       |  8 ++++----
 5 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index c4a029ccb4d3..1a0d628eb114 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -425,14 +425,14 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601)
 #define fromreal(rd)	tovirt(rd,rd)
 
 #define tophys(rd,rs)				\
-0:	addis	rd,rs,-KERNELBASE@h;		\
+0:	addis	rd,rs,-PAGE_OFFSET@h;		\
 	.section ".vtop_fixup","aw";		\
 	.align  1;				\
 	.long   0b;				\
 	.previous
 
 #define tovirt(rd,rs)				\
-0:	addis	rd,rs,KERNELBASE@h;		\
+0:	addis	rd,rs,PAGE_OFFSET@h;		\
 	.section ".ptov_fixup","aw";		\
 	.align  1;				\
 	.long   0b;				\
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 266061924654..a1c4cfd25ded 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -183,7 +183,8 @@ __after_mmu_off:
 	bl	reloc_offset
 	mr	r26,r3
 	addis	r4,r3,KERNELBASE@h	/* current address of _start */
-	cmpwi	0,r4,0			/* are we already running at 0? */
+	lis	r5,PHYSICAL_START@h
+	cmplw	0,r4,r5			/* already running at PHYSICAL_START? */
 	bne	relocate_kernel
 /*
  * we now have the 1st 16M of ram mapped with the bats.
@@ -811,13 +812,13 @@ giveup_altivec:
 
 /*
  * This code is jumped to from the startup code to copy
- * the kernel image to physical address 0.
+ * the kernel image to physical address PHYSICAL_START.
  */
 relocate_kernel:
 	addis	r9,r26,klimit@ha	/* fetch klimit */
 	lwz	r25,klimit@l(r9)
 	addis	r25,r25,-KERNELBASE@h
-	li	r3,0			/* Destination base address */
+	lis	r3,PHYSICAL_START@h	/* Destination base address */
 	li	r6,0			/* Destination offset */
 	li	r5,0x4000		/* # bytes of memory to copy */
 	bl	copy_and_flush		/* copy the first 0x4000 bytes */
@@ -1188,11 +1189,11 @@ mmu_off:
 
 /*
  * Use the first pair of BAT registers to map the 1st 16MB
- * of RAM to KERNELBASE.  From this point on we can't safely
+ * of RAM to PAGE_OFFSET.  From this point on we can't safely
  * call OF any more.
  */
 initial_bats:
-	lis	r11,KERNELBASE@h
+	lis	r11,PAGE_OFFSET@h
 	mfspr	r9,SPRN_PVR
 	rlwinm	r9,r9,16,16,31		/* r9 = 1 for 601, 4 for 604 */
 	cmpwi	0,r9,1
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 578294c3b1ce..666a5e8a5be1 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -48,7 +48,7 @@
 
 #if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL)
 /* The ammount of lowmem must be within 0xF0000000 - KERNELBASE. */
-#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - KERNELBASE))
+#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - PAGE_OFFSET))
 #error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_START_KERNEL"
 #endif
 #endif
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index cd5609759d44..8cba46fc9e3b 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -269,7 +269,7 @@ int map_page(unsigned long va, phys_addr_t pa, int flags)
 }
 
 /*
- * Map in a big chunk of physical memory starting at KERNELBASE.
+ * Map in a big chunk of physical memory starting at PAGE_OFFSET.
  */
 void __init mapin_ram(void)
 {
@@ -278,7 +278,7 @@ void __init mapin_ram(void)
 	int ktext;
 
 	s = mmu_mapin_ram();
-	v = KERNELBASE + s;
+	v = PAGE_OFFSET + s;
 	p = memstart_addr + s;
 	for (; s < total_lowmem; s += PAGE_SIZE) {
 		ktext = ((char *) v >= _stext && (char *) v < etext);
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index 9d97db7b7cf7..45d925360b89 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -95,16 +95,16 @@ unsigned long __init mmu_mapin_ram(void)
 			break;
 	}
 
-	setbat(2, KERNELBASE, 0, bl, _PAGE_RAM);
-	done = (unsigned long)bat_addrs[2].limit - KERNELBASE + 1;
+	setbat(2, PAGE_OFFSET, 0, bl, _PAGE_RAM);
+	done = (unsigned long)bat_addrs[2].limit - PAGE_OFFSET + 1;
 	if ((done < tot) && !bat_addrs[3].limit) {
 		/* use BAT3 to cover a bit more */
 		tot -= done;
 		for (bl = 128<<10; bl < max_size; bl <<= 1)
 			if (bl * 2 > tot)
 				break;
-		setbat(3, KERNELBASE+done, done, bl, _PAGE_RAM);
-		done = (unsigned long)bat_addrs[3].limit - KERNELBASE + 1;
+		setbat(3, PAGE_OFFSET+done, done, bl, _PAGE_RAM);
+		done = (unsigned long)bat_addrs[3].limit - PAGE_OFFSET + 1;
 	}
 
 	return done;
-- 
cgit v1.2.3


From ca9153a3a2a7556d091dfe080e42b0e67881fff6 Mon Sep 17 00:00:00 2001
From: Ilya Yanok <yanok@emcraft.com>
Date: Thu, 11 Dec 2008 04:55:41 +0300
Subject: powerpc/44x: Support 16K/64K base page sizes on 44x

This adds support for 16k and 64k page sizes on PowerPC 44x processors.

The PGDIR table is much smaller than a page when using 16k or 64k
pages (512 and 32 bytes respectively) so we allocate the PGDIR with
kzalloc() instead of __get_free_pages().

One PTE table covers rather a large memory area when using 16k or 64k
pages (32MB or 512MB respectively), so we can easily put FIXMAP and
PKMAP in the area covered by one PTE table.

Signed-off-by: Yuri Tikhonov <yur@emcraft.com>
Signed-off-by: Vladimir Panfilov <pvr@emcraft.com>
Signed-off-by: Ilya Yanok <yanok@emcraft.com>
Acked-by: Josh Boyer <jwboyer@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/Kconfig                   | 58 ++++++++++++++++++++++++++--------
 arch/powerpc/include/asm/highmem.h     | 19 +++++++++--
 arch/powerpc/include/asm/mmu-44x.h     | 17 ++++++++++
 arch/powerpc/include/asm/page.h        | 13 +++++---
 arch/powerpc/include/asm/page_32.h     |  7 ++--
 arch/powerpc/kernel/asm-offsets.c      |  4 +++
 arch/powerpc/kernel/head_44x.S         | 23 ++++++++------
 arch/powerpc/kernel/misc_32.S          | 12 +++----
 arch/powerpc/mm/pgtable_32.c           | 23 ++++++++------
 arch/powerpc/platforms/Kconfig.cputype |  2 +-
 10 files changed, 130 insertions(+), 48 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index f7f5448f863d..1af22579e3d4 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -405,23 +405,53 @@ config PPC_HAS_HASH_64K
 	depends on PPC64
 	default n
 
-config PPC_64K_PAGES
-	bool "64k page size"
-	depends on PPC64
-	select PPC_HAS_HASH_64K
+choice
+	prompt "Page size"
+	default PPC_4K_PAGES
 	help
-	  This option changes the kernel logical page size to 64k. On machines
-	  without processor support for 64k pages, the kernel will simulate
-	  them by loading each individual 4k page on demand transparently,
-	  while on hardware with such support, it will be used to map
-	  normal application pages.
+	  Select the kernel logical page size. Increasing the page size
+	  will reduce software overhead at each page boundary, allow
+	  hardware prefetch mechanisms to be more effective, and allow
+	  larger dma transfers increasing IO efficiency and reducing
+	  overhead. However the utilization of memory will increase.
+	  For example, each cached file will use a multiple of the
+	  page size to hold its contents and the difference between the
+	  end of file and the end of page is wasted.
+
+	  Some dedicated systems, such as software raid serving with
+	  accelerated calculations, have shown significant increases.
+
+	  If you configure a 64 bit kernel for 64k pages but the
+	  processor does not support them, then the kernel will simulate
+	  them with 4k pages, loading them on demand, but with the
+	  reduced software overhead and larger internal fragmentation.
+	  For the 32 bit kernel, a large page option will not be offered
+	  unless it is supported by the configured processor.
+
+	  If unsure, choose 4K_PAGES.
+
+config PPC_4K_PAGES
+	bool "4k page size"
+
+config PPC_16K_PAGES
+	bool "16k page size" if 44x
+
+config PPC_64K_PAGES
+	bool "64k page size" if 44x || PPC_STD_MMU_64
+	select PPC_HAS_HASH_64K if PPC_STD_MMU_64
+
+endchoice
 
 config FORCE_MAX_ZONEORDER
 	int "Maximum zone order"
-	range 9 64 if PPC_64K_PAGES
-	default "9" if PPC_64K_PAGES
-	range 13 64 if PPC64 && !PPC_64K_PAGES
-	default "13" if PPC64 && !PPC_64K_PAGES
+	range 9 64 if PPC_STD_MMU_64 && PPC_64K_PAGES
+	default "9" if PPC_STD_MMU_64 && PPC_64K_PAGES
+	range 13 64 if PPC_STD_MMU_64 && !PPC_64K_PAGES
+	default "13" if PPC_STD_MMU_64 && !PPC_64K_PAGES
+	range 9 64 if PPC_STD_MMU_32 && PPC_16K_PAGES
+	default "9" if PPC_STD_MMU_32 && PPC_16K_PAGES
+	range 7 64 if PPC_STD_MMU_32 && PPC_64K_PAGES
+	default "7" if PPC_STD_MMU_32 && PPC_64K_PAGES
 	range 11 64
 	default "11"
 	help
@@ -441,7 +471,7 @@ config FORCE_MAX_ZONEORDER
 
 config PPC_SUBPAGE_PROT
 	bool "Support setting protections for 4k subpages"
-	depends on PPC_64K_PAGES
+	depends on PPC_STD_MMU_64 && PPC_64K_PAGES
 	help
 	  This option adds support for a system call to allow user programs
 	  to set access permissions (read/write, readonly, or no access)
diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h
index fd97e501aa6a..04e4a620952e 100644
--- a/arch/powerpc/include/asm/highmem.h
+++ b/arch/powerpc/include/asm/highmem.h
@@ -38,9 +38,24 @@ extern pte_t *pkmap_page_table;
  * easily, subsequent pte tables have to be allocated in one physical
  * chunk of RAM.
  */
-#define LAST_PKMAP 	(1 << PTE_SHIFT)
-#define LAST_PKMAP_MASK (LAST_PKMAP-1)
+/*
+ * We use one full pte table with 4K pages. And with 16K/64K pages pte
+ * table covers enough memory (32MB and 512MB resp.) that both FIXMAP
+ * and PKMAP can be placed in single pte table. We use 1024 pages for
+ * PKMAP in case of 16K/64K pages.
+ */
+#ifdef CONFIG_PPC_4K_PAGES
+#define PKMAP_ORDER	PTE_SHIFT
+#else
+#define PKMAP_ORDER	10
+#endif
+#define LAST_PKMAP	(1 << PKMAP_ORDER)
+#ifndef CONFIG_PPC_4K_PAGES
+#define PKMAP_BASE	(FIXADDR_START - PAGE_SIZE*(LAST_PKMAP + 1))
+#else
 #define PKMAP_BASE	((FIXADDR_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)
+#endif
+#define LAST_PKMAP_MASK	(LAST_PKMAP-1)
 #define PKMAP_NR(virt)  ((virt-PKMAP_BASE) >> PAGE_SHIFT)
 #define PKMAP_ADDR(nr)  (PKMAP_BASE + ((nr) << PAGE_SHIFT))
 
diff --git a/arch/powerpc/include/asm/mmu-44x.h b/arch/powerpc/include/asm/mmu-44x.h
index b21af32ac6d6..8a97cfb08b7e 100644
--- a/arch/powerpc/include/asm/mmu-44x.h
+++ b/arch/powerpc/include/asm/mmu-44x.h
@@ -4,6 +4,8 @@
  * PPC440 support
  */
 
+#include <asm/page.h>
+
 #define PPC44x_MMUCR_TID	0x000000ff
 #define PPC44x_MMUCR_STS	0x00010000
 
@@ -74,4 +76,19 @@ typedef struct {
 /* Size of the TLBs used for pinning in lowmem */
 #define PPC_PIN_SIZE	(1 << 28)	/* 256M */
 
+#if (PAGE_SHIFT == 12)
+#define PPC44x_TLBE_SIZE	PPC44x_TLB_4K
+#elif (PAGE_SHIFT == 14)
+#define PPC44x_TLBE_SIZE	PPC44x_TLB_16K
+#elif (PAGE_SHIFT == 16)
+#define PPC44x_TLBE_SIZE	PPC44x_TLB_64K
+#else
+#error "Unsupported PAGE_SIZE"
+#endif
+
+#define PPC44x_PGD_OFF_SHIFT	(32 - PGDIR_SHIFT + PGD_T_LOG2)
+#define PPC44x_PGD_OFF_MASK_BIT	(PGDIR_SHIFT - PGD_T_LOG2)
+#define PPC44x_PTE_ADD_SHIFT	(32 - PGDIR_SHIFT + PTE_SHIFT + PTE_T_LOG2)
+#define PPC44x_PTE_ADD_MASK_BIT	(32 - PTE_T_LOG2 - PTE_SHIFT)
+
 #endif /* _ASM_POWERPC_MMU_44X_H_ */
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index c0b8d4a29a91..197d569f5bd3 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -19,12 +19,15 @@
 #include <asm/kdump.h>
 
 /*
- * On PPC32 page size is 4K. For PPC64 we support either 4K or 64K software
+ * On regular PPC32 page size is 4K (but we support 4K/16K/64K pages
+ * on PPC44x). For PPC64 we support either 4K or 64K software
  * page size. When using 64K pages however, whether we are really supporting
  * 64K pages in HW or not is irrelevant to those definitions.
  */
-#ifdef CONFIG_PPC_64K_PAGES
+#if defined(CONFIG_PPC_64K_PAGES)
 #define PAGE_SHIFT		16
+#elif defined(CONFIG_PPC_16K_PAGES)
+#define PAGE_SHIFT		14
 #else
 #define PAGE_SHIFT		12
 #endif
@@ -151,7 +154,7 @@ typedef struct { pte_basic_t pte; } pte_t;
 /* 64k pages additionally define a bigger "real PTE" type that gathers
  * the "second half" part of the PTE for pseudo 64k pages
  */
-#ifdef CONFIG_PPC_64K_PAGES
+#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64)
 typedef struct { pte_t pte; unsigned long hidx; } real_pte_t;
 #else
 typedef struct { pte_t pte; } real_pte_t;
@@ -191,10 +194,10 @@ typedef pte_basic_t pte_t;
 #define pte_val(x)	(x)
 #define __pte(x)	(x)
 
-#ifdef CONFIG_PPC_64K_PAGES
+#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64)
 typedef struct { pte_t pte; unsigned long hidx; } real_pte_t;
 #else
-typedef unsigned long real_pte_t;
+typedef pte_t real_pte_t;
 #endif
 
 
diff --git a/arch/powerpc/include/asm/page_32.h b/arch/powerpc/include/asm/page_32.h
index d77072a32cc6..1458d9500381 100644
--- a/arch/powerpc/include/asm/page_32.h
+++ b/arch/powerpc/include/asm/page_32.h
@@ -19,6 +19,8 @@
 #define PTE_FLAGS_OFFSET	0
 #endif
 
+#define PTE_SHIFT	(PAGE_SHIFT - PTE_T_LOG2)	/* full page */
+
 #ifndef __ASSEMBLY__
 /*
  * The basic type of a PTE - 64 bits for those CPUs with > 32 bit
@@ -26,10 +28,8 @@
  */
 #ifdef CONFIG_PTE_64BIT
 typedef unsigned long long pte_basic_t;
-#define PTE_SHIFT	(PAGE_SHIFT - 3)	/* 512 ptes per page */
 #else
 typedef unsigned long pte_basic_t;
-#define PTE_SHIFT	(PAGE_SHIFT - 2)	/* 1024 ptes per page */
 #endif
 
 struct page;
@@ -39,6 +39,9 @@ extern void copy_page(void *to, void *from);
 
 #include <asm-generic/page.h>
 
+#define PGD_T_LOG2	(__builtin_ffs(sizeof(pgd_t)) - 1)
+#define PTE_T_LOG2	(__builtin_ffs(sizeof(pte_t)) - 1)
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_PAGE_32_H */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index c05ab1d3e620..661d07d2146b 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -380,6 +380,10 @@ int main(void)
 	DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
 	DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr));
 #endif
+#ifdef CONFIG_44x
+	DEFINE(PGD_T_LOG2, PGD_T_LOG2);
+	DEFINE(PTE_T_LOG2, PTE_T_LOG2);
+#endif
 
 	return 0;
 }
diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S
index bd4fe9e7278b..b56fecc93a16 100644
--- a/arch/powerpc/kernel/head_44x.S
+++ b/arch/powerpc/kernel/head_44x.S
@@ -402,12 +402,14 @@ interrupt_base:
 	rlwimi	r13,r12,10,30,30
 
 	/* Load the PTE */
-	rlwinm 	r12, r10, 13, 19, 29	/* Compute pgdir/pmd offset */
+	/* Compute pgdir/pmd offset */
+	rlwinm  r12, r10, PPC44x_PGD_OFF_SHIFT, PPC44x_PGD_OFF_MASK_BIT, 29
 	lwzx	r11, r12, r11		/* Get pgd/pmd entry */
 	rlwinm.	r12, r11, 0, 0, 20	/* Extract pt base address */
 	beq	2f			/* Bail if no table */
 
-	rlwimi	r12, r10, 23, 20, 28	/* Compute pte address */
+	/* Compute pte address */
+	rlwimi  r12, r10, PPC44x_PTE_ADD_SHIFT, PPC44x_PTE_ADD_MASK_BIT, 28
 	lwz	r11, 0(r12)		/* Get high word of pte entry */
 	lwz	r12, 4(r12)		/* Get low word of pte entry */
 
@@ -496,12 +498,14 @@ tlb_44x_patch_hwater_D:
 	/* Make up the required permissions */
 	li	r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_HWEXEC
 
-	rlwinm	r12, r10, 13, 19, 29	/* Compute pgdir/pmd offset */
+	/* Compute pgdir/pmd offset */
+	rlwinm 	r12, r10, PPC44x_PGD_OFF_SHIFT, PPC44x_PGD_OFF_MASK_BIT, 29
 	lwzx	r11, r12, r11		/* Get pgd/pmd entry */
 	rlwinm.	r12, r11, 0, 0, 20	/* Extract pt base address */
 	beq	2f			/* Bail if no table */
 
-	rlwimi	r12, r10, 23, 20, 28	/* Compute pte address */
+	/* Compute pte address */
+	rlwimi	r12, r10, PPC44x_PTE_ADD_SHIFT, PPC44x_PTE_ADD_MASK_BIT, 28
 	lwz	r11, 0(r12)		/* Get high word of pte entry */
 	lwz	r12, 4(r12)		/* Get low word of pte entry */
 
@@ -565,15 +569,16 @@ tlb_44x_patch_hwater_I:
  */
 finish_tlb_load:
 	/* Combine RPN & ERPN an write WS 0 */
-	rlwimi	r11,r12,0,0,19
+	rlwimi	r11,r12,0,0,31-PAGE_SHIFT
 	tlbwe	r11,r13,PPC44x_TLB_XLAT
 
 	/*
 	 * Create WS1. This is the faulting address (EPN),
 	 * page size, and valid flag.
 	 */
-	li	r11,PPC44x_TLB_VALID | PPC44x_TLB_4K
-	rlwimi	r10,r11,0,20,31			/* Insert valid and page size*/
+	li	r11,PPC44x_TLB_VALID | PPC44x_TLBE_SIZE
+	/* Insert valid and page size */
+	rlwimi	r10,r11,0,PPC44x_PTE_ADD_MASK_BIT,31
 	tlbwe	r10,r13,PPC44x_TLB_PAGEID	/* Write PAGEID */
 
 	/* And WS 2 */
@@ -645,12 +650,12 @@ _GLOBAL(set_context)
  * goes at the beginning of the data segment, which is page-aligned.
  */
 	.data
-	.align	12
+	.align	PAGE_SHIFT
 	.globl	sdata
 sdata:
 	.globl	empty_zero_page
 empty_zero_page:
-	.space	4096
+	.space	PAGE_SIZE
 
 /*
  * To support >32-bit physical addresses, we use an 8KB pgdir.
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index ae0d084b6a24..15f28e0de78d 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -426,8 +426,8 @@ _GLOBAL(__flush_dcache_icache)
 BEGIN_FTR_SECTION
 	blr
 END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
-	rlwinm	r3,r3,0,0,19			/* Get page base address */
-	li	r4,4096/L1_CACHE_BYTES	/* Number of lines in a page */
+	rlwinm	r3,r3,0,0,31-PAGE_SHIFT		/* Get page base address */
+	li	r4,PAGE_SIZE/L1_CACHE_BYTES	/* Number of lines in a page */
 	mtctr	r4
 	mr	r6,r3
 0:	dcbst	0,r3				/* Write line to ram */
@@ -467,8 +467,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 	rlwinm	r0,r10,0,28,26			/* clear DR */
 	mtmsr	r0
 	isync
-	rlwinm	r3,r3,0,0,19			/* Get page base address */
-	li	r4,4096/L1_CACHE_BYTES	/* Number of lines in a page */
+	rlwinm	r3,r3,0,0,31-PAGE_SHIFT		/* Get page base address */
+	li	r4,PAGE_SIZE/L1_CACHE_BYTES	/* Number of lines in a page */
 	mtctr	r4
 	mr	r6,r3
 0:	dcbst	0,r3				/* Write line to ram */
@@ -492,7 +492,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
  * void clear_pages(void *page, int order) ;
  */
 _GLOBAL(clear_pages)
-	li	r0,4096/L1_CACHE_BYTES
+	li	r0,PAGE_SIZE/L1_CACHE_BYTES
 	slw	r0,r0,r4
 	mtctr	r0
 #ifdef CONFIG_8xx
@@ -550,7 +550,7 @@ _GLOBAL(copy_page)
 	dcbt	r5,r4
 	li	r11,L1_CACHE_BYTES+4
 #endif /* MAX_COPY_PREFETCH */
-	li	r0,4096/L1_CACHE_BYTES - MAX_COPY_PREFETCH
+	li	r0,PAGE_SIZE/L1_CACHE_BYTES - MAX_COPY_PREFETCH
 	crclr	4*cr0+eq
 2:
 	mtctr	r0
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 8cba46fc9e3b..38ff35f2142a 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -68,24 +68,29 @@ extern unsigned long p_mapped_by_tlbcam(unsigned long pa);
 #define p_mapped_by_tlbcam(x)	(0UL)
 #endif /* HAVE_TLBCAM */
 
-#ifdef CONFIG_PTE_64BIT
-/* Some processors use an 8kB pgdir because they have 8-byte Linux PTEs. */
-#define PGDIR_ORDER	1
-#else
-#define PGDIR_ORDER	0
-#endif
+#define PGDIR_ORDER	(32 + PGD_T_LOG2 - PGDIR_SHIFT)
 
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *ret;
 
-	ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER);
+	/* pgdir take page or two with 4K pages and a page fraction otherwise */
+#ifndef CONFIG_PPC_4K_PAGES
+	ret = (pgd_t *)kzalloc(1 << PGDIR_ORDER, GFP_KERNEL);
+#else
+	ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
+			PGDIR_ORDER - PAGE_SHIFT);
+#endif
 	return ret;
 }
 
 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-	free_pages((unsigned long)pgd, PGDIR_ORDER);
+#ifndef CONFIG_PPC_4K_PAGES
+	kfree((void *)pgd);
+#else
+	free_pages((unsigned long)pgd, PGDIR_ORDER - PAGE_SHIFT);
+#endif
 }
 
 __init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
@@ -385,7 +390,7 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
 static int fixmaps;
-unsigned long FIXADDR_TOP = 0xfffff000;
+unsigned long FIXADDR_TOP = (-PAGE_SIZE);
 EXPORT_SYMBOL(FIXADDR_TOP);
 
 void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index db61dafb924d..3d0c776f888d 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -212,7 +212,7 @@ config PPC_MMU_NOHASH
 
 config PPC_MM_SLICES
 	bool
-	default y if HUGETLB_PAGE || PPC_64K_PAGES
+	default y if HUGETLB_PAGE || (PPC_STD_MMU_64 && PPC_64K_PAGES)
 	default n
 
 config VIRT_CPU_ACCOUNTING
-- 
cgit v1.2.3