16 files changed, 3919 insertions, 71 deletions
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 35497deeb4b2..612bc4ec72b1 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -5,8 +5,14 @@
 obj-y				:= fault.o mem.o lmb.o
 obj-$(CONFIG_PPC32)		+= init_32.o pgtable_32.o mmu_context_32.o \
 				   tlb_32.o
-obj-$(CONFIG_PPC64)		+= init_64.o pgtable_64.o mmu_context_64.o
+hash-$(CONFIG_PPC_MULTIPLATFORM) := hash_native_64.o
+obj-$(CONFIG_PPC64)		+= init_64.o pgtable_64.o mmu_context_64.o \
+				   hash_utils_64.o hash_low_64.o tlb_64.o \
+				   slb_low.o slb.o stab.o mmap.o imalloc.o \
+				   $(hash-y)
 obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o hash_low_32.o
 obj-$(CONFIG_40x)		+= 4xx_mmu.o
 obj-$(CONFIG_44x)		+= 44x_mmu.o
 obj-$(CONFIG_FSL_BOOKE)		+= fsl_booke_mmu.o
+obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
+obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
new file mode 100644
index 000000000000..d6ed9102eeea
--- /dev/null
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -0,0 +1,288 @@
+/*
+ * ppc64 MMU hashtable management routines
+ *
+ * (c) Copyright IBM Corp. 2003
+ *
+ * Maintained by: Benjamin Herrenschmidt
+ *                <benh@kernel.crashing.org>
+ *
+ * This file is covered by the GNU Public Licence v2 as
+ * described in the kernel's COPYING file.
+ */
+
+#include <asm/reg.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/page.h>
+#include <asm/types.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cputable.h>
+
+	.text
+
+/*
+ * Stackframe:
+ *		
+ *         +-> Back chain			(SP + 256)
+ *         |   General register save area	(SP + 112)
+ *         |   Parameter save area		(SP + 48)
+ *         |   TOC save area			(SP + 40)
+ *         |   link editor doubleword		(SP + 32)
+ *         |   compiler doubleword		(SP + 24)
+ *         |   LR save area			(SP + 16)
+ *         |   CR save area			(SP + 8)
+ * SP ---> +-- Back chain			(SP + 0)
+ */
+#define STACKFRAMESIZE	256
+
+/* Save parameters offsets */
+#define STK_PARM(i)	(STACKFRAMESIZE + 48 + ((i)-3)*8)
+
+/* Save non-volatile offsets */
+#define STK_REG(i)	(112 + ((i)-14)*8)
+
+/*
+ * _hash_page(unsigned long ea, unsigned long access, unsigned long vsid,
+ *		pte_t *ptep, unsigned long trap, int local)
+ *
+ * Adds a page to the hash table. This is the non-LPAR version for now
+ */
+
+_GLOBAL(__hash_page)
+	mflr	r0
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	/* Save all params that we need after a function call */
+	std	r6,STK_PARM(r6)(r1)
+	std	r8,STK_PARM(r8)(r1)
+	
+	/* Add _PAGE_PRESENT to access */
+	ori	r4,r4,_PAGE_PRESENT
+
+	/* Save non-volatile registers.
+	 * r31 will hold "old PTE"
+	 * r30 is "new PTE"
+	 * r29 is "va"
+	 * r28 is a hash value
+	 * r27 is hashtab mask (maybe dynamic patched instead ?)
+	 */
+	std	r27,STK_REG(r27)(r1)
+	std	r28,STK_REG(r28)(r1)
+	std	r29,STK_REG(r29)(r1)
+	std	r30,STK_REG(r30)(r1)
+	std	r31,STK_REG(r31)(r1)
+	
+	/* Step 1:
+	 *
+	 * Check permissions, atomically mark the linux PTE busy
+	 * and hashed.
+	 */ 
+1:
+	ldarx	r31,0,r6
+	/* Check access rights (access & ~(pte_val(*ptep))) */
+	andc.	r0,r4,r31
+	bne-	htab_wrong_access
+	/* Check if PTE is busy */
+	andi.	r0,r31,_PAGE_BUSY
+	/* If so, just bail out and refault if needed. Someone else
+	 * is changing this PTE anyway and might hash it.
+	 */
+	bne-	bail_ok
+	/* Prepare new PTE value (turn access RW into DIRTY, then
+	 * add BUSY,HASHPTE and ACCESSED)
+	 */
+	rlwinm	r30,r4,32-9+7,31-7,31-7	/* _PAGE_RW -> _PAGE_DIRTY */
+	or	r30,r30,r31
+	ori	r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
+	/* Write the linux PTE atomically (setting busy) */
+	stdcx.	r30,0,r6
+	bne-	1b
+	isync
+
+	/* Step 2:
+	 *
+	 * Insert/Update the HPTE in the hash table. At this point,
+	 * r4 (access) is re-useable, we use it for the new HPTE flags
+	 */
+
+	/* Calc va and put it in r29 */
+	rldicr	r29,r5,28,63-28
+	rldicl	r3,r3,0,36
+	or	r29,r3,r29
+
+	/* Calculate hash value for primary slot and store it in r28 */
+	rldicl	r5,r5,0,25		/* vsid & 0x0000007fffffffff */
+	rldicl	r0,r3,64-12,48		/* (ea >> 12) & 0xffff */
+	xor	r28,r5,r0
+
+	/* Convert linux PTE bits into HW equivalents */
+	andi.	r3,r30,0x1fe		/* Get basic set of flags */
+	xori	r3,r3,HW_NO_EXEC	/* _PAGE_EXEC -> NOEXEC */
+	rlwinm	r0,r30,32-9+1,30,30	/* _PAGE_RW -> _PAGE_USER (r0) */
+	rlwinm	r4,r30,32-7+1,30,30	/* _PAGE_DIRTY -> _PAGE_USER (r4) */
+	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY -> r0 bit 30 */
+	andc	r0,r30,r0		/* r0 = pte & ~r0 */
+	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
+
+	/* We eventually do the icache sync here (maybe inline that
+	 * code rather than call a C function...) 
+	 */
+BEGIN_FTR_SECTION
+	mr	r4,r30
+	mr	r5,r7
+	bl	.hash_page_do_lazy_icache
+END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
+
+	/* At this point, r3 contains new PP bits, save them in
+	 * place of "access" in the param area (sic)
+	 */
+	std	r3,STK_PARM(r4)(r1)
+
+	/* Get htab_hash_mask */
+	ld	r4,htab_hash_mask@got(2)
+	ld	r27,0(r4)	/* htab_hash_mask -> r27 */
+
+	/* Check if we may already be in the hashtable, in this case, we
+	 * go to out-of-line code to try to modify the HPTE
+	 */
+	andi.	r0,r31,_PAGE_HASHPTE
+	bne	htab_modify_pte
+
+htab_insert_pte:
+	/* Clear hpte bits in new pte (we also clear BUSY btw) and
+	 * add _PAGE_HASHPTE
+	 */
+	lis	r0,_PAGE_HPTEFLAGS@h
+	ori	r0,r0,_PAGE_HPTEFLAGS@l
+	andc	r30,r30,r0
+	ori	r30,r30,_PAGE_HASHPTE
+
+	/* page number in r5 */
+	rldicl	r5,r31,64-PTE_SHIFT,PTE_SHIFT
+
+	/* Calculate primary group hash */
+	and	r0,r28,r27
+	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */
+
+	/* Call ppc_md.hpte_insert */
+	ld	r7,STK_PARM(r4)(r1)	/* Retreive new pp bits */
+	mr	r4,r29			/* Retreive va */
+	li	r6,0			/* no vflags */
+_GLOBAL(htab_call_hpte_insert1)
+	bl	.			/* Will be patched by htab_finish_init() */
+	cmpdi	0,r3,0
+	bge	htab_pte_insert_ok	/* Insertion successful */
+	cmpdi	0,r3,-2			/* Critical failure */
+	beq-	htab_pte_insert_failure
+
+	/* Now try secondary slot */
+	
+	/* page number in r5 */
+	rldicl	r5,r31,64-PTE_SHIFT,PTE_SHIFT
+
+	/* Calculate secondary group hash */
+	andc	r0,r27,r28
+	rldicr	r3,r0,3,63-3	/* r0 = (~hash & mask) << 3 */
+	
+	/* Call ppc_md.hpte_insert */
+	ld	r7,STK_PARM(r4)(r1)	/* Retreive new pp bits */
+	mr	r4,r29			/* Retreive va */
+	li	r6,HPTE_V_SECONDARY@l	/* secondary slot */
+_GLOBAL(htab_call_hpte_insert2)
+	bl	.			/* Will be patched by htab_finish_init() */
+	cmpdi	0,r3,0
+	bge+	htab_pte_insert_ok	/* Insertion successful */
+	cmpdi	0,r3,-2			/* Critical failure */
+	beq-	htab_pte_insert_failure
+
+	/* Both are full, we need to evict something */
+	mftb	r0
+	/* Pick a random group based on TB */
+	andi.	r0,r0,1
+	mr	r5,r28
+	bne	2f
+	not	r5,r5
+2:	and	r0,r5,r27
+	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */	
+	/* Call ppc_md.hpte_remove */
+_GLOBAL(htab_call_hpte_remove)
+	bl	.			/* Will be patched by htab_finish_init() */
+
+	/* Try all again */
+	b	htab_insert_pte	
+
+bail_ok:
+	li	r3,0
+	b	bail
+
+htab_pte_insert_ok:
+	/* Insert slot number & secondary bit in PTE */
+	rldimi	r30,r3,12,63-15
+		
+	/* Write out the PTE with a normal write
+	 * (maybe add eieio may be good still ?)
+	 */
+htab_write_out_pte:
+	ld	r6,STK_PARM(r6)(r1)
+	std	r30,0(r6)
+	li	r3, 0
+bail:
+	ld	r27,STK_REG(r27)(r1)
+	ld	r28,STK_REG(r28)(r1)
+	ld	r29,STK_REG(r29)(r1)
+	ld      r30,STK_REG(r30)(r1)
+	ld      r31,STK_REG(r31)(r1)
+	addi    r1,r1,STACKFRAMESIZE
+	ld      r0,16(r1)
+	mtlr    r0
+	blr
+
+htab_modify_pte:
+	/* Keep PP bits in r4 and slot idx from the PTE around in r3 */
+	mr	r4,r3
+	rlwinm	r3,r31,32-12,29,31
+
+	/* Secondary group ? if yes, get a inverted hash value */
+	mr	r5,r28
+	andi.	r0,r31,_PAGE_SECONDARY
+	beq	1f
+	not	r5,r5
+1:
+	/* Calculate proper slot value for ppc_md.hpte_updatepp */
+	and	r0,r5,r27
+	rldicr	r0,r0,3,63-3	/* r0 = (hash & mask) << 3 */
+	add	r3,r0,r3	/* add slot idx */
+
+	/* Call ppc_md.hpte_updatepp */
+	mr	r5,r29			/* va */
+	li	r6,0			/* large is 0 */
+	ld	r7,STK_PARM(r8)(r1)	/* get "local" param */
+_GLOBAL(htab_call_hpte_updatepp)
+	bl	.			/* Will be patched by htab_finish_init() */
+
+	/* if we failed because typically the HPTE wasn't really here
+	 * we try an insertion. 
+	 */
+	cmpdi	0,r3,-1
+	beq-	htab_insert_pte
+
+	/* Clear the BUSY bit and Write out the PTE */
+	li	r0,_PAGE_BUSY
+	andc	r30,r30,r0
+	b	htab_write_out_pte
+
+htab_wrong_access:
+	/* Bail out clearing reservation */
+	stdcx.	r31,0,r6
+	li	r3,1
+	b	bail
+
+htab_pte_insert_failure:
+	/* Bail out restoring old PTE */
+	ld	r6,STK_PARM(r6)(r1)
+	std	r31,0(r6)
+	li	r3,-1
+	b	bail
+
+
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
new file mode 100644
index 000000000000..174d14576c28
--- /dev/null
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -0,0 +1,446 @@
+/*
+ * native hashtable management.
+ *
+ * SMP scalability work:
+ *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/spinlock.h>
+#include <linux/bitops.h>
+#include <linux/threads.h>
+#include <linux/smp.h>
+
+#include <asm/abs_addr.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/cputable.h>
+
+#define HPTE_LOCK_BIT 3
+
+static DEFINE_SPINLOCK(native_tlbie_lock);
+
+static inline void native_lock_hpte(hpte_t *hptep)
+{
+	unsigned long *word = &hptep->v;
+
+	while (1) {
+		if (!test_and_set_bit(HPTE_LOCK_BIT, word))
+			break;
+		while(test_bit(HPTE_LOCK_BIT, word))
+			cpu_relax();
+	}
+}
+
+static inline void native_unlock_hpte(hpte_t *hptep)
+{
+	unsigned long *word = &hptep->v;
+
+	asm volatile("lwsync":::"memory");
+	clear_bit(HPTE_LOCK_BIT, word);
+}
+
+long native_hpte_insert(unsigned long hpte_group, unsigned long va,
+			unsigned long prpn, unsigned long vflags,
+			unsigned long rflags)
+{
+	hpte_t *hptep = htab_address + hpte_group;
+	unsigned long hpte_v, hpte_r;
+	int i;
+
+	for (i = 0; i < HPTES_PER_GROUP; i++) {
+		if (! (hptep->v & HPTE_V_VALID)) {
+			/* retry with lock held */
+			native_lock_hpte(hptep);
+			if (! (hptep->v & HPTE_V_VALID))
+				break;
+			native_unlock_hpte(hptep);
+		}
+
+		hptep++;
+	}
+
+	if (i == HPTES_PER_GROUP)
+		return -1;
+
+	hpte_v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID;
+	if (vflags & HPTE_V_LARGE)
+		va &= ~(1UL << HPTE_V_AVPN_SHIFT);
+	hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags;
+
+	hptep->r = hpte_r;
+	/* Guarantee the second dword is visible before the valid bit */
+	__asm__ __volatile__ ("eieio" : : : "memory");
+	/*
+	 * Now set the first dword including the valid bit
+	 * NOTE: this also unlocks the hpte
+	 */
+	hptep->v = hpte_v;
+
+	__asm__ __volatile__ ("ptesync" : : : "memory");
+
+	return i | (!!(vflags & HPTE_V_SECONDARY) << 3);
+}
+
+static long native_hpte_remove(unsigned long hpte_group)
+{
+	hpte_t *hptep;
+	int i;
+	int slot_offset;
+	unsigned long hpte_v;
+
+	/* pick a random entry to start at */
+	slot_offset = mftb() & 0x7;
+
+	for (i = 0; i < HPTES_PER_GROUP; i++) {
+		hptep = htab_address + hpte_group + slot_offset;
+		hpte_v = hptep->v;
+
+		if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) {
+			/* retry with lock held */
+			native_lock_hpte(hptep);
+			hpte_v = hptep->v;
+			if ((hpte_v & HPTE_V_VALID)
+			    && !(hpte_v & HPTE_V_BOLTED))
+				break;
+			native_unlock_hpte(hptep);
+		}
+
+		slot_offset++;
+		slot_offset &= 0x7;
+	}
+
+	if (i == HPTES_PER_GROUP)
+		return -1;
+
+	/* Invalidate the hpte. NOTE: this also unlocks it */
+	hptep->v = 0;
+
+	return i;
+}
+
+static inline void set_pp_bit(unsigned long pp, hpte_t *addr)
+{
+	unsigned long old;
+	unsigned long *p = &addr->r;
+
+	__asm__ __volatile__(
+	"1:	ldarx	%0,0,%3\n\
+		rldimi	%0,%2,0,61\n\
+		stdcx.	%0,0,%3\n\
+		bne	1b"
+	: "=&r" (old), "=m" (*p)
+	: "r" (pp), "r" (p), "m" (*p)
+	: "cc");
+}
+
+/*
+ * Only works on small pages. Yes its ugly to have to check each slot in
+ * the group but we only use this during bootup.
+ */
+static long native_hpte_find(unsigned long vpn)
+{
+	hpte_t *hptep;
+	unsigned long hash;
+	unsigned long i, j;
+	long slot;
+	unsigned long hpte_v;
+
+	hash = hpt_hash(vpn, 0);
+
+	for (j = 0; j < 2; j++) {
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		for (i = 0; i < HPTES_PER_GROUP; i++) {
+			hptep = htab_address + slot;
+			hpte_v = hptep->v;
+
+			if ((HPTE_V_AVPN_VAL(hpte_v) == (vpn >> 11))
+			    && (hpte_v & HPTE_V_VALID)
+			    && ( !!(hpte_v & HPTE_V_SECONDARY) == j)) {
+				/* HPTE matches */
+				if (j)
+					slot = -slot;
+				return slot;
+			}
+			++slot;
+		}
+		hash = ~hash;
+	}
+
+	return -1;
+}
+
+static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
+				 unsigned long va, int large, int local)
+{
+	hpte_t *hptep = htab_address + slot;
+	unsigned long hpte_v;
+	unsigned long avpn = va >> 23;
+	int ret = 0;
+
+	if (large)
+		avpn &= ~1;
+
+	native_lock_hpte(hptep);
+
+	hpte_v = hptep->v;
+
+	/* Even if we miss, we need to invalidate the TLB */
+	if ((HPTE_V_AVPN_VAL(hpte_v) != avpn)
+	    || !(hpte_v & HPTE_V_VALID)) {
+		native_unlock_hpte(hptep);
+		ret = -1;
+	} else {
+		set_pp_bit(newpp, hptep);
+		native_unlock_hpte(hptep);
+	}
+
+	/* Ensure it is out of the tlb too */
+	if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
+		tlbiel(va);
+	} else {
+		int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+
+		if (lock_tlbie)
+			spin_lock(&native_tlbie_lock);
+		tlbie(va, large);
+		if (lock_tlbie)
+			spin_unlock(&native_tlbie_lock);
+	}
+
+	return ret;
+}
+
+/*
+ * Update the page protection bits. Intended to be used to create
+ * guard pages for kernel data structures on pages which are bolted
+ * in the HPT. Assumes pages being operated on will not be stolen.
+ * Does not work on large pages.
+ *
+ * No need to lock here because we should be the only user.
+ */
+static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea)
+{
+	unsigned long vsid, va, vpn, flags = 0;
+	long slot;
+	hpte_t *hptep;
+	int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+
+	vsid = get_kernel_vsid(ea);
+	va = (vsid << 28) | (ea & 0x0fffffff);
+	vpn = va >> PAGE_SHIFT;
+
+	slot = native_hpte_find(vpn);
+	if (slot == -1)
+		panic("could not find page to bolt\n");
+	hptep = htab_address + slot;
+
+	set_pp_bit(newpp, hptep);
+
+	/* Ensure it is out of the tlb too */
+	if (lock_tlbie)
+		spin_lock_irqsave(&native_tlbie_lock, flags);
+	tlbie(va, 0);
+	if (lock_tlbie)
+		spin_unlock_irqrestore(&native_tlbie_lock, flags);
+}
+
+static void native_hpte_invalidate(unsigned long slot, unsigned long va,
+				    int large, int local)
+{
+	hpte_t *hptep = htab_address + slot;
+	unsigned long hpte_v;
+	unsigned long avpn = va >> 23;
+	unsigned long flags;
+	int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+
+	if (large)
+		avpn &= ~1;
+
+	local_irq_save(flags);
+	native_lock_hpte(hptep);
+
+	hpte_v = hptep->v;
+
+	/* Even if we miss, we need to invalidate the TLB */
+	if ((HPTE_V_AVPN_VAL(hpte_v) != avpn)
+	    || !(hpte_v & HPTE_V_VALID)) {
+		native_unlock_hpte(hptep);
+	} else {
+		/* Invalidate the hpte. NOTE: this also unlocks it */
+		hptep->v = 0;
+	}
+
+	/* Invalidate the tlb */
+	if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
+		tlbiel(va);
+	} else {
+		if (lock_tlbie)
+			spin_lock(&native_tlbie_lock);
+		tlbie(va, large);
+		if (lock_tlbie)
+			spin_unlock(&native_tlbie_lock);
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * clear all mappings on kexec.  All cpus are in real mode (or they will
+ * be when they isi), and we are the only one left.  We rely on our kernel
+ * mapping being 0xC0's and the hardware ignoring those two real bits.
+ *
+ * TODO: add batching support when enabled.  remember, no dynamic memory here,
+ * athough there is the control page available...
+ */
+static void native_hpte_clear(void)
+{
+	unsigned long slot, slots, flags;
+	hpte_t *hptep = htab_address;
+	unsigned long hpte_v;
+	unsigned long pteg_count;
+
+	pteg_count = htab_hash_mask + 1;
+
+	local_irq_save(flags);
+
+	/* we take the tlbie lock and hold it.  Some hardware will
+	 * deadlock if we try to tlbie from two processors at once.
+	 */
+	spin_lock(&native_tlbie_lock);
+
+	slots = pteg_count * HPTES_PER_GROUP;
+
+	for (slot = 0; slot < slots; slot++, hptep++) {
+		/*
+		 * we could lock the pte here, but we are the only cpu
+		 * running,  right?  and for crash dump, we probably
+		 * don't want to wait for a maybe bad cpu.
+		 */
+		hpte_v = hptep->v;
+
+		if (hpte_v & HPTE_V_VALID) {
+			hptep->v = 0;
+			tlbie(slot2va(hpte_v, slot), hpte_v & HPTE_V_LARGE);
+		}
+	}
+
+	spin_unlock(&native_tlbie_lock);
+	local_irq_restore(flags);
+}
+
+static void native_flush_hash_range(unsigned long number, int local)
+{
+	unsigned long va, vpn, hash, secondary, slot, flags, avpn;
+	int i, j;
+	hpte_t *hptep;
+	unsigned long hpte_v;
+	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	unsigned long large = batch->large;
+
+	local_irq_save(flags);
+
+	j = 0;
+	for (i = 0; i < number; i++) {
+		va = batch->vaddr[j];
+		if (large)
+			vpn = va >> HPAGE_SHIFT;
+		else
+			vpn = va >> PAGE_SHIFT;
+		hash = hpt_hash(vpn, large);
+		secondary = (pte_val(batch->pte[i]) & _PAGE_SECONDARY) >> 15;
+		if (secondary)
+			hash = ~hash;
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += (pte_val(batch->pte[i]) & _PAGE_GROUP_IX) >> 12;
+
+		hptep = htab_address + slot;
+
+		avpn = va >> 23;
+		if (large)
+			avpn &= ~0x1UL;
+
+		native_lock_hpte(hptep);
+
+		hpte_v = hptep->v;
+
+		/* Even if we miss, we need to invalidate the TLB */
+		if ((HPTE_V_AVPN_VAL(hpte_v) != avpn)
+		    || !(hpte_v & HPTE_V_VALID)) {
+			native_unlock_hpte(hptep);
+		} else {
+			/* Invalidate the hpte. NOTE: this also unlocks it */
+			hptep->v = 0;
+		}
+
+		j++;
+	}
+
+	if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
+		asm volatile("ptesync":::"memory");
+
+		for (i = 0; i < j; i++)
+			__tlbiel(batch->vaddr[i]);
+
+		asm volatile("ptesync":::"memory");
+	} else {
+		int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+
+		if (lock_tlbie)
+			spin_lock(&native_tlbie_lock);
+
+		asm volatile("ptesync":::"memory");
+
+		for (i = 0; i < j; i++)
+			__tlbie(batch->vaddr[i], large);
+
+		asm volatile("eieio; tlbsync; ptesync":::"memory");
+
+		if (lock_tlbie)
+			spin_unlock(&native_tlbie_lock);
+	}
+
+	local_irq_restore(flags);
+}
+
+#ifdef CONFIG_PPC_PSERIES
+/* Disable TLB batching on nighthawk */
+static inline int tlb_batching_enabled(void)
+{
+	struct device_node *root = of_find_node_by_path("/");
+	int enabled = 1;
+
+	if (root) {
+		const char *model = get_property(root, "model", NULL);
+		if (model && !strcmp(model, "IBM,9076-N81"))
+			enabled = 0;
+		of_node_put(root);
+	}
+
+	return enabled;
+}
+#else
+static inline int tlb_batching_enabled(void)
+{
+	return 1;
+}
+#endif
+
+void hpte_init_native(void)
+{
+	ppc_md.hpte_invalidate	= native_hpte_invalidate;
+	ppc_md.hpte_updatepp	= native_hpte_updatepp;
+	ppc_md.hpte_updateboltedpp = native_hpte_updateboltedpp;
+	ppc_md.hpte_insert	= native_hpte_insert;
+	ppc_md.hpte_remove	= native_hpte_remove;
+	ppc_md.hpte_clear_all	= native_hpte_clear;
+	if (tlb_batching_enabled())
+		ppc_md.flush_hash_range = native_flush_hash_range;
+	htab_finish_init();
+}
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
new file mode 100644
index 000000000000..35dd93eeaf4b
--- /dev/null
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -0,0 +1,438 @@
+/*
+ * PowerPC64 port by Mike Corrigan and Dave Engebretsen
+ *   {mikejc|engebret}@us.ibm.com
+ *
+ *    Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com>
+ *
+ * SMP scalability work:
+ *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ * 
+ *    Module name: htab.c
+ *
+ *    Description:
+ *      PowerPC Hashed Page Table functions
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/sysctl.h>
+#include <linux/ctype.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/signal.h>
+
+#include <asm/ppcdebug.h>
+#include <asm/processor.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/page.h>
+#include <asm/types.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/machdep.h>
+#include <asm/lmb.h>
+#include <asm/abs_addr.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+#include <asm/eeh.h>
+#include <asm/tlb.h>
+#include <asm/cacheflush.h>
+#include <asm/cputable.h>
+#include <asm/abs_addr.h>
+#include <asm/sections.h>
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+/*
+ * Note:  pte   --> Linux PTE
+ *        HPTE  --> PowerPC Hashed Page Table Entry
+ *
+ * Execution context:
+ *   htab_initialize is called with the MMU off (of course), but
+ *   the kernel has been copied down to zero so it can directly
+ *   reference global data.  At this point it is very difficult
+ *   to print debug info.
+ *
+ */
+
+#ifdef CONFIG_U3_DART
+extern unsigned long dart_tablebase;
+#endif /* CONFIG_U3_DART */
+
+hpte_t *htab_address;
+unsigned long htab_hash_mask;
+
+unsigned long _SDR1;
+
+#define KB (1024)
+#define MB (1024*KB)
+
+static inline void loop_forever(void)
+{
+	volatile unsigned long x = 1;
+	for(;x;x|=1)
+		;
+}
+
+static inline void create_pte_mapping(unsigned long start, unsigned long end,
+				      unsigned long mode, int large)
+{
+	unsigned long addr;
+	unsigned int step;
+	unsigned long tmp_mode;
+	unsigned long vflags;
+
+	if (large) {
+		step = 16*MB;
+		vflags = HPTE_V_BOLTED | HPTE_V_LARGE;
+	} else {
+		step = 4*KB;
+		vflags = HPTE_V_BOLTED;
+	}
+
+	for (addr = start; addr < end; addr += step) {
+		unsigned long vpn, hash, hpteg;
+		unsigned long vsid = get_kernel_vsid(addr);
+		unsigned long va = (vsid << 28) | (addr & 0xfffffff);
+		int ret = -1;
+
+		if (large)
+			vpn = va >> HPAGE_SHIFT;
+		else
+			vpn = va >> PAGE_SHIFT;
+
+
+		tmp_mode = mode;
+		
+		/* Make non-kernel text non-executable */
+		if (!in_kernel_text(addr))
+			tmp_mode = mode | HW_NO_EXEC;
+
+		hash = hpt_hash(vpn, large);
+
+		hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+
+#ifdef CONFIG_PPC_ISERIES
+		if (systemcfg->platform & PLATFORM_ISERIES_LPAR)
+			ret = iSeries_hpte_bolt_or_insert(hpteg, va,
+				virt_to_abs(addr) >> PAGE_SHIFT,
+				vflags, tmp_mode);
+		else
+#endif
+#ifdef CONFIG_PPC_PSERIES
+		if (systemcfg->platform & PLATFORM_LPAR)
+			ret = pSeries_lpar_hpte_insert(hpteg, va,
+				virt_to_abs(addr) >> PAGE_SHIFT,
+				vflags, tmp_mode);
+		else
+#endif
+#ifdef CONFIG_PPC_MULTIPLATFORM
+			ret = native_hpte_insert(hpteg, va,
+				virt_to_abs(addr) >> PAGE_SHIFT,
+				vflags, tmp_mode);
+#endif
+
+		if (ret == -1) {
+			ppc64_terminate_msg(0x20, "create_pte_mapping");
+			loop_forever();
+		}
+	}
+}
+
+void __init htab_initialize(void)
+{
+	unsigned long table, htab_size_bytes;
+	unsigned long pteg_count;
+	unsigned long mode_rw;
+	int i, use_largepages = 0;
+	unsigned long base = 0, size = 0;
+	extern unsigned long tce_alloc_start, tce_alloc_end;
+
+	DBG(" -> htab_initialize()\n");
+
+	/*
+	 * Calculate the required size of the htab.  We want the number of
+	 * PTEGs to equal one half the number of real pages.
+	 */ 
+	htab_size_bytes = 1UL << ppc64_pft_size;
+	pteg_count = htab_size_bytes >> 7;
+
+	/* For debug, make the HTAB 1/8 as big as it normally would be. */
+	ifppcdebug(PPCDBG_HTABSIZE) {
+		pteg_count >>= 3;
+		htab_size_bytes = pteg_count << 7;
+	}
+
+	htab_hash_mask = pteg_count - 1;
+
+	if (systemcfg->platform & PLATFORM_LPAR) {
+		/* Using a hypervisor which owns the htab */
+		htab_address = NULL;
+		_SDR1 = 0; 
+	} else {
+		/* Find storage for the HPT.  Must be contiguous in
+		 * the absolute address space.
+		 */
+		table = lmb_alloc(htab_size_bytes, htab_size_bytes);
+
+		DBG("Hash table allocated at %lx, size: %lx\n", table,
+		    htab_size_bytes);
+
+		if ( !table ) {
+			ppc64_terminate_msg(0x20, "hpt space");
+			loop_forever();
+		}
+		htab_address = abs_to_virt(table);
+
+		/* htab absolute addr + encoded htabsize */
+		_SDR1 = table + __ilog2(pteg_count) - 11;
+
+		/* Initialize the HPT with no entries */
+		memset((void *)table, 0, htab_size_bytes);
+	}
+
+	mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX;
+
+	/* On U3 based machines, we need to reserve the DART area and
+	 * _NOT_ map it to avoid cache paradoxes as it's remapped non
+	 * cacheable later on
+	 */
+	if (cpu_has_feature(CPU_FTR_16M_PAGE))
+		use_largepages = 1;
+
+	/* create bolted the linear mapping in the hash table */
+	for (i=0; i < lmb.memory.cnt; i++) {
+		base = lmb.memory.region[i].base + KERNELBASE;
+		size = lmb.memory.region[i].size;
+
+		DBG("creating mapping for region: %lx : %lx\n", base, size);
+
+#ifdef CONFIG_U3_DART
+		/* Do not map the DART space. Fortunately, it will be aligned
+		 * in such a way that it will not cross two lmb regions and will
+		 * fit within a single 16Mb page.
+		 * The DART space is assumed to be a full 16Mb region even if we
+		 * only use 2Mb of that space. We will use more of it later for
+		 * AGP GART. We have to use a full 16Mb large page.
+		 */
+		DBG("DART base: %lx\n", dart_tablebase);
+
+		if (dart_tablebase != 0 && dart_tablebase >= base
+		    && dart_tablebase < (base + size)) {
+			if (base != dart_tablebase)
+				create_pte_mapping(base, dart_tablebase, mode_rw,
+						   use_largepages);
+			if ((base + size) > (dart_tablebase + 16*MB))
+				create_pte_mapping(dart_tablebase + 16*MB, base + size,
+						   mode_rw, use_largepages);
+			continue;
+		}
+#endif /* CONFIG_U3_DART */
+		create_pte_mapping(base, base + size, mode_rw, use_largepages);
+	}
+
+	/*
+	 * If we have a memory_limit and we've allocated TCEs then we need to
+	 * explicitly map the TCE area at the top of RAM. We also cope with the
+	 * case that the TCEs start below memory_limit.
+	 * tce_alloc_start/end are 16MB aligned so the mapping should work
+	 * for either 4K or 16MB pages.
+	 */
+	if (tce_alloc_start) {
+		tce_alloc_start += KERNELBASE;
+		tce_alloc_end += KERNELBASE;
+
+		if (base + size >= tce_alloc_start)
+			tce_alloc_start = base + size + 1;
+
+		create_pte_mapping(tce_alloc_start, tce_alloc_end,
+			mode_rw, use_largepages);
+	}
+
+	DBG(" <- htab_initialize()\n");
+}
+#undef KB
+#undef MB
+
+/*
+ * Called by asm hashtable.S for doing lazy icache flush
+ */
+unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
+{
+	struct page *page;
+
+	if (!pfn_valid(pte_pfn(pte)))
+		return pp;
+
+	page = pte_page(pte);
+
+	/* page is dirty */
+	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
+		if (trap == 0x400) {
+			__flush_dcache_icache(page_address(page));
+			set_bit(PG_arch_1, &page->flags);
+		} else
+			pp |= HW_NO_EXEC;
+	}
+	return pp;
+}
+
+/* Result code is:
+ *  0 - handled
+ *  1 - normal page fault
+ * -1 - critical hash insertion error
+ */
+int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
+{
+	void *pgdir;
+	unsigned long vsid;
+	struct mm_struct *mm;
+	pte_t *ptep;
+	int ret;
+	int user_region = 0;
+	int local = 0;
+	cpumask_t tmp;
+
+	if ((ea & ~REGION_MASK) >= PGTABLE_RANGE)
+		return 1;
+
+ 	switch (REGION_ID(ea)) {
+	case USER_REGION_ID:
+		user_region = 1;
+		mm = current->mm;
+		if (! mm)
+			return 1;
+
+		vsid = get_vsid(mm->context.id, ea);
+		break;
+	case VMALLOC_REGION_ID:
+		mm = &init_mm;
+		vsid = get_kernel_vsid(ea);
+		break;
+#if 0
+	case KERNEL_REGION_ID:
+		/*
+		 * Should never get here - entire 0xC0... region is bolted.
+		 * Send the problem up to do_page_fault 
+		 */
+#endif
+	default:
+		/* Not a valid range
+		 * Send the problem up to do_page_fault 
+		 */
+		return 1;
+		break;
+	}
+
+	pgdir = mm->pgd;
+
+	if (pgdir == NULL)
+		return 1;
+
+	tmp = cpumask_of_cpu(smp_processor_id());
+	if (user_region && cpus_equal(mm->cpu_vm_mask, tmp))
+		local = 1;
+
+	/* Is this a huge page ? */
+	if (unlikely(in_hugepage_area(mm->context, ea)))
+		ret = hash_huge_page(mm, access, ea, vsid, local);
+	else {
+		ptep = find_linux_pte(pgdir, ea);
+		if (ptep == NULL)
+			return 1;
+		ret = __hash_page(ea, access, vsid, ptep, trap, local);
+	}
+
+	return ret;
+}
+
+void flush_hash_page(unsigned long va, pte_t pte, int local)
+{
+	unsigned long vpn, hash, secondary, slot;
+	unsigned long huge = pte_huge(pte);
+
+	if (huge)
+		vpn = va >> HPAGE_SHIFT;
+	else
+		vpn = va >> PAGE_SHIFT;
+	hash = hpt_hash(vpn, huge);
+	secondary = (pte_val(pte) & _PAGE_SECONDARY) >> 15;
+	if (secondary)
+		hash = ~hash;
+	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+	slot += (pte_val(pte) & _PAGE_GROUP_IX) >> 12;
+
+	ppc_md.hpte_invalidate(slot, va, huge, local);
+}
+
+void flush_hash_range(unsigned long number, int local)
+{
+	if (ppc_md.flush_hash_range) {
+		ppc_md.flush_hash_range(number, local);
+	} else {
+		int i;
+		struct ppc64_tlb_batch *batch =
+			&__get_cpu_var(ppc64_tlb_batch);
+
+		for (i = 0; i < number; i++)
+			flush_hash_page(batch->vaddr[i], batch->pte[i], local);
+	}
+}
+
+static inline void make_bl(unsigned int *insn_addr, void *func)
+{
+	unsigned long funcp = *((unsigned long *)func);
+	int offset = funcp - (unsigned long)insn_addr;
+
+	*insn_addr = (unsigned int)(0x48000001 | (offset & 0x03fffffc));
+	flush_icache_range((unsigned long)insn_addr, 4+
+			   (unsigned long)insn_addr);
+}
+
+/*
+ * low_hash_fault is called when we the low level hash code failed
+ * to instert a PTE due to an hypervisor error
+ */
+void low_hash_fault(struct pt_regs *regs, unsigned long address)
+{
+	if (user_mode(regs)) {
+		siginfo_t info;
+
+		info.si_signo = SIGBUS;
+		info.si_errno = 0;
+		info.si_code = BUS_ADRERR;
+		info.si_addr = (void __user *)address;
+		force_sig_info(SIGBUS, &info, current);
+		return;
+	}
+	bad_page_fault(regs, address, SIGBUS);
+}
+
+void __init htab_finish_init(void)
+{
+	extern unsigned int *htab_call_hpte_insert1;
+	extern unsigned int *htab_call_hpte_insert2;
+	extern unsigned int *htab_call_hpte_remove;
+	extern unsigned int *htab_call_hpte_updatepp;
+
+	make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert);
+	make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert);
+	make_bl(htab_call_hpte_remove, ppc_md.hpte_remove);
+	make_bl(htab_call_hpte_updatepp, ppc_md.hpte_updatepp);
+}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
new file mode 100644
index 000000000000..0ea0994ed974
--- /dev/null
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -0,0 +1,745 @@
+/*
+ * PPC64 (POWER4) Huge TLB Page Support for Kernel.
+ *
+ * Copyright (C) 2003 David Gibson, IBM Corporation.
+ *
+ * Based on the IA-32 version:
+ * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/sysctl.h>
+#include <asm/mman.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/machdep.h>
+#include <asm/cputable.h>
+#include <asm/tlb.h>
+
+#include <linux/sysctl.h>
+
+#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
+#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
+
+/* Modelled after find_linux_pte() */
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	pte_t *pt;
+
+	BUG_ON(! in_hugepage_area(mm->context, addr));
+
+	addr &= HPAGE_MASK;
+
+	pg = pgd_offset(mm, addr);
+	if (!pgd_none(*pg)) {
+		pu = pud_offset(pg, addr);
+		if (!pud_none(*pu)) {
+			pm = pmd_offset(pu, addr);
+			pt = (pte_t *)pm;
+			BUG_ON(!pmd_none(*pm)
+			       && !(pte_present(*pt) && pte_huge(*pt)));
+			return pt;
+		}
+	}
+
+	return NULL;
+}
+
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	pte_t *pt;
+
+	BUG_ON(! in_hugepage_area(mm->context, addr));
+
+	addr &= HPAGE_MASK;
+
+	pg = pgd_offset(mm, addr);
+	pu = pud_alloc(mm, pg, addr);
+
+	if (pu) {
+		pm = pmd_alloc(mm, pu, addr);
+		if (pm) {
+			pt = (pte_t *)pm;
+			BUG_ON(!pmd_none(*pm)
+			       && !(pte_present(*pt) && pte_huge(*pt)));
+			return pt;
+		}
+	}
+
+	return NULL;
+}
+
+#define HUGEPTE_BATCH_SIZE	(HPAGE_SIZE / PMD_SIZE)
+
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t pte)
+{
+	int i;
+
+	if (pte_present(*ptep)) {
+		pte_clear(mm, addr, ptep);
+		flush_tlb_pending();
+	}
+
+	for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) {
+		*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
+		ptep++;
+	}
+}
+
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+			      pte_t *ptep)
+{
+	unsigned long old = pte_update(ptep, ~0UL);
+	int i;
+
+	if (old & _PAGE_HASHPTE)
+		hpte_update(mm, addr, old, 0);
+
+	for (i = 1; i < HUGEPTE_BATCH_SIZE; i++)
+		ptep[i] = __pte(0);
+
+	return __pte(old);
+}
+
+/*
+ * This function checks for proper alignment of input addr and len parameters.
+ */
+int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
+{
+	if (len & ~HPAGE_MASK)
+		return -EINVAL;
+	if (addr & ~HPAGE_MASK)
+		return -EINVAL;
+	if (! (within_hugepage_low_range(addr, len)
+	       || within_hugepage_high_range(addr, len)) )
+		return -EINVAL;
+	return 0;
+}
+
+static void flush_low_segments(void *parm)
+{
+	u16 areas = (unsigned long) parm;
+	unsigned long i;
+
+	asm volatile("isync" : : : "memory");
+
+	BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS);
+
+	for (i = 0; i < NUM_LOW_AREAS; i++) {
+		if (! (areas & (1U << i)))
+			continue;
+		asm volatile("slbie %0"
+			     : : "r" ((i << SID_SHIFT) | SLBIE_C));
+	}
+
+	asm volatile("isync" : : : "memory");
+}
+
+static void flush_high_segments(void *parm)
+{
+	u16 areas = (unsigned long) parm;
+	unsigned long i, j;
+
+	asm volatile("isync" : : : "memory");
+
+	BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS);
+
+	for (i = 0; i < NUM_HIGH_AREAS; i++) {
+		if (! (areas & (1U << i)))
+			continue;
+		for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
+			asm volatile("slbie %0"
+				     :: "r" (((i << HTLB_AREA_SHIFT)
+					     + (j << SID_SHIFT)) | SLBIE_C));
+	}
+
+	asm volatile("isync" : : : "memory");
+}
+
+static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
+{
+	unsigned long start = area << SID_SHIFT;
+	unsigned long end = (area+1) << SID_SHIFT;
+	struct vm_area_struct *vma;
+
+	BUG_ON(area >= NUM_LOW_AREAS);
+
+	/* Check no VMAs are in the region */
+	vma = find_vma(mm, start);
+	if (vma && (vma->vm_start < end))
+		return -EBUSY;
+
+	return 0;
+}
+
+static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
+{
+	unsigned long start = area << HTLB_AREA_SHIFT;
+	unsigned long end = (area+1) << HTLB_AREA_SHIFT;
+	struct vm_area_struct *vma;
+
+	BUG_ON(area >= NUM_HIGH_AREAS);
+
+	/* Check no VMAs are in the region */
+	vma = find_vma(mm, start);
+	if (vma && (vma->vm_start < end))
+		return -EBUSY;
+
+	return 0;
+}
+
+static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
+{
+	unsigned long i;
+
+	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
+	BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);
+
+	newareas &= ~(mm->context.low_htlb_areas);
+	if (! newareas)
+		return 0; /* The segments we want are already open */
+
+	for (i = 0; i < NUM_LOW_AREAS; i++)
+		if ((1 << i) & newareas)
+			if (prepare_low_area_for_htlb(mm, i) != 0)
+				return -EBUSY;
+
+	mm->context.low_htlb_areas |= newareas;
+
+	/* update the paca copy of the context struct */
+	get_paca()->context = mm->context;
+
+	/* the context change must make it to memory before the flush,
+	 * so that further SLB misses do the right thing. */
+	mb();
+	on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1);
+
+	return 0;
+}
+
+static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
+{
+	unsigned long i;
+
+	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
+	BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
+		     != NUM_HIGH_AREAS);
+
+	newareas &= ~(mm->context.high_htlb_areas);
+	if (! newareas)
+		return 0; /* The areas we want are already open */
+
+	for (i = 0; i < NUM_HIGH_AREAS; i++)
+		if ((1 << i) & newareas)
+			if (prepare_high_area_for_htlb(mm, i) != 0)
+				return -EBUSY;
+
+	mm->context.high_htlb_areas |= newareas;
+
+	/* update the paca copy of the context struct */
+	get_paca()->context = mm->context;
+
+	/* the context change must make it to memory before the flush,
+	 * so that further SLB misses do the right thing. */
+	mb();
+	on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1);
+
+	return 0;
+}
+
+int prepare_hugepage_range(unsigned long addr, unsigned long len)
+{
+	int err;
+
+	if ( (addr+len) < addr )
+		return -EINVAL;
+
+	if ((addr + len) < 0x100000000UL)
+		err = open_low_hpage_areas(current->mm,
+					  LOW_ESID_MASK(addr, len));
+	else
+		err = open_high_hpage_areas(current->mm,
+					    HTLB_AREA_MASK(addr, len));
+	if (err) {
+		printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
+		       " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
+		       addr, len,
+		       LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
+		return err;
+	}
+
+	return 0;
+}
+
+struct page *
+follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
+{
+	pte_t *ptep;
+	struct page *page;
+
+	if (! in_hugepage_area(mm->context, address))
+		return ERR_PTR(-EINVAL);
+
+	ptep = huge_pte_offset(mm, address);
+	page = pte_page(*ptep);
+	if (page)
+		page += (address % HPAGE_SIZE) / PAGE_SIZE;
+
+	return page;
+}
+
+int pmd_huge(pmd_t pmd)
+{
+	return 0;
+}
+
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+		pmd_t *pmd, int write)
+{
+	BUG();
+	return NULL;
+}
+
+/* Because we have an exclusive hugepage region which lies within the
+ * normal user address space, we have to take special measures to make
+ * non-huge mmap()s evade the hugepage reserved regions. */
+unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
+				     unsigned long len, unsigned long pgoff,
+				     unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long start_addr;
+
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	if (addr) {
+		addr = PAGE_ALIGN(addr);
+		vma = find_vma(mm, addr);
+		if (((TASK_SIZE - len) >= addr)
+		    && (!vma || (addr+len) <= vma->vm_start)
+		    && !is_hugepage_only_range(mm, addr,len))
+			return addr;
+	}
+	if (len > mm->cached_hole_size) {
+	        start_addr = addr = mm->free_area_cache;
+	} else {
+	        start_addr = addr = TASK_UNMAPPED_BASE;
+	        mm->cached_hole_size = 0;
+	}
+
+full_search:
+	vma = find_vma(mm, addr);
+	while (TASK_SIZE - len >= addr) {
+		BUG_ON(vma && (addr >= vma->vm_end));
+
+		if (touches_hugepage_low_range(mm, addr, len)) {
+			addr = ALIGN(addr+1, 1<<SID_SHIFT);
+			vma = find_vma(mm, addr);
+			continue;
+		}
+		if (touches_hugepage_high_range(mm, addr, len)) {
+			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
+			vma = find_vma(mm, addr);
+			continue;
+		}
+		if (!vma || addr + len <= vma->vm_start) {
+			/*
+			 * Remember the place where we stopped the search:
+			 */
+			mm->free_area_cache = addr + len;
+			return addr;
+		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
+		addr = vma->vm_end;
+		vma = vma->vm_next;
+	}
+
+	/* Make sure we didn't miss any holes */
+	if (start_addr != TASK_UNMAPPED_BASE) {
+		start_addr = addr = TASK_UNMAPPED_BASE;
+		mm->cached_hole_size = 0;
+		goto full_search;
+	}
+	return -ENOMEM;
+}
+
+/*
+ * This mmap-allocator allocates new areas top-down from below the
+ * stack's low limit (the base):
+ *
+ * Because we have an exclusive hugepage region which lies within the
+ * normal user address space, we have to take special measures to make
+ * non-huge mmap()s evade the hugepage reserved regions.
+ */
+unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
+			  const unsigned long len, const unsigned long pgoff,
+			  const unsigned long flags)
+{
+	struct vm_area_struct *vma, *prev_vma;
+	struct mm_struct *mm = current->mm;
+	unsigned long base = mm->mmap_base, addr = addr0;
+	unsigned long largest_hole = mm->cached_hole_size;
+	int first_time = 1;
+
+	/* requested length too big for entire address space */
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	/* dont allow allocations above current base */
+	if (mm->free_area_cache > base)
+		mm->free_area_cache = base;
+
+	/* requesting a specific address */
+	if (addr) {
+		addr = PAGE_ALIGN(addr);
+		vma = find_vma(mm, addr);
+		if (TASK_SIZE - len >= addr &&
+				(!vma || addr + len <= vma->vm_start)
+				&& !is_hugepage_only_range(mm, addr,len))
+			return addr;
+	}
+
+	if (len <= largest_hole) {
+	        largest_hole = 0;
+		mm->free_area_cache = base;
+	}
+try_again:
+	/* make sure it can fit in the remaining address space */
+	if (mm->free_area_cache < len)
+		goto fail;
+
+	/* either no address requested or cant fit in requested address hole */
+	addr = (mm->free_area_cache - len) & PAGE_MASK;
+	do {
+hugepage_recheck:
+		if (touches_hugepage_low_range(mm, addr, len)) {
+			addr = (addr & ((~0) << SID_SHIFT)) - len;
+			goto hugepage_recheck;
+		} else if (touches_hugepage_high_range(mm, addr, len)) {
+			addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
+			goto hugepage_recheck;
+		}
+
+		/*
+		 * Lookup failure means no vma is above this address,
+		 * i.e. return with success:
+		 */
+ 	 	if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
+			return addr;
+
+		/*
+		 * new region fits between prev_vma->vm_end and
+		 * vma->vm_start, use it:
+		 */
+		if (addr+len <= vma->vm_start &&
+		          (!prev_vma || (addr >= prev_vma->vm_end))) {
+			/* remember the address as a hint for next time */
+		        mm->cached_hole_size = largest_hole;
+		        return (mm->free_area_cache = addr);
+		} else {
+			/* pull free_area_cache down to the first hole */
+		        if (mm->free_area_cache == vma->vm_end) {
+				mm->free_area_cache = vma->vm_start;
+				mm->cached_hole_size = largest_hole;
+			}
+		}
+
+		/* remember the largest hole we saw so far */
+		if (addr + largest_hole < vma->vm_start)
+		        largest_hole = vma->vm_start - addr;
+
+		/* try just below the current vma->vm_start */
+		addr = vma->vm_start-len;
+	} while (len <= vma->vm_start);
+
+fail:
+	/*
+	 * if hint left us with no space for the requested
+	 * mapping then try again:
+	 */
+	if (first_time) {
+		mm->free_area_cache = base;
+		largest_hole = 0;
+		first_time = 0;
+		goto try_again;
+	}
+	/*
+	 * A failed mmap() very likely causes application failure,
+	 * so fall back to the bottom-up function here. This scenario
+	 * can happen with large stack limits and large mmap()
+	 * allocations.
+	 */
+	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
+	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
+	/*
+	 * Restore the topdown base:
+	 */
+	mm->free_area_cache = base;
+	mm->cached_hole_size = ~0UL;
+
+	return addr;
+}
+
+static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
+{
+	unsigned long addr = 0;
+	struct vm_area_struct *vma;
+
+	vma = find_vma(current->mm, addr);
+	while (addr + len <= 0x100000000UL) {
+		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
+
+		if (! __within_hugepage_low_range(addr, len, segmask)) {
+			addr = ALIGN(addr+1, 1<<SID_SHIFT);
+			vma = find_vma(current->mm, addr);
+			continue;
+		}
+
+		if (!vma || (addr + len) <= vma->vm_start)
+			return addr;
+		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
+		/* Depending on segmask this might not be a confirmed
+		 * hugepage region, so the ALIGN could have skipped
+		 * some VMAs */
+		vma = find_vma(current->mm, addr);
+	}
+
+	return -ENOMEM;
+}
+
+static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
+{
+	unsigned long addr = 0x100000000UL;
+	struct vm_area_struct *vma;
+
+	vma = find_vma(current->mm, addr);
+	while (addr + len <= TASK_SIZE_USER64) {
+		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
+
+		if (! __within_hugepage_high_range(addr, len, areamask)) {
+			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
+			vma = find_vma(current->mm, addr);
+			continue;
+		}
+
+		if (!vma || (addr + len) <= vma->vm_start)
+			return addr;
+		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
+		/* Depending on segmask this might not be a confirmed
+		 * hugepage region, so the ALIGN could have skipped
+		 * some VMAs */
+		vma = find_vma(current->mm, addr);
+	}
+
+	return -ENOMEM;
+}
+
+unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+					unsigned long len, unsigned long pgoff,
+					unsigned long flags)
+{
+	int lastshift;
+	u16 areamask, curareas;
+
+	if (len & ~HPAGE_MASK)
+		return -EINVAL;
+
+	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
+		return -EINVAL;
+
+	if (test_thread_flag(TIF_32BIT)) {
+		curareas = current->mm->context.low_htlb_areas;
+
+		/* First see if we can do the mapping in the existing
+		 * low areas */
+		addr = htlb_get_low_area(len, curareas);
+		if (addr != -ENOMEM)
+			return addr;
+
+		lastshift = 0;
+		for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
+		     ! lastshift; areamask >>=1) {
+			if (areamask & 1)
+				lastshift = 1;
+
+			addr = htlb_get_low_area(len, curareas | areamask);
+			if ((addr != -ENOMEM)
+			    && open_low_hpage_areas(current->mm, areamask) == 0)
+				return addr;
+		}
+	} else {
+		curareas = current->mm->context.high_htlb_areas;
+
+		/* First see if we can do the mapping in the existing
+		 * high areas */
+		addr = htlb_get_high_area(len, curareas);
+		if (addr != -ENOMEM)
+			return addr;
+
+		lastshift = 0;
+		for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
+		     ! lastshift; areamask >>=1) {
+			if (areamask & 1)
+				lastshift = 1;
+
+			addr = htlb_get_high_area(len, curareas | areamask);
+			if ((addr != -ENOMEM)
+			    && open_high_hpage_areas(current->mm, areamask) == 0)
+				return addr;
+		}
+	}
+	printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
+	       " enough areas\n");
+	return -ENOMEM;
+}
+
+int hash_huge_page(struct mm_struct *mm, unsigned long access,
+		   unsigned long ea, unsigned long vsid, int local)
+{
+	pte_t *ptep;
+	unsigned long va, vpn;
+	pte_t old_pte, new_pte;
+	unsigned long rflags, prpn;
+	long slot;
+	int err = 1;
+
+	spin_lock(&mm->page_table_lock);
+
+	ptep = huge_pte_offset(mm, ea);
+
+	/* Search the Linux page table for a match with va */
+	va = (vsid << 28) | (ea & 0x0fffffff);
+	vpn = va >> HPAGE_SHIFT;
+
+	/*
+	 * If no pte found or not present, send the problem up to
+	 * do_page_fault
+	 */
+	if (unlikely(!ptep || pte_none(*ptep)))
+		goto out;
+
+/* 	BUG_ON(pte_bad(*ptep)); */
+
+	/* 
+	 * Check the user's access rights to the page.  If access should be
+	 * prevented then send the problem up to do_page_fault.
+	 */
+	if (unlikely(access & ~pte_val(*ptep)))
+		goto out;
+	/*
+	 * At this point, we have a pte (old_pte) which can be used to build
+	 * or update an HPTE. There are 2 cases:
+	 *
+	 * 1. There is a valid (present) pte with no associated HPTE (this is 
+	 *	the most common case)
+	 * 2. There is a valid (present) pte with an associated HPTE. The
+	 *	current values of the pp bits in the HPTE prevent access
+	 *	because we are doing software DIRTY bit management and the
+	 *	page is currently not DIRTY. 
+	 */
+
+
+	old_pte = *ptep;
+	new_pte = old_pte;
+
+	rflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW));
+ 	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
+	rflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC);
+
+	/* Check if pte already has an hpte (case 2) */
+	if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
+		/* There MIGHT be an HPTE for this pte */
+		unsigned long hash, slot;
+
+		hash = hpt_hash(vpn, 1);
+		if (pte_val(old_pte) & _PAGE_SECONDARY)
+			hash = ~hash;
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;
+
+		if (ppc_md.hpte_updatepp(slot, rflags, va, 1, local) == -1)
+			pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
+	}
+
+	if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) {
+		unsigned long hash = hpt_hash(vpn, 1);
+		unsigned long hpte_group;
+
+		prpn = pte_pfn(old_pte);
+
+repeat:
+		hpte_group = ((hash & htab_hash_mask) *
+			      HPTES_PER_GROUP) & ~0x7UL;
+
+		/* Update the linux pte with the HPTE slot */
+		pte_val(new_pte) &= ~_PAGE_HPTEFLAGS;
+		pte_val(new_pte) |= _PAGE_HASHPTE;
+
+		/* Add in WIMG bits */
+		/* XXX We should store these in the pte */
+		rflags |= _PAGE_COHERENT;
+
+		slot = ppc_md.hpte_insert(hpte_group, va, prpn,
+					  HPTE_V_LARGE, rflags);
+
+		/* Primary is full, try the secondary */
+		if (unlikely(slot == -1)) {
+			pte_val(new_pte) |= _PAGE_SECONDARY;
+			hpte_group = ((~hash & htab_hash_mask) *
+				      HPTES_PER_GROUP) & ~0x7UL; 
+			slot = ppc_md.hpte_insert(hpte_group, va, prpn,
+						  HPTE_V_LARGE |
+						  HPTE_V_SECONDARY,
+						  rflags);
+			if (slot == -1) {
+				if (mftb() & 0x1)
+					hpte_group = ((hash & htab_hash_mask) *
+						      HPTES_PER_GROUP)&~0x7UL;
+
+				ppc_md.hpte_remove(hpte_group);
+				goto repeat;
+                        }
+		}
+
+		if (unlikely(slot == -2))
+			panic("hash_huge_page: pte_insert failed\n");
+
+		pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX;
+
+		/* 
+		 * No need to use ldarx/stdcx here because all who
+		 * might be updating the pte will hold the
+		 * page_table_lock
+		 */
+		*ptep = new_pte;
+	}
+
+	err = 0;
+
+ out:
+	spin_unlock(&mm->page_table_lock);
+
+	return err;
+}
diff --git a/arch/powerpc/mm/imalloc.c b/arch/powerpc/mm/imalloc.c
new file mode 100644
index 000000000000..c65b87b92756
--- /dev/null
+++ b/arch/powerpc/mm/imalloc.c
@@ -0,0 +1,317 @@
+/*
+ * c 2001 PPC 64 Team, IBM Corp
+ * 
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/semaphore.h>
+#include <asm/imalloc.h>
+#include <asm/cacheflush.h>
+
+static DECLARE_MUTEX(imlist_sem);
+struct vm_struct * imlist = NULL;
+
+static int get_free_im_addr(unsigned long size, unsigned long *im_addr)
+{
+	unsigned long addr;
+	struct vm_struct **p, *tmp;
+
+	addr = ioremap_bot;
+	for (p = &imlist; (tmp = *p) ; p = &tmp->next) {
+		if (size + addr < (unsigned long) tmp->addr)
+			break;
+		if ((unsigned long)tmp->addr >= ioremap_bot)
+			addr = tmp->size + (unsigned long) tmp->addr;
+		if (addr >= IMALLOC_END-size)
+			return 1;
+	}
+	*im_addr = addr;
+
+	return 0;
+}
+
+/* Return whether the region described by v_addr and size is a subset
+ * of the region described by parent
+ */
+static inline int im_region_is_subset(unsigned long v_addr, unsigned long size,
+			struct vm_struct *parent)
+{
+	return (int) (v_addr >= (unsigned long) parent->addr &&
+	              v_addr < (unsigned long) parent->addr + parent->size &&
+	    	      size < parent->size);
+}
+
+/* Return whether the region described by v_addr and size is a superset
+ * of the region described by child
+ */
+static int im_region_is_superset(unsigned long v_addr, unsigned long size,
+		struct vm_struct *child)
+{
+	struct vm_struct parent;
+
+	parent.addr = (void *) v_addr;
+	parent.size = size;
+
+	return im_region_is_subset((unsigned long) child->addr, child->size,
+			&parent);
+}
+
+/* Return whether the region described by v_addr and size overlaps
+ * the region described by vm.  Overlapping regions meet the
+ * following conditions:
+ * 1) The regions share some part of the address space
+ * 2) The regions aren't identical
+ * 3) Neither region is a subset of the other
+ */
+static int im_region_overlaps(unsigned long v_addr, unsigned long size,
+		     struct vm_struct *vm)
+{
+	if (im_region_is_superset(v_addr, size, vm))
+		return 0;
+
+	return (v_addr + size > (unsigned long) vm->addr + vm->size &&
+		v_addr < (unsigned long) vm->addr + vm->size) ||
+	       (v_addr < (unsigned long) vm->addr &&
+		v_addr + size > (unsigned long) vm->addr);
+}
+
+/* Determine imalloc status of region described by v_addr and size.
+ * Can return one of the following:
+ * IM_REGION_UNUSED   -  Entire region is unallocated in imalloc space.
+ * IM_REGION_SUBSET -    Region is a subset of a region that is already
+ * 			 allocated in imalloc space.
+ * 		         vm will be assigned to a ptr to the parent region.
+ * IM_REGION_EXISTS -    Exact region already allocated in imalloc space.
+ *                       vm will be assigned to a ptr to the existing imlist
+ *                       member.
+ * IM_REGION_OVERLAPS -  Region overlaps an allocated region in imalloc space.
+ * IM_REGION_SUPERSET -  Region is a superset of a region that is already
+ *                       allocated in imalloc space.
+ */
+static int im_region_status(unsigned long v_addr, unsigned long size,
+		    struct vm_struct **vm)
+{
+	struct vm_struct *tmp;
+
+	for (tmp = imlist; tmp; tmp = tmp->next)
+		if (v_addr < (unsigned long) tmp->addr + tmp->size)
+			break;
+
+	if (tmp) {
+		if (im_region_overlaps(v_addr, size, tmp))
+			return IM_REGION_OVERLAP;
+
+		*vm = tmp;
+		if (im_region_is_subset(v_addr, size, tmp)) {
+			/* Return with tmp pointing to superset */
+			return IM_REGION_SUBSET;
+		}
+		if (im_region_is_superset(v_addr, size, tmp)) {
+			/* Return with tmp pointing to first subset */
+			return IM_REGION_SUPERSET;
+		}
+		else if (v_addr == (unsigned long) tmp->addr &&
+		 	 size == tmp->size) {
+			/* Return with tmp pointing to exact region */
+			return IM_REGION_EXISTS;
+		}
+	}
+
+	*vm = NULL;
+	return IM_REGION_UNUSED;
+}
+
+static struct vm_struct * split_im_region(unsigned long v_addr, 
+		unsigned long size, struct vm_struct *parent)
+{
+	struct vm_struct *vm1 = NULL;
+	struct vm_struct *vm2 = NULL;
+	struct vm_struct *new_vm = NULL;
+	
+	vm1 = (struct vm_struct *) kmalloc(sizeof(*vm1), GFP_KERNEL);
+	if (vm1	== NULL) {
+		printk(KERN_ERR "%s() out of memory\n", __FUNCTION__);
+		return NULL;
+	}
+
+	if (v_addr == (unsigned long) parent->addr) {
+	        /* Use existing parent vm_struct to represent child, allocate
+		 * new one for the remainder of parent range
+		 */
+		vm1->size = parent->size - size;
+		vm1->addr = (void *) (v_addr + size);
+		vm1->next = parent->next;
+
+		parent->size = size;
+		parent->next = vm1;
+		new_vm = parent;
+	} else if (v_addr + size == (unsigned long) parent->addr + 
+			parent->size) {
+		/* Allocate new vm_struct to represent child, use existing
+		 * parent one for remainder of parent range
+		 */
+		vm1->size = size;
+		vm1->addr = (void *) v_addr;
+		vm1->next = parent->next;
+		new_vm = vm1;
+
+		parent->size -= size;
+		parent->next = vm1;
+	} else {
+	        /* Allocate two new vm_structs for the new child and 
+		 * uppermost remainder, and use existing parent one for the
+		 * lower remainder of parent range
+		 */
+		vm2 = (struct vm_struct *) kmalloc(sizeof(*vm2), GFP_KERNEL);
+		if (vm2 == NULL) {
+			printk(KERN_ERR "%s() out of memory\n", __FUNCTION__);
+			kfree(vm1);
+			return NULL;
+		}
+
+		vm1->size = size;
+		vm1->addr = (void *) v_addr;
+		vm1->next = vm2;
+		new_vm = vm1;
+
+		vm2->size = ((unsigned long) parent->addr + parent->size) - 
+				(v_addr + size);
+		vm2->addr = (void *) v_addr + size;
+		vm2->next = parent->next;
+
+		parent->size = v_addr - (unsigned long) parent->addr;
+		parent->next = vm1;
+	}
+
+	return new_vm;
+}
+
+static struct vm_struct * __add_new_im_area(unsigned long req_addr, 
+					    unsigned long size)
+{
+	struct vm_struct **p, *tmp, *area;
+		
+	for (p = &imlist; (tmp = *p) ; p = &tmp->next) {
+		if (req_addr + size <= (unsigned long)tmp->addr)
+			break;
+	}
+	
+	area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL);
+	if (!area)
+		return NULL;
+	area->flags = 0;
+	area->addr = (void *)req_addr;
+	area->size = size;
+	area->next = *p;
+	*p = area;
+
+	return area;
+}
+
+static struct vm_struct * __im_get_area(unsigned long req_addr, 
+					unsigned long size,
+					int criteria)
+{
+	struct vm_struct *tmp;
+	int status;
+
+	status = im_region_status(req_addr, size, &tmp);
+	if ((criteria & status) == 0) {
+		return NULL;
+	}
+	
+	switch (status) {
+	case IM_REGION_UNUSED:
+		tmp = __add_new_im_area(req_addr, size);
+		break;
+	case IM_REGION_SUBSET:
+		tmp = split_im_region(req_addr, size, tmp);
+		break;
+	case IM_REGION_EXISTS:
+		/* Return requested region */
+		break;
+	case IM_REGION_SUPERSET:
+		/* Return first existing subset of requested region */
+		break;
+	default:
+		printk(KERN_ERR "%s() unexpected imalloc region status\n",
+				__FUNCTION__);
+		tmp = NULL;
+	}
+
+	return tmp;
+}
+
+struct vm_struct * im_get_free_area(unsigned long size)
+{
+	struct vm_struct *area;
+	unsigned long addr;
+	
+	down(&imlist_sem);
+	if (get_free_im_addr(size, &addr)) {
+		printk(KERN_ERR "%s() cannot obtain addr for size 0x%lx\n",
+				__FUNCTION__, size);
+		area = NULL;
+		goto next_im_done;
+	}
+
+	area = __im_get_area(addr, size, IM_REGION_UNUSED);
+	if (area == NULL) {
+		printk(KERN_ERR 
+		       "%s() cannot obtain area for addr 0x%lx size 0x%lx\n",
+			__FUNCTION__, addr, size);
+	}
+next_im_done:
+	up(&imlist_sem);
+	return area;
+}
+
+struct vm_struct * im_get_area(unsigned long v_addr, unsigned long size,
+		int criteria)
+{
+	struct vm_struct *area;
+
+	down(&imlist_sem);
+	area = __im_get_area(v_addr, size, criteria);
+	up(&imlist_sem);
+	return area;
+}
+
+void im_free(void * addr)
+{
+	struct vm_struct **p, *tmp;
+  
+	if (!addr)
+		return;
+	if ((unsigned long) addr & ~PAGE_MASK) {
+		printk(KERN_ERR "Trying to %s bad address (%p)\n", __FUNCTION__,			addr);
+		return;
+	}
+	down(&imlist_sem);
+	for (p = &imlist ; (tmp = *p) ; p = &tmp->next) {
+		if (tmp->addr == addr) {
+			*p = tmp->next;
+
+			/* XXX: do we need the lock? */
+			spin_lock(&init_mm.page_table_lock);
+			unmap_vm_area(tmp);
+			spin_unlock(&init_mm.page_table_lock);
+
+			kfree(tmp);
+			up(&imlist_sem);
+			return;
+		}
+	}
+	up(&imlist_sem);
+	printk(KERN_ERR "Trying to %s nonexistent area (%p)\n", __FUNCTION__,
+			addr);
+}
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index c0ce6a7af3c7..b0fc822ec29f 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -73,18 +73,8 @@
 #warning TASK_SIZE is smaller than it needs to be.
 #endif
 
-int mem_init_done;
-unsigned long ioremap_bot = IMALLOC_BASE;
-static unsigned long phbs_io_bot = PHBS_IO_BASE;
-
-extern pgd_t swapper_pg_dir[];
-extern struct task_struct *current_set[NR_CPUS];
-
 unsigned long klimit = (unsigned long)_end;
 
-unsigned long _SDR1=0;
-unsigned long _ASR=0;
-
 /* max amount of RAM to use */
 unsigned long __max_memory;
 
@@ -193,19 +183,6 @@ static int __init setup_kcore(void)
 }
 module_init(setup_kcore);
 
-void __iomem * reserve_phb_iospace(unsigned long size)
-{
-	void __iomem *virt_addr;
-		
-	if (phbs_io_bot >= IMALLOC_BASE) 
-		panic("reserve_phb_iospace(): phb io space overflow\n");
-			
-	virt_addr = (void __iomem *) phbs_io_bot;
-	phbs_io_bot += size;
-
-	return virt_addr;
-}
-
 static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
 {
 	memset(addr, 0, kmem_cache_size(cache));
@@ -244,16 +221,3 @@ void pgtable_cache_init(void)
 			      name);
 	}
 }
-
-pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
-			      unsigned long size, pgprot_t vma_prot)
-{
-	if (ppc_md.phys_mem_access_prot)
-		return ppc_md.phys_mem_access_prot(file, addr, size, vma_prot);
-
-	if (!page_is_ram(addr >> PAGE_SHIFT))
-		vma_prot = __pgprot(pgprot_val(vma_prot)
-				    | _PAGE_GUARDED | _PAGE_NO_CACHE);
-	return vma_prot;
-}
-EXPORT_SYMBOL(phys_mem_access_prot);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 0650de74d0b3..55b5860ed3c9 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -47,6 +47,9 @@
 #include <asm/prom.h>
 #include <asm/lmb.h>
 #include <asm/sections.h>
+#ifdef CONFIG_PPC64
+#include <asm/vdso.h>
+#endif
 
 #include "mmu_decl.h"
 
@@ -334,7 +337,7 @@ void flush_dcache_icache_page(struct page *page)
 	void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE);
 	__flush_dcache_icache(start);
 	kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
-#elif defined(CONFIG_8xx)
+#elif defined(CONFIG_8xx) || defined(CONFIG_PPC64)
 	/* On 8xx there is no need to kmap since highmem is not supported */
 	__flush_dcache_icache(page_address(page)); 
 #else
@@ -463,18 +466,18 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 	if (pgdir == NULL)
 		return;
 
-	ptep = find_linux_pte(pgdir, ea);
+	ptep = find_linux_pte(pgdir, address);
 	if (!ptep)
 		return;
 
-	vsid = get_vsid(vma->vm_mm->context.id, ea);
+	vsid = get_vsid(vma->vm_mm->context.id, address);
 
 	local_irq_save(flags);
 	tmp = cpumask_of_cpu(smp_processor_id());
 	if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp))
 		local = 1;
 
-	__hash_page(ea, pte_val(pte) & (_PAGE_USER|_PAGE_RW), vsid, ptep,
+	__hash_page(address, pte_val(pte) & (_PAGE_USER|_PAGE_RW), vsid, ptep,
 		    0x300, local);
 	local_irq_restore(flags);
 #endif
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
new file mode 100644
index 000000000000..fe65f522aff3
--- /dev/null
+++ b/arch/powerpc/mm/mmap.c
@@ -0,0 +1,86 @@
+/*
+ *  linux/arch/ppc64/mm/mmap.c
+ *
+ *  flexible mmap layout support
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *
+ * Started by Ingo Molnar <mingo@elte.hu>
+ */
+
+#include <linux/personality.h>
+#include <linux/mm.h>
+
+/*
+ * Top of mmap area (just below the process stack).
+ *
+ * Leave an at least ~128 MB hole.
+ */
+#define MIN_GAP (128*1024*1024)
+#define MAX_GAP (TASK_SIZE/6*5)
+
+static inline unsigned long mmap_base(void)
+{
+	unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
+
+	if (gap < MIN_GAP)
+		gap = MIN_GAP;
+	else if (gap > MAX_GAP)
+		gap = MAX_GAP;
+
+	return TASK_SIZE - (gap & PAGE_MASK);
+}
+
+static inline int mmap_is_legacy(void)
+{
+	/*
+	 * Force standard allocation for 64 bit programs.
+	 */
+	if (!test_thread_flag(TIF_32BIT))
+		return 1;
+
+	if (current->personality & ADDR_COMPAT_LAYOUT)
+		return 1;
+
+	if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)
+		return 1;
+
+	return sysctl_legacy_va_layout;
+}
+
+/*
+ * This function, called very early during the creation of a new
+ * process VM image, sets up which VM layout function to use:
+ */
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+	/*
+	 * Fall back to the standard layout if the personality
+	 * bit is set, or if the expected stack growth is unlimited:
+	 */
+	if (mmap_is_legacy()) {
+		mm->mmap_base = TASK_UNMAPPED_BASE;
+		mm->get_unmapped_area = arch_get_unmapped_area;
+		mm->unmap_area = arch_unmap_area;
+	} else {
+		mm->mmap_base = mmap_base();
+		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+		mm->unmap_area = arch_unmap_area_topdown;
+	}
+}
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 06fe8af3af55..a4d7a327c0e5 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -22,11 +22,11 @@
 #include <asm/tlbflush.h>
 #include <asm/mmu.h>
 
+#ifdef CONFIG_PPC32
 extern void mapin_ram(void);
 extern int map_page(unsigned long va, phys_addr_t pa, int flags);
 extern void setbat(int index, unsigned long virt, unsigned long phys,
 		   unsigned int size, int flags);
-extern void reserve_phys_mem(unsigned long start, unsigned long size);
 extern void settlbcam(int index, unsigned long virt, phys_addr_t phys,
 		      unsigned int size, int flags, unsigned int pid);
 extern void invalidate_tlbcam_entry(int index);
@@ -36,16 +36,16 @@ extern unsigned long ioremap_base;
 extern unsigned long ioremap_bot;
 extern unsigned int rtas_data, rtas_size;
 
-extern unsigned long __max_low_memory;
-extern unsigned long __initial_memory_limit;
-extern unsigned long total_memory;
-extern unsigned long total_lowmem;
-extern int mem_init_done;
-
 extern PTE *Hash, *Hash_end;
 extern unsigned long Hash_size, Hash_mask;
 
 extern unsigned int num_tlbcam_entries;
+#endif
+
+extern unsigned long __max_low_memory;
+extern unsigned long __initial_memory_limit;
+extern unsigned long total_memory;
+extern unsigned long total_lowmem;
 
 /* ...and now those things that may be slightly different between processor
  * architectures.  -- Dan
@@ -66,8 +66,8 @@ extern void MMU_init_hw(void);
 extern unsigned long mmu_mapin_ram(void);
 extern void adjust_total_lowmem(void);
 
-#else
-/* anything except 4xx or 8xx */
+#elif defined(CONFIG_PPC32)
+/* anything 32-bit except 4xx or 8xx */
 extern void MMU_init_hw(void);
 extern unsigned long mmu_mapin_ram(void);
 
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
new file mode 100644
index 000000000000..cb864b8f2750
--- /dev/null
+++ b/arch/powerpc/mm/numa.c
@@ -0,0 +1,779 @@
+/*
+ * pSeries NUMA support
+ *
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/threads.h>
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <asm/lmb.h>
+#include <asm/machdep.h>
+#include <asm/abs_addr.h>
+
+static int numa_enabled = 1;
+
+static int numa_debug;
+#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
+
+#ifdef DEBUG_NUMA
+#define ARRAY_INITIALISER -1
+#else
+#define ARRAY_INITIALISER 0
+#endif
+
+int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] =
+	ARRAY_INITIALISER};
+char *numa_memory_lookup_table;
+cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
+int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0};
+
+struct pglist_data *node_data[MAX_NUMNODES];
+bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
+static int min_common_depth;
+
+/*
+ * We need somewhere to store start/span for each node until we have
+ * allocated the real node_data structures.
+ */
+static struct {
+	unsigned long node_start_pfn;
+	unsigned long node_end_pfn;
+	unsigned long node_present_pages;
+} init_node_data[MAX_NUMNODES] __initdata;
+
+EXPORT_SYMBOL(node_data);
+EXPORT_SYMBOL(numa_cpu_lookup_table);
+EXPORT_SYMBOL(numa_memory_lookup_table);
+EXPORT_SYMBOL(numa_cpumask_lookup_table);
+EXPORT_SYMBOL(nr_cpus_in_node);
+
+static inline void map_cpu_to_node(int cpu, int node)
+{
+	numa_cpu_lookup_table[cpu] = node;
+	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) {
+		cpu_set(cpu, numa_cpumask_lookup_table[node]);
+		nr_cpus_in_node[node]++;
+	}
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void unmap_cpu_from_node(unsigned long cpu)
+{
+	int node = numa_cpu_lookup_table[cpu];
+
+	dbg("removing cpu %lu from node %d\n", cpu, node);
+
+	if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
+		cpu_clear(cpu, numa_cpumask_lookup_table[node]);
+		nr_cpus_in_node[node]--;
+	} else {
+		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
+		       cpu, node);
+	}
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static struct device_node * __devinit find_cpu_node(unsigned int cpu)
+{
+	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
+	struct device_node *cpu_node = NULL;
+	unsigned int *interrupt_server, *reg;
+	int len;
+
+	while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
+		/* Try interrupt server first */
+		interrupt_server = (unsigned int *)get_property(cpu_node,
+					"ibm,ppc-interrupt-server#s", &len);
+
+		len = len / sizeof(u32);
+
+		if (interrupt_server && (len > 0)) {
+			while (len--) {
+				if (interrupt_server[len] == hw_cpuid)
+					return cpu_node;
+			}
+		} else {
+			reg = (unsigned int *)get_property(cpu_node,
+							   "reg", &len);
+			if (reg && (len > 0) && (reg[0] == hw_cpuid))
+				return cpu_node;
+		}
+	}
+
+	return NULL;
+}
+
+/* must hold reference to node during call */
+static int *of_get_associativity(struct device_node *dev)
+{
+	return (unsigned int *)get_property(dev, "ibm,associativity", NULL);
+}
+
+static int of_node_numa_domain(struct device_node *device)
+{
+	int numa_domain;
+	unsigned int *tmp;
+
+	if (min_common_depth == -1)
+		return 0;
+
+	tmp = of_get_associativity(device);
+	if (tmp && (tmp[0] >= min_common_depth)) {
+		numa_domain = tmp[min_common_depth];
+	} else {
+		dbg("WARNING: no NUMA information for %s\n",
+		    device->full_name);
+		numa_domain = 0;
+	}
+	return numa_domain;
+}
+
+/*
+ * In theory, the "ibm,associativity" property may contain multiple
+ * associativity lists because a resource may be multiply connected
+ * into the machine.  This resource then has different associativity
+ * characteristics relative to its multiple connections.  We ignore
+ * this for now.  We also assume that all cpu and memory sets have
+ * their distances represented at a common level.  This won't be
+ * true for heirarchical NUMA.
+ *
+ * In any case the ibm,associativity-reference-points should give
+ * the correct depth for a normal NUMA system.
+ *
+ * - Dave Hansen <haveblue@us.ibm.com>
+ */
+static int __init find_min_common_depth(void)
+{
+	int depth;
+	unsigned int *ref_points;
+	struct device_node *rtas_root;
+	unsigned int len;
+
+	rtas_root = of_find_node_by_path("/rtas");
+
+	if (!rtas_root)
+		return -1;
+
+	/*
+	 * this property is 2 32-bit integers, each representing a level of
+	 * depth in the associativity nodes.  The first is for an SMP
+	 * configuration (should be all 0's) and the second is for a normal
+	 * NUMA configuration.
+	 */
+	ref_points = (unsigned int *)get_property(rtas_root,
+			"ibm,associativity-reference-points", &len);
+
+	if ((len >= 1) && ref_points) {
+		depth = ref_points[1];
+	} else {
+		dbg("WARNING: could not find NUMA "
+		    "associativity reference point\n");
+		depth = -1;
+	}
+	of_node_put(rtas_root);
+
+	return depth;
+}
+
+static int __init get_mem_addr_cells(void)
+{
+	struct device_node *memory = NULL;
+	int rc;
+
+	memory = of_find_node_by_type(memory, "memory");
+	if (!memory)
+		return 0; /* it won't matter */
+
+	rc = prom_n_addr_cells(memory);
+	return rc;
+}
+
+static int __init get_mem_size_cells(void)
+{
+	struct device_node *memory = NULL;
+	int rc;
+
+	memory = of_find_node_by_type(memory, "memory");
+	if (!memory)
+		return 0; /* it won't matter */
+	rc = prom_n_size_cells(memory);
+	return rc;
+}
+
+static unsigned long read_n_cells(int n, unsigned int **buf)
+{
+	unsigned long result = 0;
+
+	while (n--) {
+		result = (result << 32) | **buf;
+		(*buf)++;
+	}
+	return result;
+}
+
+/*
+ * Figure out to which domain a cpu belongs and stick it there.
+ * Return the id of the domain used.
+ */
+static int numa_setup_cpu(unsigned long lcpu)
+{
+	int numa_domain = 0;
+	struct device_node *cpu = find_cpu_node(lcpu);
+
+	if (!cpu) {
+		WARN_ON(1);
+		goto out;
+	}
+
+	numa_domain = of_node_numa_domain(cpu);
+
+	if (numa_domain >= num_online_nodes()) {
+		/*
+		 * POWER4 LPAR uses 0xffff as invalid node,
+		 * dont warn in this case.
+		 */
+		if (numa_domain != 0xffff)
+			printk(KERN_ERR "WARNING: cpu %ld "
+			       "maps to invalid NUMA node %d\n",
+			       lcpu, numa_domain);
+		numa_domain = 0;
+	}
+out:
+	node_set_online(numa_domain);
+
+	map_cpu_to_node(lcpu, numa_domain);
+
+	of_node_put(cpu);
+
+	return numa_domain;
+}
+
+static int cpu_numa_callback(struct notifier_block *nfb,
+			     unsigned long action,
+			     void *hcpu)
+{
+	unsigned long lcpu = (unsigned long)hcpu;
+	int ret = NOTIFY_DONE;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		if (min_common_depth == -1 || !numa_enabled)
+			map_cpu_to_node(lcpu, 0);
+		else
+			numa_setup_cpu(lcpu);
+		ret = NOTIFY_OK;
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+	case CPU_UP_CANCELED:
+		unmap_cpu_from_node(lcpu);
+		break;
+		ret = NOTIFY_OK;
+#endif
+	}
+	return ret;
+}
+
+/*
+ * Check and possibly modify a memory region to enforce the memory limit.
+ *
+ * Returns the size the region should have to enforce the memory limit.
+ * This will either be the original value of size, a truncated value,
+ * or zero. If the returned value of size is 0 the region should be
+ * discarded as it lies wholy above the memory limit.
+ */
+static unsigned long __init numa_enforce_memory_limit(unsigned long start, unsigned long size)
+{
+	/*
+	 * We use lmb_end_of_DRAM() in here instead of memory_limit because
+	 * we've already adjusted it for the limit and it takes care of
+	 * having memory holes below the limit.
+	 */
+	extern unsigned long memory_limit;
+
+	if (! memory_limit)
+		return size;
+
+	if (start + size <= lmb_end_of_DRAM())
+		return size;
+
+	if (start >= lmb_end_of_DRAM())
+		return 0;
+
+	return lmb_end_of_DRAM() - start;
+}
+
+static int __init parse_numa_properties(void)
+{
+	struct device_node *cpu = NULL;
+	struct device_node *memory = NULL;
+	int addr_cells, size_cells;
+	int max_domain = 0;
+	long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
+	unsigned long i;
+
+	if (numa_enabled == 0) {
+		printk(KERN_WARNING "NUMA disabled by user\n");
+		return -1;
+	}
+
+	numa_memory_lookup_table =
+		(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
+	memset(numa_memory_lookup_table, 0, entries * sizeof(char));
+
+	for (i = 0; i < entries ; i++)
+		numa_memory_lookup_table[i] = ARRAY_INITIALISER;
+
+	min_common_depth = find_min_common_depth();
+
+	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
+	if (min_common_depth < 0)
+		return min_common_depth;
+
+	max_domain = numa_setup_cpu(boot_cpuid);
+
+	/*
+	 * Even though we connect cpus to numa domains later in SMP init,
+	 * we need to know the maximum node id now. This is because each
+	 * node id must have NODE_DATA etc backing it.
+	 * As a result of hotplug we could still have cpus appear later on
+	 * with larger node ids. In that case we force the cpu into node 0.
+	 */
+	for_each_cpu(i) {
+		int numa_domain;
+
+		cpu = find_cpu_node(i);
+
+		if (cpu) {
+			numa_domain = of_node_numa_domain(cpu);
+			of_node_put(cpu);
+
+			if (numa_domain < MAX_NUMNODES &&
+			    max_domain < numa_domain)
+				max_domain = numa_domain;
+		}
+	}
+
+	addr_cells = get_mem_addr_cells();
+	size_cells = get_mem_size_cells();
+	memory = NULL;
+	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+		unsigned long start;
+		unsigned long size;
+		int numa_domain;
+		int ranges;
+		unsigned int *memcell_buf;
+		unsigned int len;
+
+		memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
+		if (!memcell_buf || len <= 0)
+			continue;
+
+		ranges = memory->n_addrs;
+new_range:
+		/* these are order-sensitive, and modify the buffer pointer */
+		start = read_n_cells(addr_cells, &memcell_buf);
+		size = read_n_cells(size_cells, &memcell_buf);
+
+		start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
+		size = _ALIGN_UP(size, MEMORY_INCREMENT);
+
+		numa_domain = of_node_numa_domain(memory);
+
+		if (numa_domain >= MAX_NUMNODES) {
+			if (numa_domain != 0xffff)
+				printk(KERN_ERR "WARNING: memory at %lx maps "
+				       "to invalid NUMA node %d\n", start,
+				       numa_domain);
+			numa_domain = 0;
+		}
+
+		if (max_domain < numa_domain)
+			max_domain = numa_domain;
+
+		if (! (size = numa_enforce_memory_limit(start, size))) {
+			if (--ranges)
+				goto new_range;
+			else
+				continue;
+		}
+
+		/*
+		 * Initialize new node struct, or add to an existing one.
+		 */
+		if (init_node_data[numa_domain].node_end_pfn) {
+			if ((start / PAGE_SIZE) <
+			    init_node_data[numa_domain].node_start_pfn)
+				init_node_data[numa_domain].node_start_pfn =
+					start / PAGE_SIZE;
+			if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) >
+			    init_node_data[numa_domain].node_end_pfn)
+				init_node_data[numa_domain].node_end_pfn =
+					(start / PAGE_SIZE) +
+					(size / PAGE_SIZE);
+
+			init_node_data[numa_domain].node_present_pages +=
+				size / PAGE_SIZE;
+		} else {
+			node_set_online(numa_domain);
+
+			init_node_data[numa_domain].node_start_pfn =
+				start / PAGE_SIZE;
+			init_node_data[numa_domain].node_end_pfn =
+				init_node_data[numa_domain].node_start_pfn +
+				size / PAGE_SIZE;
+			init_node_data[numa_domain].node_present_pages =
+				size / PAGE_SIZE;
+		}
+
+		for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
+			numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
+				numa_domain;
+
+		if (--ranges)
+			goto new_range;
+	}
+
+	for (i = 0; i <= max_domain; i++)
+		node_set_online(i);
+
+	return 0;
+}
+
+static void __init setup_nonnuma(void)
+{
+	unsigned long top_of_ram = lmb_end_of_DRAM();
+	unsigned long total_ram = lmb_phys_mem_size();
+	unsigned long i;
+
+	printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
+	       top_of_ram, total_ram);
+	printk(KERN_INFO "Memory hole size: %ldMB\n",
+	       (top_of_ram - total_ram) >> 20);
+
+	if (!numa_memory_lookup_table) {
+		long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
+		numa_memory_lookup_table =
+			(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
+		memset(numa_memory_lookup_table, 0, entries * sizeof(char));
+		for (i = 0; i < entries ; i++)
+			numa_memory_lookup_table[i] = ARRAY_INITIALISER;
+	}
+
+	map_cpu_to_node(boot_cpuid, 0);
+
+	node_set_online(0);
+
+	init_node_data[0].node_start_pfn = 0;
+	init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE;
+	init_node_data[0].node_present_pages = total_ram / PAGE_SIZE;
+
+	for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT)
+		numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;
+}
+
+static void __init dump_numa_topology(void)
+{
+	unsigned int node;
+	unsigned int count;
+
+	if (min_common_depth == -1 || !numa_enabled)
+		return;
+
+	for_each_online_node(node) {
+		unsigned long i;
+
+		printk(KERN_INFO "Node %d Memory:", node);
+
+		count = 0;
+
+		for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) {
+			if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) {
+				if (count == 0)
+					printk(" 0x%lx", i);
+				++count;
+			} else {
+				if (count > 0)
+					printk("-0x%lx", i);
+				count = 0;
+			}
+		}
+
+		if (count > 0)
+			printk("-0x%lx", i);
+		printk("\n");
+	}
+	return;
+}
+
+/*
+ * Allocate some memory, satisfying the lmb or bootmem allocator where
+ * required. nid is the preferred node and end is the physical address of
+ * the highest address in the node.
+ *
+ * Returns the physical address of the memory.
+ */
+static unsigned long careful_allocation(int nid, unsigned long size,
+					unsigned long align, unsigned long end)
+{
+	unsigned long ret = lmb_alloc_base(size, align, end);
+
+	/* retry over all memory */
+	if (!ret)
+		ret = lmb_alloc_base(size, align, lmb_end_of_DRAM());
+
+	if (!ret)
+		panic("numa.c: cannot allocate %lu bytes on node %d",
+		      size, nid);
+
+	/*
+	 * If the memory came from a previously allocated node, we must
+	 * retry with the bootmem allocator.
+	 */
+	if (pa_to_nid(ret) < nid) {
+		nid = pa_to_nid(ret);
+		ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid),
+				size, align, 0);
+
+		if (!ret)
+			panic("numa.c: cannot allocate %lu bytes on node %d",
+			      size, nid);
+
+		ret = virt_to_abs(ret);
+
+		dbg("alloc_bootmem %lx %lx\n", ret, size);
+	}
+
+	return ret;
+}
+
+void __init do_init_bootmem(void)
+{
+	int nid;
+	int addr_cells, size_cells;
+	struct device_node *memory = NULL;
+	static struct notifier_block ppc64_numa_nb = {
+		.notifier_call = cpu_numa_callback,
+		.priority = 1 /* Must run before sched domains notifier. */
+	};
+
+	min_low_pfn = 0;
+	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
+	max_pfn = max_low_pfn;
+
+	if (parse_numa_properties())
+		setup_nonnuma();
+	else
+		dump_numa_topology();
+
+	register_cpu_notifier(&ppc64_numa_nb);
+
+	for_each_online_node(nid) {
+		unsigned long start_paddr, end_paddr;
+		int i;
+		unsigned long bootmem_paddr;
+		unsigned long bootmap_pages;
+
+		start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE;
+		end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE;
+
+		/* Allocate the node structure node local if possible */
+		NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid,
+					sizeof(struct pglist_data),
+					SMP_CACHE_BYTES, end_paddr);
+		NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid));
+		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+
+  		dbg("node %d\n", nid);
+		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
+
+		NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
+		NODE_DATA(nid)->node_start_pfn =
+			init_node_data[nid].node_start_pfn;
+		NODE_DATA(nid)->node_spanned_pages =
+			end_paddr - start_paddr;
+
+		if (NODE_DATA(nid)->node_spanned_pages == 0)
+  			continue;
+
+  		dbg("start_paddr = %lx\n", start_paddr);
+  		dbg("end_paddr = %lx\n", end_paddr);
+
+		bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT);
+
+		bootmem_paddr = careful_allocation(nid,
+				bootmap_pages << PAGE_SHIFT,
+				PAGE_SIZE, end_paddr);
+		memset(abs_to_virt(bootmem_paddr), 0,
+		       bootmap_pages << PAGE_SHIFT);
+		dbg("bootmap_paddr = %lx\n", bootmem_paddr);
+
+		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
+				  start_paddr >> PAGE_SHIFT,
+				  end_paddr >> PAGE_SHIFT);
+
+		/*
+		 * We need to do another scan of all memory sections to
+		 * associate memory with the correct node.
+		 */
+		addr_cells = get_mem_addr_cells();
+		size_cells = get_mem_size_cells();
+		memory = NULL;
+		while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+			unsigned long mem_start, mem_size;
+			int numa_domain, ranges;
+			unsigned int *memcell_buf;
+			unsigned int len;
+
+			memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
+			if (!memcell_buf || len <= 0)
+				continue;
+
+			ranges = memory->n_addrs;	/* ranges in cell */
+new_range:
+			mem_start = read_n_cells(addr_cells, &memcell_buf);
+			mem_size = read_n_cells(size_cells, &memcell_buf);
+			if (numa_enabled) {
+				numa_domain = of_node_numa_domain(memory);
+				if (numa_domain  >= MAX_NUMNODES)
+					numa_domain = 0;
+			} else
+				numa_domain =  0;
+
+			if (numa_domain != nid)
+				continue;
+
+			mem_size = numa_enforce_memory_limit(mem_start, mem_size);
+  			if (mem_size) {
+  				dbg("free_bootmem %lx %lx\n", mem_start, mem_size);
+  				free_bootmem_node(NODE_DATA(nid), mem_start, mem_size);
+			}
+
+			if (--ranges)		/* process all ranges in cell */
+				goto new_range;
+		}
+
+		/*
+		 * Mark reserved regions on this node
+		 */
+		for (i = 0; i < lmb.reserved.cnt; i++) {
+			unsigned long physbase = lmb.reserved.region[i].base;
+			unsigned long size = lmb.reserved.region[i].size;
+
+			if (pa_to_nid(physbase) != nid &&
+			    pa_to_nid(physbase+size-1) != nid)
+				continue;
+
+			if (physbase < end_paddr &&
+			    (physbase+size) > start_paddr) {
+				/* overlaps */
+				if (physbase < start_paddr) {
+					size -= start_paddr - physbase;
+					physbase = start_paddr;
+				}
+
+				if (size > end_paddr - physbase)
+					size = end_paddr - physbase;
+
+				dbg("reserve_bootmem %lx %lx\n", physbase,
+				    size);
+				reserve_bootmem_node(NODE_DATA(nid), physbase,
+						     size);
+			}
+		}
+		/*
+		 * This loop may look famaliar, but we have to do it again
+		 * after marking our reserved memory to mark memory present
+		 * for sparsemem.
+		 */
+		addr_cells = get_mem_addr_cells();
+		size_cells = get_mem_size_cells();
+		memory = NULL;
+		while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+			unsigned long mem_start, mem_size;
+			int numa_domain, ranges;
+			unsigned int *memcell_buf;
+			unsigned int len;
+
+			memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
+			if (!memcell_buf || len <= 0)
+				continue;
+
+			ranges = memory->n_addrs;	/* ranges in cell */
+new_range2:
+			mem_start = read_n_cells(addr_cells, &memcell_buf);
+			mem_size = read_n_cells(size_cells, &memcell_buf);
+			if (numa_enabled) {
+				numa_domain = of_node_numa_domain(memory);
+				if (numa_domain  >= MAX_NUMNODES)
+					numa_domain = 0;
+			} else
+				numa_domain =  0;
+
+			if (numa_domain != nid)
+				continue;
+
+			mem_size = numa_enforce_memory_limit(mem_start, mem_size);
+			memory_present(numa_domain, mem_start >> PAGE_SHIFT,
+				       (mem_start + mem_size) >> PAGE_SHIFT);
+
+			if (--ranges)		/* process all ranges in cell */
+				goto new_range2;
+		}
+
+	}
+}
+
+void __init paging_init(void)
+{
+	unsigned long zones_size[MAX_NR_ZONES];
+	unsigned long zholes_size[MAX_NR_ZONES];
+	int nid;
+
+	memset(zones_size, 0, sizeof(zones_size));
+	memset(zholes_size, 0, sizeof(zholes_size));
+
+	for_each_online_node(nid) {
+		unsigned long start_pfn;
+		unsigned long end_pfn;
+
+		start_pfn = init_node_data[nid].node_start_pfn;
+		end_pfn = init_node_data[nid].node_end_pfn;
+
+		zones_size[ZONE_DMA] = end_pfn - start_pfn;
+		zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
+			init_node_data[nid].node_present_pages;
+
+		dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
+		    zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);
+
+		free_area_init_node(nid, NODE_DATA(nid), zones_size,
+							start_pfn, zholes_size);
+	}
+}
+
+static int __init early_numa(char *p)
+{
+	if (!p)
+		return 0;
+
+	if (strstr(p, "off"))
+		numa_enabled = 0;
+
+	if (strstr(p, "debug"))
+		numa_debug = 1;
+
+	return 0;
+}
+early_param("numa", early_numa);
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 724f97e5dee5..484d24f9208b 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -67,30 +67,9 @@
 #include <asm/vdso.h>
 #include <asm/imalloc.h>
 
-#if PGTABLE_RANGE > USER_VSID_RANGE
-#warning Limited user VSID range means pagetable space is wasted
-#endif
-
-#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
-#warning TASK_SIZE is smaller than it needs to be.
-#endif
-
-int mem_init_done;
 unsigned long ioremap_bot = IMALLOC_BASE;
 static unsigned long phbs_io_bot = PHBS_IO_BASE;
 
-extern pgd_t swapper_pg_dir[];
-extern struct task_struct *current_set[NR_CPUS];
-
-unsigned long klimit = (unsigned long)_end;
-
-/* max amount of RAM to use */
-unsigned long __max_memory;
-
-/* info on what we think the IO hole is */
-unsigned long 	io_hole_start;
-unsigned long	io_hole_size;
-
 #ifdef CONFIG_PPC_ISERIES
 
 void __iomem *ioremap(unsigned long addr, unsigned long size)
@@ -355,3 +334,16 @@ int iounmap_explicit(volatile void __iomem *start, unsigned long size)
 EXPORT_SYMBOL(ioremap);
 EXPORT_SYMBOL(__ioremap);
 EXPORT_SYMBOL(iounmap);
+
+void __iomem * reserve_phb_iospace(unsigned long size)
+{
+	void __iomem *virt_addr;
+		
+	if (phbs_io_bot >= IMALLOC_BASE) 
+		panic("reserve_phb_iospace(): phb io space overflow\n");
+			
+	virt_addr = (void __iomem *) phbs_io_bot;
+	phbs_io_bot += size;
+
+	return virt_addr;
+}
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
new file mode 100644
index 000000000000..0473953f6a37
--- /dev/null
+++ b/arch/powerpc/mm/slb.c
@@ -0,0 +1,158 @@
+/*
+ * PowerPC64 SLB support.
+ *
+ * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
+ * Based on earlier code writteh by:
+ * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
+ *    Copyright (c) 2001 Dave Engebretsen
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/paca.h>
+#include <asm/cputable.h>
+
+extern void slb_allocate(unsigned long ea);
+
+static inline unsigned long mk_esid_data(unsigned long ea, unsigned long slot)
+{
+	return (ea & ESID_MASK) | SLB_ESID_V | slot;
+}
+
+static inline unsigned long mk_vsid_data(unsigned long ea, unsigned long flags)
+{
+	return (get_kernel_vsid(ea) << SLB_VSID_SHIFT) | flags;
+}
+
+static inline void create_slbe(unsigned long ea, unsigned long flags,
+			       unsigned long entry)
+{
+	asm volatile("slbmte  %0,%1" :
+		     : "r" (mk_vsid_data(ea, flags)),
+		       "r" (mk_esid_data(ea, entry))
+		     : "memory" );
+}
+
+static void slb_flush_and_rebolt(void)
+{
+	/* If you change this make sure you change SLB_NUM_BOLTED
+	 * appropriately too. */
+	unsigned long ksp_flags = SLB_VSID_KERNEL;
+	unsigned long ksp_esid_data;
+
+	WARN_ON(!irqs_disabled());
+
+	if (cpu_has_feature(CPU_FTR_16M_PAGE))
+		ksp_flags |= SLB_VSID_L;
+
+	ksp_esid_data = mk_esid_data(get_paca()->kstack, 2);
+	if ((ksp_esid_data & ESID_MASK) == KERNELBASE)
+		ksp_esid_data &= ~SLB_ESID_V;
+
+	/* We need to do this all in asm, so we're sure we don't touch
+	 * the stack between the slbia and rebolting it. */
+	asm volatile("isync\n"
+		     "slbia\n"
+		     /* Slot 1 - first VMALLOC segment */
+		     "slbmte	%0,%1\n"
+		     /* Slot 2 - kernel stack */
+		     "slbmte	%2,%3\n"
+		     "isync"
+		     :: "r"(mk_vsid_data(VMALLOCBASE, SLB_VSID_KERNEL)),
+		        "r"(mk_esid_data(VMALLOCBASE, 1)),
+		        "r"(mk_vsid_data(ksp_esid_data, ksp_flags)),
+		        "r"(ksp_esid_data)
+		     : "memory");
+}
+
+/* Flush all user entries from the segment table of the current processor. */
+void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
+{
+	unsigned long offset = get_paca()->slb_cache_ptr;
+	unsigned long esid_data = 0;
+	unsigned long pc = KSTK_EIP(tsk);
+	unsigned long stack = KSTK_ESP(tsk);
+	unsigned long unmapped_base;
+
+	if (offset <= SLB_CACHE_ENTRIES) {
+		int i;
+		asm volatile("isync" : : : "memory");
+		for (i = 0; i < offset; i++) {
+			esid_data = ((unsigned long)get_paca()->slb_cache[i]
+				<< SID_SHIFT) | SLBIE_C;
+			asm volatile("slbie %0" : : "r" (esid_data));
+		}
+		asm volatile("isync" : : : "memory");
+	} else {
+		slb_flush_and_rebolt();
+	}
+
+	/* Workaround POWER5 < DD2.1 issue */
+	if (offset == 1 || offset > SLB_CACHE_ENTRIES)
+		asm volatile("slbie %0" : : "r" (esid_data));
+
+	get_paca()->slb_cache_ptr = 0;
+	get_paca()->context = mm->context;
+
+	/*
+	 * preload some userspace segments into the SLB.
+	 */
+	if (test_tsk_thread_flag(tsk, TIF_32BIT))
+		unmapped_base = TASK_UNMAPPED_BASE_USER32;
+	else
+		unmapped_base = TASK_UNMAPPED_BASE_USER64;
+
+	if (pc >= KERNELBASE)
+		return;
+	slb_allocate(pc);
+
+	if (GET_ESID(pc) == GET_ESID(stack))
+		return;
+
+	if (stack >= KERNELBASE)
+		return;
+	slb_allocate(stack);
+
+	if ((GET_ESID(pc) == GET_ESID(unmapped_base))
+	    || (GET_ESID(stack) == GET_ESID(unmapped_base)))
+		return;
+
+	if (unmapped_base >= KERNELBASE)
+		return;
+	slb_allocate(unmapped_base);
+}
+
+void slb_initialize(void)
+{
+	/* On iSeries the bolted entries have already been set up by
+	 * the hypervisor from the lparMap data in head.S */
+#ifndef CONFIG_PPC_ISERIES
+	unsigned long flags = SLB_VSID_KERNEL;
+
+ 	/* Invalidate the entire SLB (even slot 0) & all the ERATS */
+ 	if (cpu_has_feature(CPU_FTR_16M_PAGE))
+ 		flags |= SLB_VSID_L;
+
+ 	asm volatile("isync":::"memory");
+ 	asm volatile("slbmte  %0,%0"::"r" (0) : "memory");
+	asm volatile("isync; slbia; isync":::"memory");
+	create_slbe(KERNELBASE, flags, 0);
+	create_slbe(VMALLOCBASE, SLB_VSID_KERNEL, 1);
+	/* We don't bolt the stack for the time being - we're in boot,
+	 * so the stack is in the bolted segment.  By the time it goes
+	 * elsewhere, we'll call _switch() which will bolt in the new
+	 * one. */
+	asm volatile("isync":::"memory");
+#endif
+
+	get_paca()->stab_rr = SLB_NUM_BOLTED;
+}
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
new file mode 100644
index 000000000000..a3a03da503bc
--- /dev/null
+++ b/arch/powerpc/mm/slb_low.S
@@ -0,0 +1,151 @@
+/*
+ * arch/ppc64/mm/slb_low.S
+ *
+ * Low-level SLB routines
+ *
+ * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
+ *
+ * Based on earlier C version:
+ * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
+ *    Copyright (c) 2001 Dave Engebretsen
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cputable.h>
+
+/* void slb_allocate(unsigned long ea);
+ *
+ * Create an SLB entry for the given EA (user or kernel).
+ * 	r3 = faulting address, r13 = PACA
+ *	r9, r10, r11 are clobbered by this function
+ * No other registers are examined or changed.
+ */
+_GLOBAL(slb_allocate)
+	/*
+	 * First find a slot, round robin. Previously we tried to find
+	 * a free slot first but that took too long. Unfortunately we
+	 * dont have any LRU information to help us choose a slot.
+	 */
+#ifdef CONFIG_PPC_ISERIES
+	/*
+	 * On iSeries, the "bolted" stack segment can be cast out on
+	 * shared processor switch so we need to check for a miss on
+	 * it and restore it to the right slot.
+	 */
+	ld	r9,PACAKSAVE(r13)
+	clrrdi	r9,r9,28
+	clrrdi	r11,r3,28
+	li	r10,SLB_NUM_BOLTED-1	/* Stack goes in last bolted slot */
+	cmpld	r9,r11
+	beq	3f
+#endif /* CONFIG_PPC_ISERIES */
+
+	ld	r10,PACASTABRR(r13)
+	addi	r10,r10,1
+	/* use a cpu feature mask if we ever change our slb size */
+	cmpldi	r10,SLB_NUM_ENTRIES
+
+	blt+	4f
+	li	r10,SLB_NUM_BOLTED
+
+4:
+	std	r10,PACASTABRR(r13)
+3:
+	/* r3 = faulting address, r10 = entry */
+
+	srdi	r9,r3,60		/* get region */
+	srdi	r3,r3,28		/* get esid */
+	cmpldi	cr7,r9,0xc		/* cmp KERNELBASE for later use */
+
+	rldimi	r10,r3,28,0		/* r10= ESID<<28 | entry */
+	oris	r10,r10,SLB_ESID_V@h	/* r10 |= SLB_ESID_V */
+
+	/* r3 = esid, r10 = esid_data, cr7 = <>KERNELBASE */
+
+	blt	cr7,0f			/* user or kernel? */
+
+	/* kernel address: proto-VSID = ESID */
+	/* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but
+	 * this code will generate the protoVSID 0xfffffffff for the
+	 * top segment.  That's ok, the scramble below will translate
+	 * it to VSID 0, which is reserved as a bad VSID - one which
+	 * will never have any pages in it.  */
+	li	r11,SLB_VSID_KERNEL
+BEGIN_FTR_SECTION
+	bne	cr7,9f
+	li	r11,(SLB_VSID_KERNEL|SLB_VSID_L)
+END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
+	b	9f
+
+0:	/* user address: proto-VSID = context<<15 | ESID */
+	srdi.	r9,r3,USER_ESID_BITS
+	bne-	8f			/* invalid ea bits set */
+
+#ifdef CONFIG_HUGETLB_PAGE
+BEGIN_FTR_SECTION
+	lhz	r9,PACAHIGHHTLBAREAS(r13)
+	srdi	r11,r3,(HTLB_AREA_SHIFT-SID_SHIFT)
+	srd	r9,r9,r11
+	lhz	r11,PACALOWHTLBAREAS(r13)
+	srd	r11,r11,r3
+	or	r9,r9,r11
+END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
+#endif /* CONFIG_HUGETLB_PAGE */
+
+	li	r11,SLB_VSID_USER
+
+#ifdef CONFIG_HUGETLB_PAGE
+BEGIN_FTR_SECTION
+	rldimi	r11,r9,8,55		/* shift masked bit into SLB_VSID_L */
+END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
+#endif /* CONFIG_HUGETLB_PAGE */
+
+	ld	r9,PACACONTEXTID(r13)
+	rldimi	r3,r9,USER_ESID_BITS,0
+
+9:	/* r3 = protovsid, r11 = flags, r10 = esid_data, cr7 = <>KERNELBASE */
+	ASM_VSID_SCRAMBLE(r3,r9)
+
+	rldimi	r11,r3,SLB_VSID_SHIFT,16	/* combine VSID and flags */
+
+	/*
+	 * No need for an isync before or after this slbmte. The exception
+	 * we enter with and the rfid we exit with are context synchronizing.
+	 */
+	slbmte	r11,r10
+
+	bgelr	cr7			/* we're done for kernel addresses */
+
+	/* Update the slb cache */
+	lhz	r3,PACASLBCACHEPTR(r13)	/* offset = paca->slb_cache_ptr */
+	cmpldi	r3,SLB_CACHE_ENTRIES
+	bge	1f
+
+	/* still room in the slb cache */
+	sldi	r11,r3,1		/* r11 = offset * sizeof(u16) */
+	rldicl	r10,r10,36,28		/* get low 16 bits of the ESID */
+	add	r11,r11,r13		/* r11 = (u16 *)paca + offset */
+	sth	r10,PACASLBCACHE(r11)	/* paca->slb_cache[offset] = esid */
+	addi	r3,r3,1			/* offset++ */
+	b	2f
+1:					/* offset >= SLB_CACHE_ENTRIES */
+	li	r3,SLB_CACHE_ENTRIES+1
+2:
+	sth	r3,PACASLBCACHEPTR(r13)	/* paca->slb_cache_ptr = offset */
+	blr
+
+8:	/* invalid EA */
+	li	r3,0			/* BAD_VSID */
+	li	r11,SLB_VSID_USER	/* flags don't much matter */
+	b	9b
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
new file mode 100644
index 000000000000..1b83f002bf27
--- /dev/null
+++ b/arch/powerpc/mm/stab.c
@@ -0,0 +1,279 @@
+/*
+ * PowerPC64 Segment Translation Support.
+ *
+ * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
+ *    Copyright (c) 2001 Dave Engebretsen
+ *
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/paca.h>
+#include <asm/cputable.h>
+#include <asm/lmb.h>
+#include <asm/abs_addr.h>
+
+struct stab_entry {
+	unsigned long esid_data;
+	unsigned long vsid_data;
+};
+
+/* Both the segment table and SLB code uses the following cache */
+#define NR_STAB_CACHE_ENTRIES 8
+DEFINE_PER_CPU(long, stab_cache_ptr);
+DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]);
+
+/*
+ * Create a segment table entry for the given esid/vsid pair.
+ */
+static int make_ste(unsigned long stab, unsigned long esid, unsigned long vsid)
+{
+	unsigned long esid_data, vsid_data;
+	unsigned long entry, group, old_esid, castout_entry, i;
+	unsigned int global_entry;
+	struct stab_entry *ste, *castout_ste;
+	unsigned long kernel_segment = (esid << SID_SHIFT) >= KERNELBASE;
+
+	vsid_data = vsid << STE_VSID_SHIFT;
+	esid_data = esid << SID_SHIFT | STE_ESID_KP | STE_ESID_V;
+	if (! kernel_segment)
+		esid_data |= STE_ESID_KS;
+
+	/* Search the primary group first. */
+	global_entry = (esid & 0x1f) << 3;
+	ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
+
+	/* Find an empty entry, if one exists. */
+	for (group = 0; group < 2; group++) {
+		for (entry = 0; entry < 8; entry++, ste++) {
+			if (!(ste->esid_data & STE_ESID_V)) {
+				ste->vsid_data = vsid_data;
+				asm volatile("eieio":::"memory");
+				ste->esid_data = esid_data;
+				return (global_entry | entry);
+			}
+		}
+		/* Now search the secondary group. */
+		global_entry = ((~esid) & 0x1f) << 3;
+		ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
+	}
+
+	/*
+	 * Could not find empty entry, pick one with a round robin selection.
+	 * Search all entries in the two groups.
+	 */
+	castout_entry = get_paca()->stab_rr;
+	for (i = 0; i < 16; i++) {
+		if (castout_entry < 8) {
+			global_entry = (esid & 0x1f) << 3;
+			ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
+			castout_ste = ste + castout_entry;
+		} else {
+			global_entry = ((~esid) & 0x1f) << 3;
+			ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
+			castout_ste = ste + (castout_entry - 8);
+		}
+
+		/* Dont cast out the first kernel segment */
+		if ((castout_ste->esid_data & ESID_MASK) != KERNELBASE)
+			break;
+
+		castout_entry = (castout_entry + 1) & 0xf;
+	}
+
+	get_paca()->stab_rr = (castout_entry + 1) & 0xf;
+
+	/* Modify the old entry to the new value. */
+
+	/* Force previous translations to complete. DRENG */
+	asm volatile("isync" : : : "memory");
+
+	old_esid = castout_ste->esid_data >> SID_SHIFT;
+	castout_ste->esid_data = 0;		/* Invalidate old entry */
+
+	asm volatile("sync" : : : "memory");    /* Order update */
+
+	castout_ste->vsid_data = vsid_data;
+	asm volatile("eieio" : : : "memory");   /* Order update */
+	castout_ste->esid_data = esid_data;
+
+	asm volatile("slbie  %0" : : "r" (old_esid << SID_SHIFT));
+	/* Ensure completion of slbie */
+	asm volatile("sync" : : : "memory");
+
+	return (global_entry | (castout_entry & 0x7));
+}
+
+/*
+ * Allocate a segment table entry for the given ea and mm
+ */
+static int __ste_allocate(unsigned long ea, struct mm_struct *mm)
+{
+	unsigned long vsid;
+	unsigned char stab_entry;
+	unsigned long offset;
+
+	/* Kernel or user address? */
+	if (ea >= KERNELBASE) {
+		vsid = get_kernel_vsid(ea);
+	} else {
+		if ((ea >= TASK_SIZE_USER64) || (! mm))
+			return 1;
+
+		vsid = get_vsid(mm->context.id, ea);
+	}
+
+	stab_entry = make_ste(get_paca()->stab_addr, GET_ESID(ea), vsid);
+
+	if (ea < KERNELBASE) {
+		offset = __get_cpu_var(stab_cache_ptr);
+		if (offset < NR_STAB_CACHE_ENTRIES)
+			__get_cpu_var(stab_cache[offset++]) = stab_entry;
+		else
+			offset = NR_STAB_CACHE_ENTRIES+1;
+		__get_cpu_var(stab_cache_ptr) = offset;
+
+		/* Order update */
+		asm volatile("sync":::"memory");
+	}
+
+	return 0;
+}
+
+int ste_allocate(unsigned long ea)
+{
+	return __ste_allocate(ea, current->mm);
+}
+
+/*
+ * Do the segment table work for a context switch: flush all user
+ * entries from the table, then preload some probably useful entries
+ * for the new task
+ */
+void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
+{
+	struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
+	struct stab_entry *ste;
+	unsigned long offset = __get_cpu_var(stab_cache_ptr);
+	unsigned long pc = KSTK_EIP(tsk);
+	unsigned long stack = KSTK_ESP(tsk);
+	unsigned long unmapped_base;
+
+	/* Force previous translations to complete. DRENG */
+	asm volatile("isync" : : : "memory");
+
+	if (offset <= NR_STAB_CACHE_ENTRIES) {
+		int i;
+
+		for (i = 0; i < offset; i++) {
+			ste = stab + __get_cpu_var(stab_cache[i]);
+			ste->esid_data = 0; /* invalidate entry */
+		}
+	} else {
+		unsigned long entry;
+
+		/* Invalidate all entries. */
+		ste = stab;
+
+		/* Never flush the first entry. */
+		ste += 1;
+		for (entry = 1;
+		     entry < (PAGE_SIZE / sizeof(struct stab_entry));
+		     entry++, ste++) {
+			unsigned long ea;
+			ea = ste->esid_data & ESID_MASK;
+			if (ea < KERNELBASE) {
+				ste->esid_data = 0;
+			}
+		}
+	}
+
+	asm volatile("sync; slbia; sync":::"memory");
+
+	__get_cpu_var(stab_cache_ptr) = 0;
+
+	/* Now preload some entries for the new task */
+	if (test_tsk_thread_flag(tsk, TIF_32BIT))
+		unmapped_base = TASK_UNMAPPED_BASE_USER32;
+	else
+		unmapped_base = TASK_UNMAPPED_BASE_USER64;
+
+	__ste_allocate(pc, mm);
+
+	if (GET_ESID(pc) == GET_ESID(stack))
+		return;
+
+	__ste_allocate(stack, mm);
+
+	if ((GET_ESID(pc) == GET_ESID(unmapped_base))
+	    || (GET_ESID(stack) == GET_ESID(unmapped_base)))
+		return;
+
+	__ste_allocate(unmapped_base, mm);
+
+	/* Order update */
+	asm volatile("sync" : : : "memory");
+}
+
+extern void slb_initialize(void);
+
+/*
+ * Allocate segment tables for secondary CPUs.  These must all go in
+ * the first (bolted) segment, so that do_stab_bolted won't get a
+ * recursive segment miss on the segment table itself.
+ */
+void stabs_alloc(void)
+{
+	int cpu;
+
+	if (cpu_has_feature(CPU_FTR_SLB))
+		return;
+
+	for_each_cpu(cpu) {
+		unsigned long newstab;
+
+		if (cpu == 0)
+			continue; /* stab for CPU 0 is statically allocated */
+
+		newstab = lmb_alloc_base(PAGE_SIZE, PAGE_SIZE, 1<<SID_SHIFT);
+		if (! newstab)
+			panic("Unable to allocate segment table for CPU %d.\n",
+			      cpu);
+
+		newstab += KERNELBASE;
+
+		memset((void *)newstab, 0, PAGE_SIZE);
+
+		paca[cpu].stab_addr = newstab;
+		paca[cpu].stab_real = virt_to_abs(newstab);
+		printk(KERN_DEBUG "Segment table for CPU %d at 0x%lx virtual, 0x%lx absolute\n", cpu, paca[cpu].stab_addr, paca[cpu].stab_real);
+	}
+}
+
+/*
+ * Build an entry for the base kernel segment and put it into
+ * the segment table or SLB.  All other segment table or SLB
+ * entries are faulted in.
+ */
+void stab_initialize(unsigned long stab)
+{
+	unsigned long vsid = get_kernel_vsid(KERNELBASE);
+
+	if (cpu_has_feature(CPU_FTR_SLB)) {
+		slb_initialize();
+	} else {
+		asm volatile("isync; slbia; isync":::"memory");
+		make_ste(stab, GET_ESID(KERNELBASE), vsid);
+
+		/* Order update */
+		asm volatile("sync":::"memory");
+	}
+}
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c
new file mode 100644
index 000000000000..09ab81a10f4f
--- /dev/null
+++ b/arch/powerpc/mm/tlb_64.c
@@ -0,0 +1,196 @@
+/*
+ * This file contains the routines for flushing entries from the
+ * TLB and MMU hash table.
+ *
+ *  Derived from arch/ppc64/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *  Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  Dave Engebretsen <engebret@us.ibm.com>
+ *      Rework for PPC64 port.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <linux/highmem.h>
+
+DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
+
+/* This is declared as we are using the more or less generic
+ * include/asm-ppc64/tlb.h file -- tgall
+ */
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
+unsigned long pte_freelist_forced_free;
+
+struct pte_freelist_batch
+{
+	struct rcu_head	rcu;
+	unsigned int	index;
+	pgtable_free_t	tables[0];
+};
+
+DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
+unsigned long pte_freelist_forced_free;
+
+#define PTE_FREELIST_SIZE \
+	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
+	  / sizeof(pgtable_free_t))
+
+#ifdef CONFIG_SMP
+static void pte_free_smp_sync(void *arg)
+{
+	/* Do nothing, just ensure we sync with all CPUs */
+}
+#endif
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+static void pgtable_free_now(pgtable_free_t pgf)
+{
+	pte_freelist_forced_free++;
+
+	smp_call_function(pte_free_smp_sync, NULL, 0, 1);
+
+	pgtable_free(pgf);
+}
+
+static void pte_free_rcu_callback(struct rcu_head *head)
+{
+	struct pte_freelist_batch *batch =
+		container_of(head, struct pte_freelist_batch, rcu);
+	unsigned int i;
+
+	for (i = 0; i < batch->index; i++)
+		pgtable_free(batch->tables[i]);
+
+	free_page((unsigned long)batch);
+}
+
+static void pte_free_submit(struct pte_freelist_batch *batch)
+{
+	INIT_RCU_HEAD(&batch->rcu);
+	call_rcu(&batch->rcu, pte_free_rcu_callback);
+}
+
+void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
+{
+	/* This is safe as we are holding page_table_lock */
+        cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
+	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+
+	if (atomic_read(&tlb->mm->mm_users) < 2 ||
+	    cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
+		pgtable_free(pgf);
+		return;
+	}
+
+	if (*batchp == NULL) {
+		*batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
+		if (*batchp == NULL) {
+			pgtable_free_now(pgf);
+			return;
+		}
+		(*batchp)->index = 0;
+	}
+	(*batchp)->tables[(*batchp)->index++] = pgf;
+	if ((*batchp)->index == PTE_FREELIST_SIZE) {
+		pte_free_submit(*batchp);
+		*batchp = NULL;
+	}
+}
+
+/*
+ * Update the MMU hash table to correspond with a change to
+ * a Linux PTE.  If wrprot is true, it is permissible to
+ * change the existing HPTE to read-only rather than removing it
+ * (if we remove it we should clear the _PTE_HPTEFLAGS bits).
+ */
+void hpte_update(struct mm_struct *mm, unsigned long addr,
+		 unsigned long pte, int wrprot)
+{
+	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	unsigned long vsid;
+	int i;
+
+	i = batch->index;
+
+	/*
+	 * This can happen when we are in the middle of a TLB batch and
+	 * we encounter memory pressure (eg copy_page_range when it tries
+	 * to allocate a new pte). If we have to reclaim memory and end
+	 * up scanning and resetting referenced bits then our batch context
+	 * will change mid stream.
+	 */
+	if (i != 0 && (mm != batch->mm || batch->large != pte_huge(pte))) {
+		flush_tlb_pending();
+		i = 0;
+	}
+	if (i == 0) {
+		batch->mm = mm;
+		batch->large = pte_huge(pte);
+	}
+	if (addr < KERNELBASE) {
+		vsid = get_vsid(mm->context.id, addr);
+		WARN_ON(vsid == 0);
+	} else
+		vsid = get_kernel_vsid(addr);
+	batch->vaddr[i] = (vsid << 28 ) | (addr & 0x0fffffff);
+	batch->pte[i] = __pte(pte);
+	batch->index = ++i;
+	if (i >= PPC64_TLB_BATCH_NR)
+		flush_tlb_pending();
+}
+
+void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
+{
+	int i;
+	int cpu;
+	cpumask_t tmp;
+	int local = 0;
+
+	BUG_ON(in_interrupt());
+
+	cpu = get_cpu();
+	i = batch->index;
+	tmp = cpumask_of_cpu(cpu);
+	if (cpus_equal(batch->mm->cpu_vm_mask, tmp))
+		local = 1;
+
+	if (i == 1)
+		flush_hash_page(batch->vaddr[0], batch->pte[0], local);
+	else
+		flush_hash_range(i, local);
+	batch->index = 0;
+	put_cpu();
+}
+
+void pte_free_finish(void)
+{
+	/* This is safe as we are holding page_table_lock */
+	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+
+	if (*batchp == NULL)
+		return;
+	pte_free_submit(*batchp);
+	*batchp = NULL;
+}