summaryrefslogtreecommitdiffstats
path: root/arch/ia64/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/ia64/mm')
-rw-r--r--arch/ia64/mm/Makefile12
-rw-r--r--arch/ia64/mm/contig.c299
-rw-r--r--arch/ia64/mm/discontig.c737
-rw-r--r--arch/ia64/mm/extable.c90
-rw-r--r--arch/ia64/mm/fault.c261
-rw-r--r--arch/ia64/mm/hugetlbpage.c357
-rw-r--r--arch/ia64/mm/init.c597
-rw-r--r--arch/ia64/mm/numa.c49
-rw-r--r--arch/ia64/mm/tlb.c190
9 files changed, 2592 insertions, 0 deletions
diff --git a/arch/ia64/mm/Makefile b/arch/ia64/mm/Makefile
new file mode 100644
index 000000000000..7078f67887ec
--- /dev/null
+++ b/arch/ia64/mm/Makefile
@@ -0,0 +1,12 @@
+#
+# Makefile for the ia64-specific parts of the memory manager.
+#
+
+obj-y := init.o fault.o tlb.o extable.o
+
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_NUMA) += numa.o
+obj-$(CONFIG_DISCONTIGMEM) += discontig.o
+ifndef CONFIG_DISCONTIGMEM
+obj-y += contig.o
+endif
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
new file mode 100644
index 000000000000..6daf15ac8940
--- /dev/null
+++ b/arch/ia64/mm/contig.c
@@ -0,0 +1,299 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998-2003 Hewlett-Packard Co
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ * Stephane Eranian <eranian@hpl.hp.com>
+ * Copyright (C) 2000, Rohit Seth <rohit.seth@intel.com>
+ * Copyright (C) 1999 VA Linux Systems
+ * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
+ * Copyright (C) 2003 Silicon Graphics, Inc. All rights reserved.
+ *
+ * Routines used by ia64 machines with contiguous (or virtually contiguous)
+ * memory.
+ */
+#include <linux/config.h>
+#include <linux/bootmem.h>
+#include <linux/efi.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+
+#include <asm/meminit.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/sections.h>
+#include <asm/mca.h>
+
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+static unsigned long num_dma_physpages;
+#endif
+
+/**
+ * show_mem - display a memory statistics summary
+ *
+ * Just walks the pages in the system and describes where they're allocated.
+ */
+void
+show_mem (void)
+{
+ int i, total = 0, reserved = 0;
+ int shared = 0, cached = 0;
+
+ printk("Mem-info:\n");
+ show_free_areas();
+
+ printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+ i = max_mapnr;
+ while (i-- > 0) {
+ if (!pfn_valid(i))
+ continue;
+ total++;
+ if (PageReserved(mem_map+i))
+ reserved++;
+ else if (PageSwapCache(mem_map+i))
+ cached++;
+ else if (page_count(mem_map + i))
+ shared += page_count(mem_map + i) - 1;
+ }
+ printk("%d pages of RAM\n", total);
+ printk("%d reserved pages\n", reserved);
+ printk("%d pages shared\n", shared);
+ printk("%d pages swap cached\n", cached);
+ printk("%ld pages in page table cache\n", pgtable_cache_size);
+}
+
+/* physical address where the bootmem map is located */
+unsigned long bootmap_start;
+
+/**
+ * find_max_pfn - adjust the maximum page number callback
+ * @start: start of range
+ * @end: end of range
+ * @arg: address of pointer to global max_pfn variable
+ *
+ * Passed as a callback function to efi_memmap_walk() to determine the highest
+ * available page frame number in the system.
+ */
+int
+find_max_pfn (unsigned long start, unsigned long end, void *arg)
+{
+ unsigned long *max_pfnp = arg, pfn;
+
+ pfn = (PAGE_ALIGN(end - 1) - PAGE_OFFSET) >> PAGE_SHIFT;
+ if (pfn > *max_pfnp)
+ *max_pfnp = pfn;
+ return 0;
+}
+
+/**
+ * find_bootmap_location - callback to find a memory area for the bootmap
+ * @start: start of region
+ * @end: end of region
+ * @arg: unused callback data
+ *
+ * Find a place to put the bootmap and return its starting address in
+ * bootmap_start. This address must be page-aligned.
+ */
+int
+find_bootmap_location (unsigned long start, unsigned long end, void *arg)
+{
+ unsigned long needed = *(unsigned long *)arg;
+ unsigned long range_start, range_end, free_start;
+ int i;
+
+#if IGNORE_PFN0
+ if (start == PAGE_OFFSET) {
+ start += PAGE_SIZE;
+ if (start >= end)
+ return 0;
+ }
+#endif
+
+ free_start = PAGE_OFFSET;
+
+ for (i = 0; i < num_rsvd_regions; i++) {
+ range_start = max(start, free_start);
+ range_end = min(end, rsvd_region[i].start & PAGE_MASK);
+
+ free_start = PAGE_ALIGN(rsvd_region[i].end);
+
+ if (range_end <= range_start)
+ continue; /* skip over empty range */
+
+ if (range_end - range_start >= needed) {
+ bootmap_start = __pa(range_start);
+ return -1; /* done */
+ }
+
+ /* nothing more available in this segment */
+ if (range_end == end)
+ return 0;
+ }
+ return 0;
+}
+
+/**
+ * find_memory - setup memory map
+ *
+ * Walk the EFI memory map and find usable memory for the system, taking
+ * into account reserved areas.
+ */
+void
+find_memory (void)
+{
+ unsigned long bootmap_size;
+
+ reserve_memory();
+
+ /* first find highest page frame number */
+ max_pfn = 0;
+ efi_memmap_walk(find_max_pfn, &max_pfn);
+
+ /* how many bytes to cover all the pages */
+ bootmap_size = bootmem_bootmap_pages(max_pfn) << PAGE_SHIFT;
+
+ /* look for a location to hold the bootmap */
+ bootmap_start = ~0UL;
+ efi_memmap_walk(find_bootmap_location, &bootmap_size);
+ if (bootmap_start == ~0UL)
+ panic("Cannot find %ld bytes for bootmap\n", bootmap_size);
+
+ bootmap_size = init_bootmem(bootmap_start >> PAGE_SHIFT, max_pfn);
+
+ /* Free all available memory, then mark bootmem-map as being in use. */
+ efi_memmap_walk(filter_rsvd_memory, free_bootmem);
+ reserve_bootmem(bootmap_start, bootmap_size);
+
+ find_initrd();
+}
+
+#ifdef CONFIG_SMP
+/**
+ * per_cpu_init - setup per-cpu variables
+ *
+ * Allocate and setup per-cpu data areas.
+ */
+void *
+per_cpu_init (void)
+{
+ void *cpu_data;
+ int cpu;
+
+ /*
+ * get_free_pages() cannot be used before cpu_init() done. BSP
+ * allocates "NR_CPUS" pages for all CPUs to avoid that AP calls
+ * get_zeroed_page().
+ */
+ if (smp_processor_id() == 0) {
+ cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS,
+ PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ memcpy(cpu_data, __phys_per_cpu_start, __per_cpu_end - __per_cpu_start);
+ __per_cpu_offset[cpu] = (char *) cpu_data - __per_cpu_start;
+ cpu_data += PERCPU_PAGE_SIZE;
+ per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu];
+ }
+ }
+ return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
+}
+#endif /* CONFIG_SMP */
+
+static int
+count_pages (u64 start, u64 end, void *arg)
+{
+ unsigned long *count = arg;
+
+ *count += (end - start) >> PAGE_SHIFT;
+ return 0;
+}
+
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+static int
+count_dma_pages (u64 start, u64 end, void *arg)
+{
+ unsigned long *count = arg;
+
+ if (start < MAX_DMA_ADDRESS)
+ *count += (min(end, MAX_DMA_ADDRESS) - start) >> PAGE_SHIFT;
+ return 0;
+}
+#endif
+
+/*
+ * Set up the page tables.
+ */
+
+void
+paging_init (void)
+{
+ unsigned long max_dma;
+ unsigned long zones_size[MAX_NR_ZONES];
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+ unsigned long zholes_size[MAX_NR_ZONES];
+ unsigned long max_gap;
+#endif
+
+ /* initialize mem_map[] */
+
+ memset(zones_size, 0, sizeof(zones_size));
+
+ num_physpages = 0;
+ efi_memmap_walk(count_pages, &num_physpages);
+
+ max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+ memset(zholes_size, 0, sizeof(zholes_size));
+
+ num_dma_physpages = 0;
+ efi_memmap_walk(count_dma_pages, &num_dma_physpages);
+
+ if (max_low_pfn < max_dma) {
+ zones_size[ZONE_DMA] = max_low_pfn;
+ zholes_size[ZONE_DMA] = max_low_pfn - num_dma_physpages;
+ } else {
+ zones_size[ZONE_DMA] = max_dma;
+ zholes_size[ZONE_DMA] = max_dma - num_dma_physpages;
+ if (num_physpages > num_dma_physpages) {
+ zones_size[ZONE_NORMAL] = max_low_pfn - max_dma;
+ zholes_size[ZONE_NORMAL] =
+ ((max_low_pfn - max_dma) -
+ (num_physpages - num_dma_physpages));
+ }
+ }
+
+ max_gap = 0;
+ efi_memmap_walk(find_largest_hole, (u64 *)&max_gap);
+ if (max_gap < LARGE_GAP) {
+ vmem_map = (struct page *) 0;
+ free_area_init_node(0, &contig_page_data, zones_size, 0,
+ zholes_size);
+ } else {
+ unsigned long map_size;
+
+ /* allocate virtual_mem_map */
+
+ map_size = PAGE_ALIGN(max_low_pfn * sizeof(struct page));
+ vmalloc_end -= map_size;
+ vmem_map = (struct page *) vmalloc_end;
+ efi_memmap_walk(create_mem_map_page_table, NULL);
+
+ NODE_DATA(0)->node_mem_map = vmem_map;
+ free_area_init_node(0, &contig_page_data, zones_size,
+ 0, zholes_size);
+
+ printk("Virtual mem_map starts at 0x%p\n", mem_map);
+ }
+#else /* !CONFIG_VIRTUAL_MEM_MAP */
+ if (max_low_pfn < max_dma)
+ zones_size[ZONE_DMA] = max_low_pfn;
+ else {
+ zones_size[ZONE_DMA] = max_dma;
+ zones_size[ZONE_NORMAL] = max_low_pfn - max_dma;
+ }
+ free_area_init(zones_size);
+#endif /* !CONFIG_VIRTUAL_MEM_MAP */
+ zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
+}
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
new file mode 100644
index 000000000000..3456a9b6971e
--- /dev/null
+++ b/arch/ia64/mm/discontig.c
@@ -0,0 +1,737 @@
+/*
+ * Copyright (c) 2000, 2003 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 Tony Luck <tony.luck@intel.com>
+ * Copyright (c) 2002 NEC Corp.
+ * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
+ * Copyright (c) 2004 Silicon Graphics, Inc
+ * Russ Anderson <rja@sgi.com>
+ * Jesse Barnes <jbarnes@sgi.com>
+ * Jack Steiner <steiner@sgi.com>
+ */
+
+/*
+ * Platform initialization for Discontig Memory
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/bootmem.h>
+#include <linux/acpi.h>
+#include <linux/efi.h>
+#include <linux/nodemask.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/meminit.h>
+#include <asm/numa.h>
+#include <asm/sections.h>
+
+/*
+ * Track per-node information needed to setup the boot memory allocator, the
+ * per-node areas, and the real VM.
+ */
+struct early_node_data {
+ struct ia64_node_data *node_data;
+ pg_data_t *pgdat;
+ unsigned long pernode_addr;
+ unsigned long pernode_size;
+ struct bootmem_data bootmem_data;
+ unsigned long num_physpages;
+ unsigned long num_dma_physpages;
+ unsigned long min_pfn;
+ unsigned long max_pfn;
+};
+
+static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
+
+/**
+ * reassign_cpu_only_nodes - called from find_memory to move CPU-only nodes to a memory node
+ *
+ * This function will move nodes with only CPUs (no memory)
+ * to a node with memory which is at the minimum numa_slit distance.
+ * Any reassigments will result in the compression of the nodes
+ * and renumbering the nid values where appropriate.
+ * The static declarations below are to avoid large stack size which
+ * makes the code not re-entrant.
+ */
+static void __init reassign_cpu_only_nodes(void)
+{
+ struct node_memblk_s *p;
+ int i, j, k, nnode, nid, cpu, cpunid, pxm;
+ u8 cslit, slit;
+ static DECLARE_BITMAP(nodes_with_mem, MAX_NUMNODES) __initdata;
+ static u8 numa_slit_fix[MAX_NUMNODES * MAX_NUMNODES] __initdata;
+ static int node_flip[MAX_NUMNODES] __initdata;
+ static int old_nid_map[NR_CPUS] __initdata;
+
+ for (nnode = 0, p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
+ if (!test_bit(p->nid, (void *) nodes_with_mem)) {
+ set_bit(p->nid, (void *) nodes_with_mem);
+ nnode++;
+ }
+
+ /*
+ * All nids with memory.
+ */
+ if (nnode == num_online_nodes())
+ return;
+
+ /*
+ * Change nids and attempt to migrate CPU-only nodes
+ * to the best numa_slit (closest neighbor) possible.
+ * For reassigned CPU nodes a nid can't be arrived at
+ * until after this loop because the target nid's new
+ * identity might not have been established yet. So
+ * new nid values are fabricated above num_online_nodes() and
+ * mapped back later to their true value.
+ */
+ /* MCD - This code is a bit complicated, but may be unnecessary now.
+ * We can now handle much more interesting node-numbering.
+ * The old requirement that 0 <= nid <= numnodes <= MAX_NUMNODES
+ * and that there be no holes in the numbering 0..numnodes
+ * has become simply 0 <= nid <= MAX_NUMNODES.
+ */
+ nid = 0;
+ for_each_online_node(i) {
+ if (test_bit(i, (void *) nodes_with_mem)) {
+ /*
+ * Save original nid value for numa_slit
+ * fixup and node_cpuid reassignments.
+ */
+ node_flip[nid] = i;
+
+ if (i == nid) {
+ nid++;
+ continue;
+ }
+
+ for (p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
+ if (p->nid == i)
+ p->nid = nid;
+
+ cpunid = nid;
+ nid++;
+ } else
+ cpunid = MAX_NUMNODES;
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
+ if (node_cpuid[cpu].nid == i) {
+ /*
+ * For nodes not being reassigned just
+ * fix the cpu's nid and reverse pxm map
+ */
+ if (cpunid < MAX_NUMNODES) {
+ pxm = nid_to_pxm_map[i];
+ pxm_to_nid_map[pxm] =
+ node_cpuid[cpu].nid = cpunid;
+ continue;
+ }
+
+ /*
+ * For nodes being reassigned, find best node by
+ * numa_slit information and then make a temporary
+ * nid value based on current nid and num_online_nodes().
+ */
+ slit = 0xff;
+ k = 2*num_online_nodes();
+ for_each_online_node(j) {
+ if (i == j)
+ continue;
+ else if (test_bit(j, (void *) nodes_with_mem)) {
+ cslit = numa_slit[i * num_online_nodes() + j];
+ if (cslit < slit) {
+ k = num_online_nodes() + j;
+ slit = cslit;
+ }
+ }
+ }
+
+ /* save old nid map so we can update the pxm */
+ old_nid_map[cpu] = node_cpuid[cpu].nid;
+ node_cpuid[cpu].nid = k;
+ }
+ }
+
+ /*
+ * Fixup temporary nid values for CPU-only nodes.
+ */
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
+ if (node_cpuid[cpu].nid == (2*num_online_nodes())) {
+ pxm = nid_to_pxm_map[old_nid_map[cpu]];
+ pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = nnode - 1;
+ } else {
+ for (i = 0; i < nnode; i++) {
+ if (node_flip[i] != (node_cpuid[cpu].nid - num_online_nodes()))
+ continue;
+
+ pxm = nid_to_pxm_map[old_nid_map[cpu]];
+ pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = i;
+ break;
+ }
+ }
+
+ /*
+ * Fix numa_slit by compressing from larger
+ * nid array to reduced nid array.
+ */
+ for (i = 0; i < nnode; i++)
+ for (j = 0; j < nnode; j++)
+ numa_slit_fix[i * nnode + j] =
+ numa_slit[node_flip[i] * num_online_nodes() + node_flip[j]];
+
+ memcpy(numa_slit, numa_slit_fix, sizeof (numa_slit));
+
+ nodes_clear(node_online_map);
+ for (i = 0; i < nnode; i++)
+ node_set_online(i);
+
+ return;
+}
+
+/*
+ * To prevent cache aliasing effects, align per-node structures so that they
+ * start at addresses that are strided by node number.
+ */
+#define NODEDATA_ALIGN(addr, node) \
+ ((((addr) + 1024*1024-1) & ~(1024*1024-1)) + (node)*PERCPU_PAGE_SIZE)
+
+/**
+ * build_node_maps - callback to setup bootmem structs for each node
+ * @start: physical start of range
+ * @len: length of range
+ * @node: node where this range resides
+ *
+ * We allocate a struct bootmem_data for each piece of memory that we wish to
+ * treat as a virtually contiguous block (i.e. each node). Each such block
+ * must start on an %IA64_GRANULE_SIZE boundary, so we round the address down
+ * if necessary. Any non-existent pages will simply be part of the virtual
+ * memmap. We also update min_low_pfn and max_low_pfn here as we receive
+ * memory ranges from the caller.
+ */
+static int __init build_node_maps(unsigned long start, unsigned long len,
+ int node)
+{
+ unsigned long cstart, epfn, end = start + len;
+ struct bootmem_data *bdp = &mem_data[node].bootmem_data;
+
+ epfn = GRANULEROUNDUP(end) >> PAGE_SHIFT;
+ cstart = GRANULEROUNDDOWN(start);
+
+ if (!bdp->node_low_pfn) {
+ bdp->node_boot_start = cstart;
+ bdp->node_low_pfn = epfn;
+ } else {
+ bdp->node_boot_start = min(cstart, bdp->node_boot_start);
+ bdp->node_low_pfn = max(epfn, bdp->node_low_pfn);
+ }
+
+ min_low_pfn = min(min_low_pfn, bdp->node_boot_start>>PAGE_SHIFT);
+ max_low_pfn = max(max_low_pfn, bdp->node_low_pfn);
+
+ return 0;
+}
+
+/**
+ * early_nr_phys_cpus_node - return number of physical cpus on a given node
+ * @node: node to check
+ *
+ * Count the number of physical cpus on @node. These are cpus that actually
+ * exist. We can't use nr_cpus_node() yet because
+ * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
+ * called yet.
+ */
+static int early_nr_phys_cpus_node(int node)
+{
+ int cpu, n = 0;
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
+ if (node == node_cpuid[cpu].nid)
+ if ((cpu == 0) || node_cpuid[cpu].phys_id)
+ n++;
+
+ return n;
+}
+
+
+/**
+ * early_nr_cpus_node - return number of cpus on a given node
+ * @node: node to check
+ *
+ * Count the number of cpus on @node. We can't use nr_cpus_node() yet because
+ * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
+ * called yet. Note that node 0 will also count all non-existent cpus.
+ */
+static int early_nr_cpus_node(int node)
+{
+ int cpu, n = 0;
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
+ if (node == node_cpuid[cpu].nid)
+ n++;
+
+ return n;
+}
+
+/**
+ * find_pernode_space - allocate memory for memory map and per-node structures
+ * @start: physical start of range
+ * @len: length of range
+ * @node: node where this range resides
+ *
+ * This routine reserves space for the per-cpu data struct, the list of
+ * pg_data_ts and the per-node data struct. Each node will have something like
+ * the following in the first chunk of addr. space large enough to hold it.
+ *
+ * ________________________
+ * | |
+ * |~~~~~~~~~~~~~~~~~~~~~~~~| <-- NODEDATA_ALIGN(start, node) for the first
+ * | PERCPU_PAGE_SIZE * | start and length big enough
+ * | cpus_on_this_node | Node 0 will also have entries for all non-existent cpus.
+ * |------------------------|
+ * | local pg_data_t * |
+ * |------------------------|
+ * | local ia64_node_data |
+ * |------------------------|
+ * | ??? |
+ * |________________________|
+ *
+ * Once this space has been set aside, the bootmem maps are initialized. We
+ * could probably move the allocation of the per-cpu and ia64_node_data space
+ * outside of this function and use alloc_bootmem_node(), but doing it here
+ * is straightforward and we get the alignments we want so...
+ */
+static int __init find_pernode_space(unsigned long start, unsigned long len,
+ int node)
+{
+ unsigned long epfn, cpu, cpus, phys_cpus;
+ unsigned long pernodesize = 0, pernode, pages, mapsize;
+ void *cpu_data;
+ struct bootmem_data *bdp = &mem_data[node].bootmem_data;
+
+ epfn = (start + len) >> PAGE_SHIFT;
+
+ pages = bdp->node_low_pfn - (bdp->node_boot_start >> PAGE_SHIFT);
+ mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT;
+
+ /*
+ * Make sure this memory falls within this node's usable memory
+ * since we may have thrown some away in build_maps().
+ */
+ if (start < bdp->node_boot_start || epfn > bdp->node_low_pfn)
+ return 0;
+
+ /* Don't setup this node's local space twice... */
+ if (mem_data[node].pernode_addr)
+ return 0;
+
+ /*
+ * Calculate total size needed, incl. what's necessary
+ * for good alignment and alias prevention.
+ */
+ cpus = early_nr_cpus_node(node);
+ phys_cpus = early_nr_phys_cpus_node(node);
+ pernodesize += PERCPU_PAGE_SIZE * cpus;
+ pernodesize += node * L1_CACHE_BYTES;
+ pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
+ pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
+ pernodesize = PAGE_ALIGN(pernodesize);
+ pernode = NODEDATA_ALIGN(start, node);
+
+ /* Is this range big enough for what we want to store here? */
+ if (start + len > (pernode + pernodesize + mapsize)) {
+ mem_data[node].pernode_addr = pernode;
+ mem_data[node].pernode_size = pernodesize;
+ memset(__va(pernode), 0, pernodesize);
+
+ cpu_data = (void *)pernode;
+ pernode += PERCPU_PAGE_SIZE * cpus;
+ pernode += node * L1_CACHE_BYTES;
+
+ mem_data[node].pgdat = __va(pernode);
+ pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
+
+ mem_data[node].node_data = __va(pernode);
+ pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
+
+ mem_data[node].pgdat->bdata = bdp;
+ pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
+
+ /*
+ * Copy the static per-cpu data into the region we
+ * just set aside and then setup __per_cpu_offset
+ * for each CPU on this node.
+ */
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ if (node == node_cpuid[cpu].nid) {
+ memcpy(__va(cpu_data), __phys_per_cpu_start,
+ __per_cpu_end - __per_cpu_start);
+ __per_cpu_offset[cpu] = (char*)__va(cpu_data) -
+ __per_cpu_start;
+ cpu_data += PERCPU_PAGE_SIZE;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * free_node_bootmem - free bootmem allocator memory for use
+ * @start: physical start of range
+ * @len: length of range
+ * @node: node where this range resides
+ *
+ * Simply calls the bootmem allocator to free the specified ranged from
+ * the given pg_data_t's bdata struct. After this function has been called
+ * for all the entries in the EFI memory map, the bootmem allocator will
+ * be ready to service allocation requests.
+ */
+static int __init free_node_bootmem(unsigned long start, unsigned long len,
+ int node)
+{
+ free_bootmem_node(mem_data[node].pgdat, start, len);
+
+ return 0;
+}
+
+/**
+ * reserve_pernode_space - reserve memory for per-node space
+ *
+ * Reserve the space used by the bootmem maps & per-node space in the boot
+ * allocator so that when we actually create the real mem maps we don't
+ * use their memory.
+ */
+static void __init reserve_pernode_space(void)
+{
+ unsigned long base, size, pages;
+ struct bootmem_data *bdp;
+ int node;
+
+ for_each_online_node(node) {
+ pg_data_t *pdp = mem_data[node].pgdat;
+
+ bdp = pdp->bdata;
+
+ /* First the bootmem_map itself */
+ pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT);
+ size = bootmem_bootmap_pages(pages) << PAGE_SHIFT;
+ base = __pa(bdp->node_bootmem_map);
+ reserve_bootmem_node(pdp, base, size);
+
+ /* Now the per-node space */
+ size = mem_data[node].pernode_size;
+ base = __pa(mem_data[node].pernode_addr);
+ reserve_bootmem_node(pdp, base, size);
+ }
+}
+
+/**
+ * initialize_pernode_data - fixup per-cpu & per-node pointers
+ *
+ * Each node's per-node area has a copy of the global pg_data_t list, so
+ * we copy that to each node here, as well as setting the per-cpu pointer
+ * to the local node data structure. The active_cpus field of the per-node
+ * structure gets setup by the platform_cpu_init() function later.
+ */
+static void __init initialize_pernode_data(void)
+{
+ int cpu, node;
+ pg_data_t *pgdat_list[MAX_NUMNODES];
+
+ for_each_online_node(node)
+ pgdat_list[node] = mem_data[node].pgdat;
+
+ /* Copy the pg_data_t list to each node and init the node field */
+ for_each_online_node(node) {
+ memcpy(mem_data[node].node_data->pg_data_ptrs, pgdat_list,
+ sizeof(pgdat_list));
+ }
+
+ /* Set the node_data pointer for each per-cpu struct */
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ node = node_cpuid[cpu].nid;
+ per_cpu(cpu_info, cpu).node_data = mem_data[node].node_data;
+ }
+}
+
+/**
+ * find_memory - walk the EFI memory map and setup the bootmem allocator
+ *
+ * Called early in boot to setup the bootmem allocator, and to
+ * allocate the per-cpu and per-node structures.
+ */
+void __init find_memory(void)
+{
+ int node;
+
+ reserve_memory();
+
+ if (num_online_nodes() == 0) {
+ printk(KERN_ERR "node info missing!\n");
+ node_set_online(0);
+ }
+
+ min_low_pfn = -1;
+ max_low_pfn = 0;
+
+ if (num_online_nodes() > 1)
+ reassign_cpu_only_nodes();
+
+ /* These actually end up getting called by call_pernode_memory() */
+ efi_memmap_walk(filter_rsvd_memory, build_node_maps);
+ efi_memmap_walk(filter_rsvd_memory, find_pernode_space);
+
+ /*
+ * Initialize the boot memory maps in reverse order since that's
+ * what the bootmem allocator expects
+ */
+ for (node = MAX_NUMNODES - 1; node >= 0; node--) {
+ unsigned long pernode, pernodesize, map;
+ struct bootmem_data *bdp;
+
+ if (!node_online(node))
+ continue;
+
+ bdp = &mem_data[node].bootmem_data;
+ pernode = mem_data[node].pernode_addr;
+ pernodesize = mem_data[node].pernode_size;
+ map = pernode + pernodesize;
+
+ /* Sanity check... */
+ if (!pernode)
+ panic("pernode space for node %d "
+ "could not be allocated!", node);
+
+ init_bootmem_node(mem_data[node].pgdat,
+ map>>PAGE_SHIFT,
+ bdp->node_boot_start>>PAGE_SHIFT,
+ bdp->node_low_pfn);
+ }
+
+ efi_memmap_walk(filter_rsvd_memory, free_node_bootmem);
+
+ reserve_pernode_space();
+ initialize_pernode_data();
+
+ max_pfn = max_low_pfn;
+
+ find_initrd();
+}
+
+/**
+ * per_cpu_init - setup per-cpu variables
+ *
+ * find_pernode_space() does most of this already, we just need to set
+ * local_per_cpu_offset
+ */
+void *per_cpu_init(void)
+{
+ int cpu;
+
+ if (smp_processor_id() == 0) {
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ per_cpu(local_per_cpu_offset, cpu) =
+ __per_cpu_offset[cpu];
+ }
+ }
+
+ return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
+}
+
+/**
+ * show_mem - give short summary of memory stats
+ *
+ * Shows a simple page count of reserved and used pages in the system.
+ * For discontig machines, it does this on a per-pgdat basis.
+ */
+void show_mem(void)
+{
+ int i, total_reserved = 0;
+ int total_shared = 0, total_cached = 0;
+ unsigned long total_present = 0;
+ pg_data_t *pgdat;
+
+ printk("Mem-info:\n");
+ show_free_areas();
+ printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+ for_each_pgdat(pgdat) {
+ unsigned long present = pgdat->node_present_pages;
+ int shared = 0, cached = 0, reserved = 0;
+ printk("Node ID: %d\n", pgdat->node_id);
+ for(i = 0; i < pgdat->node_spanned_pages; i++) {
+ if (!ia64_pfn_valid(pgdat->node_start_pfn+i))
+ continue;
+ if (PageReserved(pgdat->node_mem_map+i))
+ reserved++;
+ else if (PageSwapCache(pgdat->node_mem_map+i))
+ cached++;
+ else if (page_count(pgdat->node_mem_map+i))
+ shared += page_count(pgdat->node_mem_map+i)-1;
+ }
+ total_present += present;
+ total_reserved += reserved;
+ total_cached += cached;
+ total_shared += shared;
+ printk("\t%ld pages of RAM\n", present);
+ printk("\t%d reserved pages\n", reserved);
+ printk("\t%d pages shared\n", shared);
+ printk("\t%d pages swap cached\n", cached);
+ }
+ printk("%ld pages of RAM\n", total_present);
+ printk("%d reserved pages\n", total_reserved);
+ printk("%d pages shared\n", total_shared);
+ printk("%d pages swap cached\n", total_cached);
+ printk("Total of %ld pages in page table cache\n", pgtable_cache_size);
+ printk("%d free buffer pages\n", nr_free_buffer_pages());
+}
+
+/**
+ * call_pernode_memory - use SRAT to call callback functions with node info
+ * @start: physical start of range
+ * @len: length of range
+ * @arg: function to call for each range
+ *
+ * efi_memmap_walk() knows nothing about layout of memory across nodes. Find
+ * out to which node a block of memory belongs. Ignore memory that we cannot
+ * identify, and split blocks that run across multiple nodes.
+ *
+ * Take this opportunity to round the start address up and the end address
+ * down to page boundaries.
+ */
+void call_pernode_memory(unsigned long start, unsigned long len, void *arg)
+{
+ unsigned long rs, re, end = start + len;
+ void (*func)(unsigned long, unsigned long, int);
+ int i;
+
+ start = PAGE_ALIGN(start);
+ end &= PAGE_MASK;
+ if (start >= end)
+ return;
+
+ func = arg;
+
+ if (!num_node_memblks) {
+ /* No SRAT table, so assume one node (node 0) */
+ if (start < end)
+ (*func)(start, end - start, 0);
+ return;
+ }
+
+ for (i = 0; i < num_node_memblks; i++) {
+ rs = max(start, node_memblk[i].start_paddr);
+ re = min(end, node_memblk[i].start_paddr +
+ node_memblk[i].size);
+
+ if (rs < re)
+ (*func)(rs, re - rs, node_memblk[i].nid);
+
+ if (re == end)
+ break;
+ }
+}
+
+/**
+ * count_node_pages - callback to build per-node memory info structures
+ * @start: physical start of range
+ * @len: length of range
+ * @node: node where this range resides
+ *
+ * Each node has it's own number of physical pages, DMAable pages, start, and
+ * end page frame number. This routine will be called by call_pernode_memory()
+ * for each piece of usable memory and will setup these values for each node.
+ * Very similar to build_maps().
+ */
+static __init int count_node_pages(unsigned long start, unsigned long len, int node)
+{
+ unsigned long end = start + len;
+
+ mem_data[node].num_physpages += len >> PAGE_SHIFT;
+ if (start <= __pa(MAX_DMA_ADDRESS))
+ mem_data[node].num_dma_physpages +=
+ (min(end, __pa(MAX_DMA_ADDRESS)) - start) >>PAGE_SHIFT;
+ start = GRANULEROUNDDOWN(start);
+ start = ORDERROUNDDOWN(start);
+ end = GRANULEROUNDUP(end);
+ mem_data[node].max_pfn = max(mem_data[node].max_pfn,
+ end >> PAGE_SHIFT);
+ mem_data[node].min_pfn = min(mem_data[node].min_pfn,
+ start >> PAGE_SHIFT);
+
+ return 0;
+}
+
+/**
+ * paging_init - setup page tables
+ *
+ * paging_init() sets up the page tables for each node of the system and frees
+ * the bootmem allocator memory for general use.
+ */
+void __init paging_init(void)
+{
+ unsigned long max_dma;
+ unsigned long zones_size[MAX_NR_ZONES];
+ unsigned long zholes_size[MAX_NR_ZONES];
+ unsigned long pfn_offset = 0;
+ int node;
+
+ max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+
+ /* so min() will work in count_node_pages */
+ for_each_online_node(node)
+ mem_data[node].min_pfn = ~0UL;
+
+ efi_memmap_walk(filter_rsvd_memory, count_node_pages);
+
+ for_each_online_node(node) {
+ memset(zones_size, 0, sizeof(zones_size));
+ memset(zholes_size, 0, sizeof(zholes_size));
+
+ num_physpages += mem_data[node].num_physpages;
+
+ if (mem_data[node].min_pfn >= max_dma) {
+ /* All of this node's memory is above ZONE_DMA */
+ zones_size[ZONE_NORMAL] = mem_data[node].max_pfn -
+ mem_data[node].min_pfn;
+ zholes_size[ZONE_NORMAL] = mem_data[node].max_pfn -
+ mem_data[node].min_pfn -
+ mem_data[node].num_physpages;
+ } else if (mem_data[node].max_pfn < max_dma) {
+ /* All of this node's memory is in ZONE_DMA */
+ zones_size[ZONE_DMA] = mem_data[node].max_pfn -
+ mem_data[node].min_pfn;
+ zholes_size[ZONE_DMA] = mem_data[node].max_pfn -
+ mem_data[node].min_pfn -
+ mem_data[node].num_dma_physpages;
+ } else {
+ /* This node has memory in both zones */
+ zones_size[ZONE_DMA] = max_dma -
+ mem_data[node].min_pfn;
+ zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
+ mem_data[node].num_dma_physpages;
+ zones_size[ZONE_NORMAL] = mem_data[node].max_pfn -
+ max_dma;
+ zholes_size[ZONE_NORMAL] = zones_size[ZONE_NORMAL] -
+ (mem_data[node].num_physpages -
+ mem_data[node].num_dma_physpages);
+ }
+
+ if (node == 0) {
+ vmalloc_end -=
+ PAGE_ALIGN(max_low_pfn * sizeof(struct page));
+ vmem_map = (struct page *) vmalloc_end;
+
+ efi_memmap_walk(create_mem_map_page_table, NULL);
+ printk("Virtual mem_map starts at 0x%p\n", vmem_map);
+ }
+
+ pfn_offset = mem_data[node].min_pfn;
+
+ NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset;
+ free_area_init_node(node, NODE_DATA(node), zones_size,
+ pfn_offset, zholes_size);
+ }
+
+ zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
+}
diff --git a/arch/ia64/mm/extable.c b/arch/ia64/mm/extable.c
new file mode 100644
index 000000000000..6d259e34f359
--- /dev/null
+++ b/arch/ia64/mm/extable.c
@@ -0,0 +1,90 @@
+/*
+ * Kernel exception handling table support. Derived from arch/alpha/mm/extable.c.
+ *
+ * Copyright (C) 1998, 1999, 2001-2002, 2004 Hewlett-Packard Co
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+
+#include <linux/config.h>
+#include <linux/sort.h>
+
+#include <asm/uaccess.h>
+#include <asm/module.h>
+
+static int cmp_ex(const void *a, const void *b)
+{
+ const struct exception_table_entry *l = a, *r = b;
+ u64 lip = (u64) &l->addr + l->addr;
+ u64 rip = (u64) &r->addr + r->addr;
+
+ /* avoid overflow */
+ if (lip > rip)
+ return 1;
+ if (lip < rip)
+ return -1;
+ return 0;
+}
+
+static void swap_ex(void *a, void *b, int size)
+{
+ struct exception_table_entry *l = a, *r = b, tmp;
+ u64 delta = (u64) r - (u64) l;
+
+ tmp = *l;
+ l->addr = r->addr + delta;
+ l->cont = r->cont + delta;
+ r->addr = tmp.addr - delta;
+ r->cont = tmp.cont - delta;
+}
+
+/*
+ * Sort the exception table. It's usually already sorted, but there
+ * may be unordered entries due to multiple text sections (such as the
+ * .init text section). Note that the exception-table-entries contain
+ * location-relative addresses, which requires a bit of care during
+ * sorting to avoid overflows in the offset members (e.g., it would
+ * not be safe to make a temporary copy of an exception-table entry on
+ * the stack, because the stack may be more than 2GB away from the
+ * exception-table).
+ */
+void sort_extable (struct exception_table_entry *start,
+ struct exception_table_entry *finish)
+{
+ sort(start, finish - start, sizeof(struct exception_table_entry),
+ cmp_ex, swap_ex);
+}
+
+const struct exception_table_entry *
+search_extable (const struct exception_table_entry *first,
+ const struct exception_table_entry *last,
+ unsigned long ip)
+{
+ const struct exception_table_entry *mid;
+ unsigned long mid_ip;
+ long diff;
+
+ while (first <= last) {
+ mid = &first[(last - first)/2];
+ mid_ip = (u64) &mid->addr + mid->addr;
+ diff = mid_ip - ip;
+ if (diff == 0)
+ return mid;
+ else if (diff < 0)
+ first = mid + 1;
+ else
+ last = mid - 1;
+ }
+ return NULL;
+}
+
+void
+ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e)
+{
+ long fix = (u64) &e->cont + e->cont;
+
+ regs->r8 = -EFAULT;
+ if (fix & 4)
+ regs->r9 = 0;
+ regs->cr_iip = fix & ~0xf;
+ ia64_psr(regs)->ri = fix & 0x3; /* set continuation slot number */
+}
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
new file mode 100644
index 000000000000..da859125aaef
--- /dev/null
+++ b/arch/ia64/mm/fault.c
@@ -0,0 +1,261 @@
+/*
+ * MMU fault handling support.
+ *
+ * Copyright (C) 1998-2002 Hewlett-Packard Co
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+extern void die (char *, struct pt_regs *, long);
+
+/*
+ * This routine is analogous to expand_stack() but instead grows the
+ * register backing store (which grows towards higher addresses).
+ * Since the register backing store is access sequentially, we
+ * disallow growing the RBS by more than a page at a time. Note that
+ * the VM_GROWSUP flag can be set on any VM area but that's fine
+ * because the total process size is still limited by RLIMIT_STACK and
+ * RLIMIT_AS.
+ */
+static inline long
+expand_backing_store (struct vm_area_struct *vma, unsigned long address)
+{
+ unsigned long grow;
+
+ grow = PAGE_SIZE >> PAGE_SHIFT;
+ if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur
+ || (((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->signal->rlim[RLIMIT_AS].rlim_cur))
+ return -ENOMEM;
+ vma->vm_end += PAGE_SIZE;
+ vma->vm_mm->total_vm += grow;
+ if (vma->vm_flags & VM_LOCKED)
+ vma->vm_mm->locked_vm += grow;
+ __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow);
+ return 0;
+}
+
+/*
+ * Return TRUE if ADDRESS points at a page in the kernel's mapped segment
+ * (inside region 5, on ia64) and that page is present.
+ */
+static int
+mapped_kernel_page_is_present (unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep, pte;
+
+ pgd = pgd_offset_k(address);
+ if (pgd_none(*pgd) || pgd_bad(*pgd))
+ return 0;
+
+ pud = pud_offset(pgd, address);
+ if (pud_none(*pud) || pud_bad(*pud))
+ return 0;
+
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd) || pmd_bad(*pmd))
+ return 0;
+
+ ptep = pte_offset_kernel(pmd, address);
+ if (!ptep)
+ return 0;
+
+ pte = *ptep;
+ return pte_present(pte);
+}
+
+void
+ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *regs)
+{
+ int signal = SIGSEGV, code = SEGV_MAPERR;
+ struct vm_area_struct *vma, *prev_vma;
+ struct mm_struct *mm = current->mm;
+ struct siginfo si;
+ unsigned long mask;
+
+ /*
+ * If we're in an interrupt or have no user context, we must not take the fault..
+ */
+ if (in_atomic() || !mm)
+ goto no_context;
+
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+ /*
+ * If fault is in region 5 and we are in the kernel, we may already
+ * have the mmap_sem (pfn_valid macro is called during mmap). There
+ * is no vma for region 5 addr's anyway, so skip getting the semaphore
+ * and go directly to the exception handling code.
+ */
+
+ if ((REGION_NUMBER(address) == 5) && !user_mode(regs))
+ goto bad_area_no_up;
+#endif
+
+ down_read(&mm->mmap_sem);
+
+ vma = find_vma_prev(mm, address, &prev_vma);
+ if (!vma)
+ goto bad_area;
+
+ /* find_vma_prev() returns vma such that address < vma->vm_end or NULL */
+ if (address < vma->vm_start)
+ goto check_expansion;
+
+ good_area:
+ code = SEGV_ACCERR;
+
+ /* OK, we've got a good vm_area for this memory area. Check the access permissions: */
+
+# define VM_READ_BIT 0
+# define VM_WRITE_BIT 1
+# define VM_EXEC_BIT 2
+
+# if (((1 << VM_READ_BIT) != VM_READ || (1 << VM_WRITE_BIT) != VM_WRITE) \
+ || (1 << VM_EXEC_BIT) != VM_EXEC)
+# error File is out of sync with <linux/mm.h>. Please update.
+# endif
+
+ mask = ( (((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT)
+ | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT)
+ | (((isr >> IA64_ISR_R_BIT) & 1UL) << VM_READ_BIT));
+
+ if ((vma->vm_flags & mask) != mask)
+ goto bad_area;
+
+ survive:
+ /*
+ * If for any reason at all we couldn't handle the fault, make
+ * sure we exit gracefully rather than endlessly redo the
+ * fault.
+ */
+ switch (handle_mm_fault(mm, vma, address, (mask & VM_WRITE) != 0)) {
+ case VM_FAULT_MINOR:
+ ++current->min_flt;
+ break;
+ case VM_FAULT_MAJOR:
+ ++current->maj_flt;
+ break;
+ case VM_FAULT_SIGBUS:
+ /*
+ * We ran out of memory, or some other thing happened
+ * to us that made us unable to handle the page fault
+ * gracefully.
+ */
+ signal = SIGBUS;
+ goto bad_area;
+ case VM_FAULT_OOM:
+ goto out_of_memory;
+ default:
+ BUG();
+ }
+ up_read(&mm->mmap_sem);
+ return;
+
+ check_expansion:
+ if (!(prev_vma && (prev_vma->vm_flags & VM_GROWSUP) && (address == prev_vma->vm_end))) {
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto bad_area;
+ if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
+ || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
+ goto bad_area;
+ if (expand_stack(vma, address))
+ goto bad_area;
+ } else {
+ vma = prev_vma;
+ if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
+ || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
+ goto bad_area;
+ if (expand_backing_store(vma, address))
+ goto bad_area;
+ }
+ goto good_area;
+
+ bad_area:
+ up_read(&mm->mmap_sem);
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+ bad_area_no_up:
+#endif
+ if ((isr & IA64_ISR_SP)
+ || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH))
+ {
+ /*
+ * This fault was due to a speculative load or lfetch.fault, set the "ed"
+ * bit in the psr to ensure forward progress. (Target register will get a
+ * NaT for ld.s, lfetch will be canceled.)
+ */
+ ia64_psr(regs)->ed = 1;
+ return;
+ }
+ if (user_mode(regs)) {
+ si.si_signo = signal;
+ si.si_errno = 0;
+ si.si_code = code;
+ si.si_addr = (void __user *) address;
+ si.si_isr = isr;
+ si.si_flags = __ISR_VALID;
+ force_sig_info(signal, &si, current);
+ return;
+ }
+
+ no_context:
+ if (isr & IA64_ISR_SP) {
+ /*
+ * This fault was due to a speculative load set the "ed" bit in the psr to
+ * ensure forward progress (target register will get a NaT).
+ */
+ ia64_psr(regs)->ed = 1;
+ return;
+ }
+
+ if (ia64_done_with_exception(regs))
+ return;
+
+ /*
+ * Since we have no vma's for region 5, we might get here even if the address is
+ * valid, due to the VHPT walker inserting a non present translation that becomes
+ * stale. If that happens, the non present fault handler already purged the stale
+ * translation, which fixed the problem. So, we check to see if the translation is
+ * valid, and return if it is.
+ */
+ if (REGION_NUMBER(address) == 5 && mapped_kernel_page_is_present(address))
+ return;
+
+ /*
+ * Oops. The kernel tried to access some bad page. We'll have to terminate things
+ * with extreme prejudice.
+ */
+ bust_spinlocks(1);
+
+ if (address < PAGE_SIZE)
+ printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference (address %016lx)\n", address);
+ else
+ printk(KERN_ALERT "Unable to handle kernel paging request at "
+ "virtual address %016lx\n", address);
+ die("Oops", regs, isr);
+ bust_spinlocks(0);
+ do_exit(SIGKILL);
+ return;
+
+ out_of_memory:
+ up_read(&mm->mmap_sem);
+ if (current->pid == 1) {
+ yield();
+ down_read(&mm->mmap_sem);
+ goto survive;
+ }
+ printk(KERN_CRIT "VM: killing process %s\n", current->comm);
+ if (user_mode(regs))
+ do_exit(SIGKILL);
+ goto no_context;
+}
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
new file mode 100644
index 000000000000..40ad8328ffd5
--- /dev/null
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -0,0 +1,357 @@
+/*
+ * IA-64 Huge TLB Page Support for Kernel.
+ *
+ * Copyright (C) 2002-2004 Rohit Seth <rohit.seth@intel.com>
+ * Copyright (C) 2003-2004 Ken Chen <kenneth.w.chen@intel.com>
+ *
+ * Sep, 2003: add numa support
+ * Feb, 2004: dynamic hugetlb page size via boot parameter
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <asm/mman.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+
+unsigned int hpage_shift=HPAGE_SHIFT_DEFAULT;
+
+static pte_t *
+huge_pte_alloc (struct mm_struct *mm, unsigned long addr)
+{
+ unsigned long taddr = htlbpage_to_page(addr);
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte = NULL;
+
+ pgd = pgd_offset(mm, taddr);
+ pud = pud_alloc(mm, pgd, taddr);
+ if (pud) {
+ pmd = pmd_alloc(mm, pud, taddr);
+ if (pmd)
+ pte = pte_alloc_map(mm, pmd, taddr);
+ }
+ return pte;
+}
+
+static pte_t *
+huge_pte_offset (struct mm_struct *mm, unsigned long addr)
+{
+ unsigned long taddr = htlbpage_to_page(addr);
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte = NULL;
+
+ pgd = pgd_offset(mm, taddr);
+ if (pgd_present(*pgd)) {
+ pud = pud_offset(pgd, taddr);
+ if (pud_present(*pud)) {
+ pmd = pmd_offset(pud, taddr);
+ if (pmd_present(*pmd))
+ pte = pte_offset_map(pmd, taddr);
+ }
+ }
+
+ return pte;
+}
+
+#define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; }
+
+static void
+set_huge_pte (struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page *page, pte_t * page_table, int write_access)
+{
+ pte_t entry;
+
+ add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
+ if (write_access) {
+ entry =
+ pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+ } else
+ entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+ entry = pte_mkyoung(entry);
+ mk_pte_huge(entry);
+ set_pte(page_table, entry);
+ return;
+}
+/*
+ * This function checks for proper alignment of input addr and len parameters.
+ */
+int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
+{
+ if (len & ~HPAGE_MASK)
+ return -EINVAL;
+ if (addr & ~HPAGE_MASK)
+ return -EINVAL;
+ if (REGION_NUMBER(addr) != REGION_HPAGE)
+ return -EINVAL;
+
+ return 0;
+}
+
+int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
+ struct vm_area_struct *vma)
+{
+ pte_t *src_pte, *dst_pte, entry;
+ struct page *ptepage;
+ unsigned long addr = vma->vm_start;
+ unsigned long end = vma->vm_end;
+
+ while (addr < end) {
+ dst_pte = huge_pte_alloc(dst, addr);
+ if (!dst_pte)
+ goto nomem;
+ src_pte = huge_pte_offset(src, addr);
+ entry = *src_pte;
+ ptepage = pte_page(entry);
+ get_page(ptepage);
+ set_pte(dst_pte, entry);
+ add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
+ addr += HPAGE_SIZE;
+ }
+ return 0;
+nomem:
+ return -ENOMEM;
+}
+
+int
+follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page **pages, struct vm_area_struct **vmas,
+ unsigned long *st, int *length, int i)
+{
+ pte_t *ptep, pte;
+ unsigned long start = *st;
+ unsigned long pstart;
+ int len = *length;
+ struct page *page;
+
+ do {
+ pstart = start & HPAGE_MASK;
+ ptep = huge_pte_offset(mm, start);
+ pte = *ptep;
+
+back1:
+ page = pte_page(pte);
+ if (pages) {
+ page += ((start & ~HPAGE_MASK) >> PAGE_SHIFT);
+ get_page(page);
+ pages[i] = page;
+ }
+ if (vmas)
+ vmas[i] = vma;
+ i++;
+ len--;
+ start += PAGE_SIZE;
+ if (((start & HPAGE_MASK) == pstart) && len &&
+ (start < vma->vm_end))
+ goto back1;
+ } while (len && start < vma->vm_end);
+ *length = len;
+ *st = start;
+ return i;
+}
+
+struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write)
+{
+ struct page *page;
+ pte_t *ptep;
+
+ if (REGION_NUMBER(addr) != REGION_HPAGE)
+ return ERR_PTR(-EINVAL);
+
+ ptep = huge_pte_offset(mm, addr);
+ if (!ptep || pte_none(*ptep))
+ return NULL;
+ page = pte_page(*ptep);
+ page += ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
+ return page;
+}
+int pmd_huge(pmd_t pmd)
+{
+ return 0;
+}
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write)
+{
+ return NULL;
+}
+
+/*
+ * Same as generic free_pgtables(), except constant PGDIR_* and pgd_offset
+ * are hugetlb region specific.
+ */
+void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
+ unsigned long start, unsigned long end)
+{
+ unsigned long first = start & HUGETLB_PGDIR_MASK;
+ unsigned long last = end + HUGETLB_PGDIR_SIZE - 1;
+ struct mm_struct *mm = tlb->mm;
+
+ if (!prev) {
+ prev = mm->mmap;
+ if (!prev)
+ goto no_mmaps;
+ if (prev->vm_end > start) {
+ if (last > prev->vm_start)
+ last = prev->vm_start;
+ goto no_mmaps;
+ }
+ }
+ for (;;) {
+ struct vm_area_struct *next = prev->vm_next;
+
+ if (next) {
+ if (next->vm_start < start) {
+ prev = next;
+ continue;
+ }
+ if (last > next->vm_start)
+ last = next->vm_start;
+ }
+ if (prev->vm_end > first)
+ first = prev->vm_end;
+ break;
+ }
+no_mmaps:
+ if (last < first) /* for arches with discontiguous pgd indices */
+ return;
+ clear_page_range(tlb, first, last);
+}
+
+void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long address;
+ pte_t *pte;
+ struct page *page;
+
+ BUG_ON(start & (HPAGE_SIZE - 1));
+ BUG_ON(end & (HPAGE_SIZE - 1));
+
+ for (address = start; address < end; address += HPAGE_SIZE) {
+ pte = huge_pte_offset(mm, address);
+ if (pte_none(*pte))
+ continue;
+ page = pte_page(*pte);
+ put_page(page);
+ pte_clear(mm, address, pte);
+ }
+ add_mm_counter(mm, rss, - ((end - start) >> PAGE_SHIFT));
+ flush_tlb_range(vma, start, end);
+}
+
+int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr;
+ int ret = 0;
+
+ BUG_ON(vma->vm_start & ~HPAGE_MASK);
+ BUG_ON(vma->vm_end & ~HPAGE_MASK);
+
+ spin_lock(&mm->page_table_lock);
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+ unsigned long idx;
+ pte_t *pte = huge_pte_alloc(mm, addr);
+ struct page *page;
+
+ if (!pte) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (!pte_none(*pte))
+ continue;
+
+ idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+ page = find_get_page(mapping, idx);
+ if (!page) {
+ /* charge the fs quota first */
+ if (hugetlb_get_quota(mapping)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ page = alloc_huge_page();
+ if (!page) {
+ hugetlb_put_quota(mapping);
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
+ if (! ret) {
+ unlock_page(page);
+ } else {
+ hugetlb_put_quota(mapping);
+ page_cache_release(page);
+ goto out;
+ }
+ }
+ set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
+ }
+out:
+ spin_unlock(&mm->page_table_lock);
+ return ret;
+}
+
+unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ struct vm_area_struct *vmm;
+
+ if (len > RGN_MAP_LIMIT)
+ return -ENOMEM;
+ if (len & ~HPAGE_MASK)
+ return -EINVAL;
+ /* This code assumes that REGION_HPAGE != 0. */
+ if ((REGION_NUMBER(addr) != REGION_HPAGE) || (addr & (HPAGE_SIZE - 1)))
+ addr = HPAGE_REGION_BASE;
+ else
+ addr = ALIGN(addr, HPAGE_SIZE);
+ for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
+ /* At this point: (!vmm || addr < vmm->vm_end). */
+ if (REGION_OFFSET(addr) + len > RGN_MAP_LIMIT)
+ return -ENOMEM;
+ if (!vmm || (addr + len) <= vmm->vm_start)
+ return addr;
+ addr = ALIGN(vmm->vm_end, HPAGE_SIZE);
+ }
+}
+
+static int __init hugetlb_setup_sz(char *str)
+{
+ u64 tr_pages;
+ unsigned long long size;
+
+ if (ia64_pal_vm_page_size(&tr_pages, NULL) != 0)
+ /*
+ * shouldn't happen, but just in case.
+ */
+ tr_pages = 0x15557000UL;
+
+ size = memparse(str, &str);
+ if (*str || (size & (size-1)) || !(tr_pages & size) ||
+ size <= PAGE_SIZE ||
+ size >= (1UL << PAGE_SHIFT << MAX_ORDER)) {
+ printk(KERN_WARNING "Invalid huge page size specified\n");
+ return 1;
+ }
+
+ hpage_shift = __ffs(size);
+ /*
+ * boot cpu already executed ia64_mmu_init, and has HPAGE_SHIFT_DEFAULT
+ * override here with new page shift.
+ */
+ ia64_set_rr(HPAGE_REGION_BASE, hpage_shift << 2);
+ return 1;
+}
+__setup("hugepagesz=", hugetlb_setup_sz);
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
new file mode 100644
index 000000000000..65cf839573ea
--- /dev/null
+++ b/arch/ia64/mm/init.c
@@ -0,0 +1,597 @@
+/*
+ * Initialize MMU support.
+ *
+ * Copyright (C) 1998-2003 Hewlett-Packard Co
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+
+#include <linux/bootmem.h>
+#include <linux/efi.h>
+#include <linux/elf.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+#include <linux/personality.h>
+#include <linux/reboot.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/proc_fs.h>
+#include <linux/bitops.h>
+
+#include <asm/a.out.h>
+#include <asm/dma.h>
+#include <asm/ia32.h>
+#include <asm/io.h>
+#include <asm/machvec.h>
+#include <asm/numa.h>
+#include <asm/patch.h>
+#include <asm/pgalloc.h>
+#include <asm/sal.h>
+#include <asm/sections.h>
+#include <asm/system.h>
+#include <asm/tlb.h>
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/mca.h>
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+
+extern void ia64_tlb_init (void);
+
+unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL;
+
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+unsigned long vmalloc_end = VMALLOC_END_INIT;
+EXPORT_SYMBOL(vmalloc_end);
+struct page *vmem_map;
+EXPORT_SYMBOL(vmem_map);
+#endif
+
+static int pgt_cache_water[2] = { 25, 50 };
+
+struct page *zero_page_memmap_ptr; /* map entry for zero page */
+EXPORT_SYMBOL(zero_page_memmap_ptr);
+
+void
+check_pgt_cache (void)
+{
+ int low, high;
+
+ low = pgt_cache_water[0];
+ high = pgt_cache_water[1];
+
+ preempt_disable();
+ if (pgtable_cache_size > (u64) high) {
+ do {
+ if (pgd_quicklist)
+ free_page((unsigned long)pgd_alloc_one_fast(NULL));
+ if (pmd_quicklist)
+ free_page((unsigned long)pmd_alloc_one_fast(NULL, 0));
+ } while (pgtable_cache_size > (u64) low);
+ }
+ preempt_enable();
+}
+
+void
+lazy_mmu_prot_update (pte_t pte)
+{
+ unsigned long addr;
+ struct page *page;
+
+ if (!pte_exec(pte))
+ return; /* not an executable page... */
+
+ page = pte_page(pte);
+ addr = (unsigned long) page_address(page);
+
+ if (test_bit(PG_arch_1, &page->flags))
+ return; /* i-cache is already coherent with d-cache */
+
+ flush_icache_range(addr, addr + PAGE_SIZE);
+ set_bit(PG_arch_1, &page->flags); /* mark page as clean */
+}
+
+inline void
+ia64_set_rbs_bot (void)
+{
+ unsigned long stack_size = current->signal->rlim[RLIMIT_STACK].rlim_max & -16;
+
+ if (stack_size > MAX_USER_STACK_SIZE)
+ stack_size = MAX_USER_STACK_SIZE;
+ current->thread.rbs_bot = STACK_TOP - stack_size;
+}
+
+/*
+ * This performs some platform-dependent address space initialization.
+ * On IA-64, we want to setup the VM area for the register backing
+ * store (which grows upwards) and install the gateway page which is
+ * used for signal trampolines, etc.
+ */
+void
+ia64_init_addr_space (void)
+{
+ struct vm_area_struct *vma;
+
+ ia64_set_rbs_bot();
+
+ /*
+ * If we're out of memory and kmem_cache_alloc() returns NULL, we simply ignore
+ * the problem. When the process attempts to write to the register backing store
+ * for the first time, it will get a SEGFAULT in this case.
+ */
+ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (vma) {
+ memset(vma, 0, sizeof(*vma));
+ vma->vm_mm = current->mm;
+ vma->vm_start = current->thread.rbs_bot & PAGE_MASK;
+ vma->vm_end = vma->vm_start + PAGE_SIZE;
+ vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7];
+ vma->vm_flags = VM_DATA_DEFAULT_FLAGS | VM_GROWSUP;
+ down_write(&current->mm->mmap_sem);
+ if (insert_vm_struct(current->mm, vma)) {
+ up_write(&current->mm->mmap_sem);
+ kmem_cache_free(vm_area_cachep, vma);
+ return;
+ }
+ up_write(&current->mm->mmap_sem);
+ }
+
+ /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */
+ if (!(current->personality & MMAP_PAGE_ZERO)) {
+ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (vma) {
+ memset(vma, 0, sizeof(*vma));
+ vma->vm_mm = current->mm;
+ vma->vm_end = PAGE_SIZE;
+ vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT);
+ vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED;
+ down_write(&current->mm->mmap_sem);
+ if (insert_vm_struct(current->mm, vma)) {
+ up_write(&current->mm->mmap_sem);
+ kmem_cache_free(vm_area_cachep, vma);
+ return;
+ }
+ up_write(&current->mm->mmap_sem);
+ }
+ }
+}
+
+void
+free_initmem (void)
+{
+ unsigned long addr, eaddr;
+
+ addr = (unsigned long) ia64_imva(__init_begin);
+ eaddr = (unsigned long) ia64_imva(__init_end);
+ while (addr < eaddr) {
+ ClearPageReserved(virt_to_page(addr));
+ set_page_count(virt_to_page(addr), 1);
+ free_page(addr);
+ ++totalram_pages;
+ addr += PAGE_SIZE;
+ }
+ printk(KERN_INFO "Freeing unused kernel memory: %ldkB freed\n",
+ (__init_end - __init_begin) >> 10);
+}
+
+void
+free_initrd_mem (unsigned long start, unsigned long end)
+{
+ struct page *page;
+ /*
+ * EFI uses 4KB pages while the kernel can use 4KB or bigger.
+ * Thus EFI and the kernel may have different page sizes. It is
+ * therefore possible to have the initrd share the same page as
+ * the end of the kernel (given current setup).
+ *
+ * To avoid freeing/using the wrong page (kernel sized) we:
+ * - align up the beginning of initrd
+ * - align down the end of initrd
+ *
+ * | |
+ * |=============| a000
+ * | |
+ * | |
+ * | | 9000
+ * |/////////////|
+ * |/////////////|
+ * |=============| 8000
+ * |///INITRD////|
+ * |/////////////|
+ * |/////////////| 7000
+ * | |
+ * |KKKKKKKKKKKKK|
+ * |=============| 6000
+ * |KKKKKKKKKKKKK|
+ * |KKKKKKKKKKKKK|
+ * K=kernel using 8KB pages
+ *
+ * In this example, we must free page 8000 ONLY. So we must align up
+ * initrd_start and keep initrd_end as is.
+ */
+ start = PAGE_ALIGN(start);
+ end = end & PAGE_MASK;
+
+ if (start < end)
+ printk(KERN_INFO "Freeing initrd memory: %ldkB freed\n", (end - start) >> 10);
+
+ for (; start < end; start += PAGE_SIZE) {
+ if (!virt_addr_valid(start))
+ continue;
+ page = virt_to_page(start);
+ ClearPageReserved(page);
+ set_page_count(page, 1);
+ free_page(start);
+ ++totalram_pages;
+ }
+}
+
+/*
+ * This installs a clean page in the kernel's page table.
+ */
+struct page *
+put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ if (!PageReserved(page))
+ printk(KERN_ERR "put_kernel_page: page at 0x%p not in reserved memory\n",
+ page_address(page));
+
+ pgd = pgd_offset_k(address); /* note: this is NOT pgd_offset()! */
+
+ spin_lock(&init_mm.page_table_lock);
+ {
+ pud = pud_alloc(&init_mm, pgd, address);
+ if (!pud)
+ goto out;
+
+ pmd = pmd_alloc(&init_mm, pud, address);
+ if (!pmd)
+ goto out;
+ pte = pte_alloc_map(&init_mm, pmd, address);
+ if (!pte)
+ goto out;
+ if (!pte_none(*pte)) {
+ pte_unmap(pte);
+ goto out;
+ }
+ set_pte(pte, mk_pte(page, pgprot));
+ pte_unmap(pte);
+ }
+ out: spin_unlock(&init_mm.page_table_lock);
+ /* no need for flush_tlb */
+ return page;
+}
+
+static void
+setup_gate (void)
+{
+ struct page *page;
+
+ /*
+ * Map the gate page twice: once read-only to export the ELF headers etc. and once
+ * execute-only page to enable privilege-promotion via "epc":
+ */
+ page = virt_to_page(ia64_imva(__start_gate_section));
+ put_kernel_page(page, GATE_ADDR, PAGE_READONLY);
+#ifdef HAVE_BUGGY_SEGREL
+ page = virt_to_page(ia64_imva(__start_gate_section + PAGE_SIZE));
+ put_kernel_page(page, GATE_ADDR + PAGE_SIZE, PAGE_GATE);
+#else
+ put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE);
+#endif
+ ia64_patch_gate();
+}
+
+void __devinit
+ia64_mmu_init (void *my_cpu_data)
+{
+ unsigned long psr, pta, impl_va_bits;
+ extern void __devinit tlb_init (void);
+
+#ifdef CONFIG_DISABLE_VHPT
+# define VHPT_ENABLE_BIT 0
+#else
+# define VHPT_ENABLE_BIT 1
+#endif
+
+ /* Pin mapping for percpu area into TLB */
+ psr = ia64_clear_ic();
+ ia64_itr(0x2, IA64_TR_PERCPU_DATA, PERCPU_ADDR,
+ pte_val(pfn_pte(__pa(my_cpu_data) >> PAGE_SHIFT, PAGE_KERNEL)),
+ PERCPU_PAGE_SHIFT);
+
+ ia64_set_psr(psr);
+ ia64_srlz_i();
+
+ /*
+ * Check if the virtually mapped linear page table (VMLPT) overlaps with a mapped
+ * address space. The IA-64 architecture guarantees that at least 50 bits of
+ * virtual address space are implemented but if we pick a large enough page size
+ * (e.g., 64KB), the mapped address space is big enough that it will overlap with
+ * VMLPT. I assume that once we run on machines big enough to warrant 64KB pages,
+ * IMPL_VA_MSB will be significantly bigger, so this is unlikely to become a
+ * problem in practice. Alternatively, we could truncate the top of the mapped
+ * address space to not permit mappings that would overlap with the VMLPT.
+ * --davidm 00/12/06
+ */
+# define pte_bits 3
+# define mapped_space_bits (3*(PAGE_SHIFT - pte_bits) + PAGE_SHIFT)
+ /*
+ * The virtual page table has to cover the entire implemented address space within
+ * a region even though not all of this space may be mappable. The reason for
+ * this is that the Access bit and Dirty bit fault handlers perform
+ * non-speculative accesses to the virtual page table, so the address range of the
+ * virtual page table itself needs to be covered by virtual page table.
+ */
+# define vmlpt_bits (impl_va_bits - PAGE_SHIFT + pte_bits)
+# define POW2(n) (1ULL << (n))
+
+ impl_va_bits = ffz(~(local_cpu_data->unimpl_va_mask | (7UL << 61)));
+
+ if (impl_va_bits < 51 || impl_va_bits > 61)
+ panic("CPU has bogus IMPL_VA_MSB value of %lu!\n", impl_va_bits - 1);
+
+ /* place the VMLPT at the end of each page-table mapped region: */
+ pta = POW2(61) - POW2(vmlpt_bits);
+
+ if (POW2(mapped_space_bits) >= pta)
+ panic("mm/init: overlap between virtually mapped linear page table and "
+ "mapped kernel space!");
+ /*
+ * Set the (virtually mapped linear) page table address. Bit
+ * 8 selects between the short and long format, bits 2-7 the
+ * size of the table, and bit 0 whether the VHPT walker is
+ * enabled.
+ */
+ ia64_set_pta(pta | (0 << 8) | (vmlpt_bits << 2) | VHPT_ENABLE_BIT);
+
+ ia64_tlb_init();
+
+#ifdef CONFIG_HUGETLB_PAGE
+ ia64_set_rr(HPAGE_REGION_BASE, HPAGE_SHIFT << 2);
+ ia64_srlz_d();
+#endif
+}
+
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+
+int
+create_mem_map_page_table (u64 start, u64 end, void *arg)
+{
+ unsigned long address, start_page, end_page;
+ struct page *map_start, *map_end;
+ int node;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ map_start = vmem_map + (__pa(start) >> PAGE_SHIFT);
+ map_end = vmem_map + (__pa(end) >> PAGE_SHIFT);
+
+ start_page = (unsigned long) map_start & PAGE_MASK;
+ end_page = PAGE_ALIGN((unsigned long) map_end);
+ node = paddr_to_nid(__pa(start));
+
+ for (address = start_page; address < end_page; address += PAGE_SIZE) {
+ pgd = pgd_offset_k(address);
+ if (pgd_none(*pgd))
+ pgd_populate(&init_mm, pgd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
+ pud = pud_offset(pgd, address);
+
+ if (pud_none(*pud))
+ pud_populate(&init_mm, pud, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
+ pmd = pmd_offset(pud, address);
+
+ if (pmd_none(*pmd))
+ pmd_populate_kernel(&init_mm, pmd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
+ pte = pte_offset_kernel(pmd, address);
+
+ if (pte_none(*pte))
+ set_pte(pte, pfn_pte(__pa(alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE)) >> PAGE_SHIFT,
+ PAGE_KERNEL));
+ }
+ return 0;
+}
+
+struct memmap_init_callback_data {
+ struct page *start;
+ struct page *end;
+ int nid;
+ unsigned long zone;
+};
+
+static int
+virtual_memmap_init (u64 start, u64 end, void *arg)
+{
+ struct memmap_init_callback_data *args;
+ struct page *map_start, *map_end;
+
+ args = (struct memmap_init_callback_data *) arg;
+ map_start = vmem_map + (__pa(start) >> PAGE_SHIFT);
+ map_end = vmem_map + (__pa(end) >> PAGE_SHIFT);
+
+ if (map_start < args->start)
+ map_start = args->start;
+ if (map_end > args->end)
+ map_end = args->end;
+
+ /*
+ * We have to initialize "out of bounds" struct page elements that fit completely
+ * on the same pages that were allocated for the "in bounds" elements because they
+ * may be referenced later (and found to be "reserved").
+ */
+ map_start -= ((unsigned long) map_start & (PAGE_SIZE - 1)) / sizeof(struct page);
+ map_end += ((PAGE_ALIGN((unsigned long) map_end) - (unsigned long) map_end)
+ / sizeof(struct page));
+
+ if (map_start < map_end)
+ memmap_init_zone((unsigned long)(map_end - map_start),
+ args->nid, args->zone, page_to_pfn(map_start));
+ return 0;
+}
+
+void
+memmap_init (unsigned long size, int nid, unsigned long zone,
+ unsigned long start_pfn)
+{
+ if (!vmem_map)
+ memmap_init_zone(size, nid, zone, start_pfn);
+ else {
+ struct page *start;
+ struct memmap_init_callback_data args;
+
+ start = pfn_to_page(start_pfn);
+ args.start = start;
+ args.end = start + size;
+ args.nid = nid;
+ args.zone = zone;
+
+ efi_memmap_walk(virtual_memmap_init, &args);
+ }
+}
+
+int
+ia64_pfn_valid (unsigned long pfn)
+{
+ char byte;
+ struct page *pg = pfn_to_page(pfn);
+
+ return (__get_user(byte, (char __user *) pg) == 0)
+ && ((((u64)pg & PAGE_MASK) == (((u64)(pg + 1) - 1) & PAGE_MASK))
+ || (__get_user(byte, (char __user *) (pg + 1) - 1) == 0));
+}
+EXPORT_SYMBOL(ia64_pfn_valid);
+
+int
+find_largest_hole (u64 start, u64 end, void *arg)
+{
+ u64 *max_gap = arg;
+
+ static u64 last_end = PAGE_OFFSET;
+
+ /* NOTE: this algorithm assumes efi memmap table is ordered */
+
+ if (*max_gap < (start - last_end))
+ *max_gap = start - last_end;
+ last_end = end;
+ return 0;
+}
+#endif /* CONFIG_VIRTUAL_MEM_MAP */
+
+static int
+count_reserved_pages (u64 start, u64 end, void *arg)
+{
+ unsigned long num_reserved = 0;
+ unsigned long *count = arg;
+
+ for (; start < end; start += PAGE_SIZE)
+ if (PageReserved(virt_to_page(start)))
+ ++num_reserved;
+ *count += num_reserved;
+ return 0;
+}
+
+/*
+ * Boot command-line option "nolwsys" can be used to disable the use of any light-weight
+ * system call handler. When this option is in effect, all fsyscalls will end up bubbling
+ * down into the kernel and calling the normal (heavy-weight) syscall handler. This is
+ * useful for performance testing, but conceivably could also come in handy for debugging
+ * purposes.
+ */
+
+static int nolwsys;
+
+static int __init
+nolwsys_setup (char *s)
+{
+ nolwsys = 1;
+ return 1;
+}
+
+__setup("nolwsys", nolwsys_setup);
+
+void
+mem_init (void)
+{
+ long reserved_pages, codesize, datasize, initsize;
+ unsigned long num_pgt_pages;
+ pg_data_t *pgdat;
+ int i;
+ static struct kcore_list kcore_mem, kcore_vmem, kcore_kernel;
+
+#ifdef CONFIG_PCI
+ /*
+ * This needs to be called _after_ the command line has been parsed but _before_
+ * any drivers that may need the PCI DMA interface are initialized or bootmem has
+ * been freed.
+ */
+ platform_dma_init();
+#endif
+
+#ifndef CONFIG_DISCONTIGMEM
+ if (!mem_map)
+ BUG();
+ max_mapnr = max_low_pfn;
+#endif
+
+ high_memory = __va(max_low_pfn * PAGE_SIZE);
+
+ kclist_add(&kcore_mem, __va(0), max_low_pfn * PAGE_SIZE);
+ kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
+ kclist_add(&kcore_kernel, _stext, _end - _stext);
+
+ for_each_pgdat(pgdat)
+ totalram_pages += free_all_bootmem_node(pgdat);
+
+ reserved_pages = 0;
+ efi_memmap_walk(count_reserved_pages, &reserved_pages);
+
+ codesize = (unsigned long) _etext - (unsigned long) _stext;
+ datasize = (unsigned long) _edata - (unsigned long) _etext;
+ initsize = (unsigned long) __init_end - (unsigned long) __init_begin;
+
+ printk(KERN_INFO "Memory: %luk/%luk available (%luk code, %luk reserved, "
+ "%luk data, %luk init)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT - 10),
+ num_physpages << (PAGE_SHIFT - 10), codesize >> 10,
+ reserved_pages << (PAGE_SHIFT - 10), datasize >> 10, initsize >> 10);
+
+ /*
+ * Allow for enough (cached) page table pages so that we can map the entire memory
+ * at least once. Each task also needs a couple of page tables pages, so add in a
+ * fudge factor for that (don't use "threads-max" here; that would be wrong!).
+ * Don't allow the cache to be more than 10% of total memory, though.
+ */
+# define NUM_TASKS 500 /* typical number of tasks */
+ num_pgt_pages = nr_free_pages() / PTRS_PER_PGD + NUM_TASKS;
+ if (num_pgt_pages > nr_free_pages() / 10)
+ num_pgt_pages = nr_free_pages() / 10;
+ if (num_pgt_pages > (u64) pgt_cache_water[1])
+ pgt_cache_water[1] = num_pgt_pages;
+
+ /*
+ * For fsyscall entrpoints with no light-weight handler, use the ordinary
+ * (heavy-weight) handler, but mark it by setting bit 0, so the fsyscall entry
+ * code can tell them apart.
+ */
+ for (i = 0; i < NR_syscalls; ++i) {
+ extern unsigned long fsyscall_table[NR_syscalls];
+ extern unsigned long sys_call_table[NR_syscalls];
+
+ if (!fsyscall_table[i] || nolwsys)
+ fsyscall_table[i] = sys_call_table[i] | 1;
+ }
+ setup_gate();
+
+#ifdef CONFIG_IA32_SUPPORT
+ ia32_mem_init();
+#endif
+}
diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c
new file mode 100644
index 000000000000..77118bbf3d8b
--- /dev/null
+++ b/arch/ia64/mm/numa.c
@@ -0,0 +1,49 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * This file contains NUMA specific variables and functions which can
+ * be split away from DISCONTIGMEM and are used on NUMA machines with
+ * contiguous memory.
+ *
+ * 2002/08/07 Erich Focht <efocht@ess.nec.de>
+ */
+
+#include <linux/config.h>
+#include <linux/cpu.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/node.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <asm/mmzone.h>
+#include <asm/numa.h>
+
+
+/*
+ * The following structures are usually initialized by ACPI or
+ * similar mechanisms and describe the NUMA characteristics of the machine.
+ */
+int num_node_memblks;
+struct node_memblk_s node_memblk[NR_NODE_MEMBLKS];
+struct node_cpuid_s node_cpuid[NR_CPUS];
+/*
+ * This is a matrix with "distances" between nodes, they should be
+ * proportional to the memory access latency ratios.
+ */
+u8 numa_slit[MAX_NUMNODES * MAX_NUMNODES];
+
+/* Identify which cnode a physical address resides on */
+int
+paddr_to_nid(unsigned long paddr)
+{
+ int i;
+
+ for (i = 0; i < num_node_memblks; i++)
+ if (paddr >= node_memblk[i].start_paddr &&
+ paddr < node_memblk[i].start_paddr + node_memblk[i].size)
+ break;
+
+ return (i < num_node_memblks) ? node_memblk[i].nid : (num_node_memblks ? -1 : 0);
+}
diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c
new file mode 100644
index 000000000000..464557e4ed82
--- /dev/null
+++ b/arch/ia64/mm/tlb.c
@@ -0,0 +1,190 @@
+/*
+ * TLB support routines.
+ *
+ * Copyright (C) 1998-2001, 2003 Hewlett-Packard Co
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * 08/02/00 A. Mallick <asit.k.mallick@intel.com>
+ * Modified RID allocation for SMP
+ * Goutham Rao <goutham.rao@intel.com>
+ * IPI based ptc implementation and A-step IPI implementation.
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/mm.h>
+
+#include <asm/delay.h>
+#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
+#include <asm/pal.h>
+#include <asm/tlbflush.h>
+
+static struct {
+ unsigned long mask; /* mask of supported purge page-sizes */
+ unsigned long max_bits; /* log2() of largest supported purge page-size */
+} purge;
+
+struct ia64_ctx ia64_ctx = {
+ .lock = SPIN_LOCK_UNLOCKED,
+ .next = 1,
+ .limit = (1 << 15) - 1, /* start out with the safe (architected) limit */
+ .max_ctx = ~0U
+};
+
+DEFINE_PER_CPU(u8, ia64_need_tlb_flush);
+
+/*
+ * Acquire the ia64_ctx.lock before calling this function!
+ */
+void
+wrap_mmu_context (struct mm_struct *mm)
+{
+ unsigned long tsk_context, max_ctx = ia64_ctx.max_ctx;
+ struct task_struct *tsk;
+ int i;
+
+ if (ia64_ctx.next > max_ctx)
+ ia64_ctx.next = 300; /* skip daemons */
+ ia64_ctx.limit = max_ctx + 1;
+
+ /*
+ * Scan all the task's mm->context and set proper safe range
+ */
+
+ read_lock(&tasklist_lock);
+ repeat:
+ for_each_process(tsk) {
+ if (!tsk->mm)
+ continue;
+ tsk_context = tsk->mm->context;
+ if (tsk_context == ia64_ctx.next) {
+ if (++ia64_ctx.next >= ia64_ctx.limit) {
+ /* empty range: reset the range limit and start over */
+ if (ia64_ctx.next > max_ctx)
+ ia64_ctx.next = 300;
+ ia64_ctx.limit = max_ctx + 1;
+ goto repeat;
+ }
+ }
+ if ((tsk_context > ia64_ctx.next) && (tsk_context < ia64_ctx.limit))
+ ia64_ctx.limit = tsk_context;
+ }
+ read_unlock(&tasklist_lock);
+ /* can't call flush_tlb_all() here because of race condition with O(1) scheduler [EF] */
+ {
+ int cpu = get_cpu(); /* prevent preemption/migration */
+ for (i = 0; i < NR_CPUS; ++i)
+ if (cpu_online(i) && (i != cpu))
+ per_cpu(ia64_need_tlb_flush, i) = 1;
+ put_cpu();
+ }
+ local_flush_tlb_all();
+}
+
+void
+ia64_global_tlb_purge (unsigned long start, unsigned long end, unsigned long nbits)
+{
+ static DEFINE_SPINLOCK(ptcg_lock);
+
+ /* HW requires global serialization of ptc.ga. */
+ spin_lock(&ptcg_lock);
+ {
+ do {
+ /*
+ * Flush ALAT entries also.
+ */
+ ia64_ptcga(start, (nbits<<2));
+ ia64_srlz_i();
+ start += (1UL << nbits);
+ } while (start < end);
+ }
+ spin_unlock(&ptcg_lock);
+}
+
+void
+local_flush_tlb_all (void)
+{
+ unsigned long i, j, flags, count0, count1, stride0, stride1, addr;
+
+ addr = local_cpu_data->ptce_base;
+ count0 = local_cpu_data->ptce_count[0];
+ count1 = local_cpu_data->ptce_count[1];
+ stride0 = local_cpu_data->ptce_stride[0];
+ stride1 = local_cpu_data->ptce_stride[1];
+
+ local_irq_save(flags);
+ for (i = 0; i < count0; ++i) {
+ for (j = 0; j < count1; ++j) {
+ ia64_ptce(addr);
+ addr += stride1;
+ }
+ addr += stride0;
+ }
+ local_irq_restore(flags);
+ ia64_srlz_i(); /* srlz.i implies srlz.d */
+}
+
+void
+flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long size = end - start;
+ unsigned long nbits;
+
+ if (mm != current->active_mm) {
+ /* this does happen, but perhaps it's not worth optimizing for? */
+#ifdef CONFIG_SMP
+ flush_tlb_all();
+#else
+ mm->context = 0;
+#endif
+ return;
+ }
+
+ nbits = ia64_fls(size + 0xfff);
+ while (unlikely (((1UL << nbits) & purge.mask) == 0) && (nbits < purge.max_bits))
+ ++nbits;
+ if (nbits > purge.max_bits)
+ nbits = purge.max_bits;
+ start &= ~((1UL << nbits) - 1);
+
+# ifdef CONFIG_SMP
+ platform_global_tlb_purge(start, end, nbits);
+# else
+ do {
+ ia64_ptcl(start, (nbits<<2));
+ start += (1UL << nbits);
+ } while (start < end);
+# endif
+
+ ia64_srlz_i(); /* srlz.i implies srlz.d */
+}
+EXPORT_SYMBOL(flush_tlb_range);
+
+void __devinit
+ia64_tlb_init (void)
+{
+ ia64_ptce_info_t ptce_info;
+ unsigned long tr_pgbits;
+ long status;
+
+ if ((status = ia64_pal_vm_page_size(&tr_pgbits, &purge.mask)) != 0) {
+ printk(KERN_ERR "PAL_VM_PAGE_SIZE failed with status=%ld;"
+ "defaulting to architected purge page-sizes.\n", status);
+ purge.mask = 0x115557000UL;
+ }
+ purge.max_bits = ia64_fls(purge.mask);
+
+ ia64_get_ptce(&ptce_info);
+ local_cpu_data->ptce_base = ptce_info.base;
+ local_cpu_data->ptce_count[0] = ptce_info.count[0];
+ local_cpu_data->ptce_count[1] = ptce_info.count[1];
+ local_cpu_data->ptce_stride[0] = ptce_info.stride[0];
+ local_cpu_data->ptce_stride[1] = ptce_info.stride[1];
+
+ local_flush_tlb_all(); /* nuke left overs from bootstrapping... */
+}