author		Steve French <sfrench@us.ibm.com>	2008-02-15 22:06:08 +0100
committer	Steve French <sfrench@us.ibm.com>	2008-02-15 22:06:08 +0100
commit		0a3abcf75bf391fec4e32356ab5ddb8f5d2e6b41 (patch)
tree		b80b1d344ec24cad28b057ef803cebac9434be01 /arch/x86/mm
parent		[CIFS] factoring out common code in get_inode_info functions (diff)
parent		Linux 2.6.25-rc2 (diff)
download	linux-0a3abcf75bf391fec4e32356ab5ddb8f5d2e6b41.tar.xz
		linux-0a3abcf75bf391fec4e32356ab5ddb8f5d2e6b41.zip
Merge branch 'master' of /pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--	arch/x86/mm/discontig_32.c	  3
-rw-r--r--	arch/x86/mm/fault.c		 30
-rw-r--r--	arch/x86/mm/init_32.c		 75
-rw-r--r--	arch/x86/mm/init_64.c		 22
-rw-r--r--	arch/x86/mm/ioremap.c		 57
-rw-r--r--	arch/x86/mm/numa_64.c		  5
-rw-r--r--	arch/x86/mm/pageattr-test.c	 72
-rw-r--r--	arch/x86/mm/pageattr.c		175
-rw-r--r--	arch/x86/mm/pgtable_32.c	  5
-rw-r--r--	arch/x86/mm/srat_64.c		  3
10 files changed, 304 insertions(+), 143 deletions(-)
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 04b1d20e2613..c394ca0720b8 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -391,7 +391,8 @@ unsigned long __init setup_memory(void)
void __init numa_kva_reserve(void)
{
if (kva_pages)
- reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages));
+ reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages),
+ BOOTMEM_DEFAULT);
}
void __init zone_sizes_init(void)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ad8b9733d6b3..fdc667422df9 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -186,7 +186,7 @@ static int bad_address(void *p)
}
#endif
-void dump_pagetable(unsigned long address)
+static void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
__typeof__(pte_val(__pte(0))) page;
@@ -428,6 +428,16 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
}
#endif
+static int spurious_fault_check(unsigned long error_code, pte_t *pte)
+{
+ if ((error_code & PF_WRITE) && !pte_write(*pte))
+ return 0;
+ if ((error_code & PF_INSTR) && !pte_exec(*pte))
+ return 0;
+
+ return 1;
+}
+
/*
* Handle a spurious fault caused by a stale TLB entry. This allows
* us to lazily refresh the TLB when increasing the permissions of a
@@ -457,20 +467,21 @@ static int spurious_fault(unsigned long address,
if (!pud_present(*pud))
return 0;
+ if (pud_large(*pud))
+ return spurious_fault_check(error_code, (pte_t *) pud);
+
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return 0;
+ if (pmd_large(*pmd))
+ return spurious_fault_check(error_code, (pte_t *) pmd);
+
pte = pte_offset_kernel(pmd, address);
if (!pte_present(*pte))
return 0;
- if ((error_code & PF_WRITE) && !pte_write(*pte))
- return 0;
- if ((error_code & PF_INSTR) && !pte_exec(*pte))
- return 0;
-
- return 1;
+ return spurious_fault_check(error_code, pte);
}
/*
@@ -947,11 +958,12 @@ void vmalloc_sync_all(void)
for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
if (!test_bit(pgd_index(address), insync)) {
const pgd_t *pgd_ref = pgd_offset_k(address);
+ unsigned long flags;
struct page *page;
if (pgd_none(*pgd_ref))
continue;
- spin_lock(&pgd_lock);
+ spin_lock_irqsave(&pgd_lock, flags);
list_for_each_entry(page, &pgd_list, lru) {
pgd_t *pgd;
pgd = (pgd_t *)page_address(page) + pgd_index(address);
@@ -960,7 +972,7 @@ void vmalloc_sync_all(void)
else
BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
}
- spin_unlock(&pgd_lock);
+ spin_unlock_irqrestore(&pgd_lock, flags);
set_bit(pgd_index(address), insync);
}
if (address == start)
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index d1bc04006d16..ee1091a46964 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -46,6 +46,8 @@
#include <asm/pgalloc.h>
#include <asm/sections.h>
#include <asm/paravirt.h>
+#include <asm/setup.h>
+#include <asm/cacheflush.h>
unsigned int __VMALLOC_RESERVE = 128 << 20;
@@ -328,44 +330,38 @@ pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
void __init native_pagetable_setup_start(pgd_t *base)
{
-#ifdef CONFIG_X86_PAE
- int i;
+ unsigned long pfn, va;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
/*
- * Init entries of the first-level page table to the
- * zero page, if they haven't already been set up.
- *
- * In a normal native boot, we'll be running on a
- * pagetable rooted in swapper_pg_dir, but not in PAE
- * mode, so this will end up clobbering the mappings
- * for the lower 24Mbytes of the address space,
- * without affecting the kernel address space.
+ * Remove any mappings which extend past the end of physical
+ * memory from the boot time page table:
*/
- for (i = 0; i < USER_PTRS_PER_PGD; i++)
- set_pgd(&base[i],
- __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
-
- /* Make sure kernel address space is empty so that a pagetable
- will be allocated for it. */
- memset(&base[USER_PTRS_PER_PGD], 0,
- KERNEL_PGD_PTRS * sizeof(pgd_t));
-#else
+ for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
+ va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
+ pgd = base + pgd_index(va);
+ if (!pgd_present(*pgd))
+ break;
+
+ pud = pud_offset(pgd, va);
+ pmd = pmd_offset(pud, va);
+ if (!pmd_present(*pmd))
+ break;
+
+ pte = pte_offset_kernel(pmd, va);
+ if (!pte_present(*pte))
+ break;
+
+ pte_clear(NULL, va, pte);
+ }
paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT);
-#endif
}
void __init native_pagetable_setup_done(pgd_t *base)
{
-#ifdef CONFIG_X86_PAE
- /*
- * Add low memory identity-mappings - SMP needs it when
- * starting up on an AP from real-mode. In the non-PAE
- * case we already have these mappings through head.S.
- * All user-space mappings are explicitly cleared after
- * SMP startup.
- */
- set_pgd(&base[0], base[USER_PTRS_PER_PGD]);
-#endif
}
/*
@@ -374,9 +370,8 @@ void __init native_pagetable_setup_done(pgd_t *base)
* the boot process.
*
* If we're booting on native hardware, this will be a pagetable
- * constructed in arch/i386/kernel/head.S, and not running in PAE mode
- * (even if we'll end up running in PAE). The root of the pagetable
- * will be swapper_pg_dir.
+ * constructed in arch/x86/kernel/head_32.S. The root of the
+ * pagetable will be swapper_pg_dir.
*
* If we're booting paravirtualized under a hypervisor, then there are
* more options: we may already be running PAE, and the pagetable may
@@ -537,14 +532,6 @@ void __init paging_init(void)
load_cr3(swapper_pg_dir);
-#ifdef CONFIG_X86_PAE
- /*
- * We will bail out later - printk doesn't work right now so
- * the user would just see a hanging kernel.
- */
- if (cpu_has_pae)
- set_in_cr4(X86_CR4_PAE);
-#endif
__flush_tlb_all();
kmap_init();
@@ -675,13 +662,11 @@ void __init mem_init(void)
BUG_ON((unsigned long)high_memory > VMALLOC_START);
#endif /* double-sanity-check paranoia */
-#ifdef CONFIG_X86_PAE
- if (!cpu_has_pae)
- panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
-#endif
if (boot_cpu_data.wp_works_ok < 0)
test_wp_bit();
+ cpa_init();
+
/*
* Subtle. SMP is doing it's boot stuff late (because it has to
* fork idle threads) - but it also needs low mappings for the
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3a98d6f724ab..a4a9cccdd4f2 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -45,6 +45,7 @@
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
+#include <asm/cacheflush.h>
const struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);
@@ -528,13 +529,15 @@ void __init mem_init(void)
reservedpages << (PAGE_SHIFT-10),
datasize >> 10,
initsize >> 10);
+
+ cpa_init();
}
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
- unsigned long addr;
+ unsigned long addr = begin;
- if (begin >= end)
+ if (addr >= end)
return;
/*
@@ -549,7 +552,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
#else
printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
- for (addr = begin; addr < end; addr += PAGE_SIZE) {
+ for (; addr < end; addr += PAGE_SIZE) {
ClearPageReserved(virt_to_page(addr));
init_page_count(virt_to_page(addr));
memset((void *)(addr & ~(PAGE_SIZE-1)),
@@ -591,10 +594,17 @@ void mark_rodata_ro(void)
if (end <= start)
return;
- set_memory_ro(start, (end - start) >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
+ set_memory_ro(start, (end - start) >> PAGE_SHIFT);
+
+ /*
+ * The rodata section (but not the kernel text!) should also be
+ * not-executable.
+ */
+ start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+ set_memory_nx(start, (end - start) >> PAGE_SHIFT);
rodata_test();
@@ -637,9 +647,9 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
- reserve_bootmem_node(NODE_DATA(nid), phys, len);
+ reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
#else
- reserve_bootmem(phys, len);
+ reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif
if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
dma_reserve += len / PAGE_SIZE;
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index ee6648fe6b15..9f42d7e9c158 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -260,41 +260,48 @@ static int __init early_ioremap_debug_setup(char *str)
early_param("early_ioremap_debug", early_ioremap_debug_setup);
static __initdata int after_paging_init;
-static __initdata unsigned long bm_pte[1024]
+static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
__attribute__((aligned(PAGE_SIZE)));
-static inline unsigned long * __init early_ioremap_pgd(unsigned long addr)
+static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
{
- return (unsigned long *)swapper_pg_dir + ((addr >> 22) & 1023);
+ /* Don't assume we're using swapper_pg_dir at this point */
+ pgd_t *base = __va(read_cr3());
+ pgd_t *pgd = &base[pgd_index(addr)];
+ pud_t *pud = pud_offset(pgd, addr);
+ pmd_t *pmd = pmd_offset(pud, addr);
+
+ return pmd;
}
-static inline unsigned long * __init early_ioremap_pte(unsigned long addr)
+static inline pte_t * __init early_ioremap_pte(unsigned long addr)
{
- return bm_pte + ((addr >> PAGE_SHIFT) & 1023);
+ return &bm_pte[pte_index(addr)];
}
void __init early_ioremap_init(void)
{
- unsigned long *pgd;
+ pmd_t *pmd;
if (early_ioremap_debug)
printk(KERN_INFO "early_ioremap_init()\n");
- pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
- *pgd = __pa(bm_pte) | _PAGE_TABLE;
+ pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
memset(bm_pte, 0, sizeof(bm_pte));
+ pmd_populate_kernel(&init_mm, pmd, bm_pte);
+
/*
- * The boot-ioremap range spans multiple pgds, for which
+ * The boot-ioremap range spans multiple pmds, for which
* we are not prepared:
*/
- if (pgd != early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))) {
+ if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
WARN_ON(1);
- printk(KERN_WARNING "pgd %p != %p\n",
- pgd, early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END)));
+ printk(KERN_WARNING "pmd %p != %p\n",
+ pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
- fix_to_virt(FIX_BTMAP_BEGIN));
+ fix_to_virt(FIX_BTMAP_BEGIN));
printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n",
- fix_to_virt(FIX_BTMAP_END));
+ fix_to_virt(FIX_BTMAP_END));
printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n",
@@ -304,28 +311,29 @@ void __init early_ioremap_init(void)
void __init early_ioremap_clear(void)
{
- unsigned long *pgd;
+ pmd_t *pmd;
if (early_ioremap_debug)
printk(KERN_INFO "early_ioremap_clear()\n");
- pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
- *pgd = 0;
- paravirt_release_pt(__pa(pgd) >> PAGE_SHIFT);
+ pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
+ pmd_clear(pmd);
+ paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT);
__flush_tlb_all();
}
void __init early_ioremap_reset(void)
{
enum fixed_addresses idx;
- unsigned long *pte, phys, addr;
+ unsigned long addr, phys;
+ pte_t *pte;
after_paging_init = 1;
for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
addr = fix_to_virt(idx);
pte = early_ioremap_pte(addr);
- if (*pte & _PAGE_PRESENT) {
- phys = *pte & PAGE_MASK;
+ if (pte_present(*pte)) {
+ phys = pte_val(*pte) & PAGE_MASK;
set_fixmap(idx, phys);
}
}
@@ -334,7 +342,8 @@ void __init early_ioremap_reset(void)
static void __init __early_set_fixmap(enum fixed_addresses idx,
unsigned long phys, pgprot_t flags)
{
- unsigned long *pte, addr = __fix_to_virt(idx);
+ unsigned long addr = __fix_to_virt(idx);
+ pte_t *pte;
if (idx >= __end_of_fixed_addresses) {
BUG();
@@ -342,9 +351,9 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
}
pte = early_ioremap_pte(addr);
if (pgprot_val(flags))
- *pte = (phys & PAGE_MASK) | pgprot_val(flags);
+ set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
else
- *pte = 0;
+ pte_clear(NULL, addr, pte);
__flush_tlb_one(addr);
}
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 5a02bf4c91ec..1aecc658cd7d 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -238,9 +238,10 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
free_bootmem_with_active_regions(nodeid, end);
- reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
+ reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size,
+ BOOTMEM_DEFAULT);
reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
- bootmap_pages<<PAGE_SHIFT);
+ bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
#ifdef CONFIG_ACPI_NUMA
srat_reserve_add_area(nodeid);
#endif
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 398f3a578dde..75f1b109aae8 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -5,6 +5,7 @@
* and compares page tables forwards and afterwards.
*/
#include <linux/bootmem.h>
+#include <linux/kthread.h>
#include <linux/random.h>
#include <linux/kernel.h>
#include <linux/init.h>
@@ -14,8 +15,13 @@
#include <asm/pgtable.h>
#include <asm/kdebug.h>
+/*
+ * Only print the results of the first pass:
+ */
+static __read_mostly int print = 1;
+
enum {
- NTEST = 4000,
+ NTEST = 400,
#ifdef CONFIG_X86_64
LPS = (1 << PMD_SHIFT),
#elif defined(CONFIG_X86_PAE)
@@ -31,10 +37,9 @@ struct split_state {
long min_exec, max_exec;
};
-static __init int print_split(struct split_state *s)
+static int print_split(struct split_state *s)
{
long i, expected, missed = 0;
- int printed = 0;
int err = 0;
s->lpg = s->gpg = s->spg = s->exec = 0;
@@ -47,12 +52,6 @@ static __init int print_split(struct split_state *s)
pte = lookup_address(addr, &level);
if (!pte) {
- if (!printed) {
- dump_pagetable(addr);
- printk(KERN_INFO "CPA %lx no pte level %d\n",
- addr, level);
- printed = 1;
- }
missed++;
i++;
continue;
@@ -82,10 +81,13 @@ static __init int print_split(struct split_state *s)
s->max_exec = addr;
}
}
- printk(KERN_INFO
- "CPA mapping 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n",
- s->spg, s->lpg, s->gpg, s->exec,
- s->min_exec != ~0UL ? s->min_exec : 0, s->max_exec, missed);
+ if (print) {
+ printk(KERN_INFO
+ " 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n",
+ s->spg, s->lpg, s->gpg, s->exec,
+ s->min_exec != ~0UL ? s->min_exec : 0,
+ s->max_exec, missed);
+ }
expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed;
if (expected != i) {
@@ -96,11 +98,11 @@ static __init int print_split(struct split_state *s)
return err;
}
-static unsigned long __initdata addr[NTEST];
-static unsigned int __initdata len[NTEST];
+static unsigned long addr[NTEST];
+static unsigned int len[NTEST];
/* Change the global bit on random pages in the direct mapping */
-static __init int exercise_pageattr(void)
+static int pageattr_test(void)
{
struct split_state sa, sb, sc;
unsigned long *bm;
@@ -110,7 +112,8 @@ static __init int exercise_pageattr(void)
int i, k;
int err;
- printk(KERN_INFO "CPA exercising pageattr\n");
+ if (print)
+ printk(KERN_INFO "CPA self-test:\n");
bm = vmalloc((max_pfn_mapped + 7) / 8);
if (!bm) {
@@ -186,7 +189,6 @@ static __init int exercise_pageattr(void)
failed += print_split(&sb);
- printk(KERN_INFO "CPA reverting everything\n");
for (i = 0; i < NTEST; i++) {
if (!addr[i])
continue;
@@ -214,12 +216,40 @@ static __init int exercise_pageattr(void)
failed += print_split(&sc);
if (failed) {
- printk(KERN_ERR "CPA selftests NOT PASSED. Please report.\n");
+ printk(KERN_ERR "NOT PASSED. Please report.\n");
WARN_ON(1);
+ return -EINVAL;
} else {
- printk(KERN_INFO "CPA selftests PASSED\n");
+ if (print)
+ printk(KERN_INFO "ok.\n");
+ }
+
+ return 0;
+}
+
+static int do_pageattr_test(void *__unused)
+{
+ while (!kthread_should_stop()) {
+ schedule_timeout_interruptible(HZ*30);
+ if (pageattr_test() < 0)
+ break;
+ if (print)
+ print--;
}
+ return 0;
+}
+
+static int start_pageattr_test(void)
+{
+ struct task_struct *p;
+
+ p = kthread_create(do_pageattr_test, NULL, "pageattr-test");
+ if (!IS_ERR(p))
+ wake_up_process(p);
+ else
+ WARN_ON(1);
return 0;
}
-module_init(exercise_pageattr);
+
+module_init(start_pageattr_test);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 16ce841f08d6..4119379f80ff 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -8,6 +8,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/interrupt.h>
#include <asm/e820.h>
#include <asm/processor.h>
@@ -167,8 +168,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
if (within(address, virt_to_highmap(_text), virt_to_highmap(_etext)))
pgprot_val(forbidden) |= _PAGE_NX;
-
-#ifdef CONFIG_DEBUG_RODATA
/* The .rodata section needs to be read-only */
if (within(address, (unsigned long)__start_rodata,
(unsigned long)__end_rodata))
@@ -179,7 +178,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
if (within(address, virt_to_highmap(__start_rodata),
virt_to_highmap(__end_rodata)))
pgprot_val(forbidden) |= _PAGE_RW;
-#endif
prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
@@ -194,7 +192,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
* or when the present bit is not set. Otherwise we would return a
* pointer to a nonexisting mapping.
*/
-pte_t *lookup_address(unsigned long address, int *level)
+pte_t *lookup_address(unsigned long address, unsigned int *level)
{
pgd_t *pgd = pgd_offset_k(address);
pud_t *pud;
@@ -255,21 +253,11 @@ static int
try_preserve_large_page(pte_t *kpte, unsigned long address,
struct cpa_data *cpa)
{
- unsigned long nextpage_addr, numpages, pmask, psize, flags;
+ unsigned long nextpage_addr, numpages, pmask, psize, flags, addr;
pte_t new_pte, old_pte, *tmp;
pgprot_t old_prot, new_prot;
- int level, do_split = 1;
-
- /*
- * An Athlon 64 X2 showed hard hangs if we tried to preserve
- * largepages and changed the PSE entry from RW to RO.
- *
- * As AMD CPUs have a long series of erratas in this area,
- * (and none of the known ones seem to explain this hang),
- * disable this code until the hang can be debugged:
- */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
- return 1;
+ int i, do_split = 1;
+ unsigned int level;
spin_lock_irqsave(&pgd_lock, flags);
/*
@@ -287,8 +275,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
break;
#ifdef CONFIG_X86_64
case PG_LEVEL_1G:
- psize = PMD_PAGE_SIZE;
- pmask = PMD_PAGE_MASK;
+ psize = PUD_PAGE_SIZE;
+ pmask = PUD_PAGE_MASK;
break;
#endif
default:
@@ -316,6 +304,19 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
new_prot = static_protections(new_prot, address);
/*
+ * We need to check the full range, whether
+ * static_protection() requires a different pgprot for one of
+ * the pages in the range we try to preserve:
+ */
+ addr = address + PAGE_SIZE;
+ for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE) {
+ pgprot_t chk_prot = static_protections(new_prot, addr);
+
+ if (pgprot_val(chk_prot) != pgprot_val(new_prot))
+ goto out_unlock;
+ }
+
+ /*
* If there are no changes, return. maxpages has been updated
* above:
*/
@@ -349,23 +350,103 @@ out_unlock:
return do_split;
}
+static LIST_HEAD(page_pool);
+static unsigned long pool_size, pool_pages, pool_low;
+static unsigned long pool_used, pool_failed, pool_refill;
+
+static void cpa_fill_pool(void)
+{
+ struct page *p;
+ gfp_t gfp = GFP_KERNEL;
+
+ /* Do not allocate from interrupt context */
+ if (in_irq() || irqs_disabled())
+ return;
+ /*
+ * Check unlocked. I does not matter when we have one more
+ * page in the pool. The bit lock avoids recursive pool
+ * allocations:
+ */
+ if (pool_pages >= pool_size || test_and_set_bit_lock(0, &pool_refill))
+ return;
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * We could do:
+ * gfp = in_atomic() ? GFP_ATOMIC : GFP_KERNEL;
+ * but this fails on !PREEMPT kernels
+ */
+ gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
+#endif
+
+ while (pool_pages < pool_size) {
+ p = alloc_pages(gfp, 0);
+ if (!p) {
+ pool_failed++;
+ break;
+ }
+ spin_lock_irq(&pgd_lock);
+ list_add(&p->lru, &page_pool);
+ pool_pages++;
+ spin_unlock_irq(&pgd_lock);
+ }
+ clear_bit_unlock(0, &pool_refill);
+}
+
+#define SHIFT_MB (20 - PAGE_SHIFT)
+#define ROUND_MB_GB ((1 << 10) - 1)
+#define SHIFT_MB_GB 10
+#define POOL_PAGES_PER_GB 16
+
+void __init cpa_init(void)
+{
+ struct sysinfo si;
+ unsigned long gb;
+
+ si_meminfo(&si);
+ /*
+ * Calculate the number of pool pages:
+ *
+ * Convert totalram (nr of pages) to MiB and round to the next
+ * GiB. Shift MiB to Gib and multiply the result by
+ * POOL_PAGES_PER_GB:
+ */
+ gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
+ pool_size = POOL_PAGES_PER_GB * gb;
+ pool_low = pool_size;
+
+ cpa_fill_pool();
+ printk(KERN_DEBUG
+ "CPA: page pool initialized %lu of %lu pages preallocated\n",
+ pool_pages, pool_size);
+}
+
static int split_large_page(pte_t *kpte, unsigned long address)
{
unsigned long flags, pfn, pfninc = 1;
- gfp_t gfp_flags = GFP_KERNEL;
unsigned int i, level;
pte_t *pbase, *tmp;
pgprot_t ref_prot;
struct page *base;
-#ifdef CONFIG_DEBUG_PAGEALLOC
- gfp_flags = GFP_ATOMIC | __GFP_NOWARN;
-#endif
- base = alloc_pages(gfp_flags, 0);
- if (!base)
+ /*
+ * Get a page from the pool. The pool list is protected by the
+ * pgd_lock, which we have to take anyway for the split
+ * operation:
+ */
+ spin_lock_irqsave(&pgd_lock, flags);
+ if (list_empty(&page_pool)) {
+ spin_unlock_irqrestore(&pgd_lock, flags);
return -ENOMEM;
+ }
+
+ base = list_first_entry(&page_pool, struct page, lru);
+ list_del(&base->lru);
+ pool_pages--;
+
+ if (pool_pages < pool_low)
+ pool_low = pool_pages;
- spin_lock_irqsave(&pgd_lock, flags);
/*
* Check for races, another CPU might have split this page
* up for us already:
@@ -410,17 +491,24 @@ static int split_large_page(pte_t *kpte, unsigned long address)
base = NULL;
out_unlock:
+ /*
+ * If we dropped out via the lookup_address check under
+ * pgd_lock then stick the page back into the pool:
+ */
+ if (base) {
+ list_add(&base->lru, &page_pool);
+ pool_pages++;
+ } else
+ pool_used++;
spin_unlock_irqrestore(&pgd_lock, flags);
- if (base)
- __free_pages(base, 0);
-
return 0;
}
static int __change_page_attr(unsigned long address, struct cpa_data *cpa)
{
- int level, do_split, err;
+ int do_split, err;
+ unsigned int level;
struct page *kpte_page;
pte_t *kpte;
@@ -600,6 +688,15 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
return 0;
+ /* Ensure we are PAGE_SIZE aligned */
+ if (addr & ~PAGE_MASK) {
+ addr &= PAGE_MASK;
+ /*
+ * People should not be passing in unaligned addresses:
+ */
+ WARN_ON_ONCE(1);
+ }
+
cpa.vaddr = addr;
cpa.numpages = numpages;
cpa.mask_set = mask_set;
@@ -612,7 +709,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
* Check whether we really changed something:
*/
if (!cpa.flushtlb)
- return ret;
+ goto out;
/*
* No need to flush, when we did not set any of the caching
@@ -631,6 +728,8 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
else
cpa_flush_all(cache);
+out:
+ cpa_fill_pool();
return ret;
}
@@ -771,8 +870,12 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
return;
/*
- * The return value is ignored - the calls cannot fail,
- * large pages are disabled at boot time:
+ * The return value is ignored as the calls cannot fail.
+ * Large pages are kept enabled at boot time, and are
+ * split up quickly with DEBUG_PAGEALLOC. If a splitup
+ * fails here (due to temporary memory shortage) no damage
+ * is done because we just keep the largepage intact up
+ * to the next attempt when it will likely be split up:
*/
if (enable)
__set_pages_p(page, numpages);
@@ -784,6 +887,12 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
* but that can deadlock->flush only current cpu:
*/
__flush_tlb_all();
+
+ /*
+ * Try to refill the page pool here. We can do this only after
+ * the tlb flush.
+ */
+ cpa_fill_pool();
}
#endif
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 6c1914622a88..73aba7125203 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -183,7 +183,7 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
}
-struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
+pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
struct page *pte;
@@ -192,6 +192,8 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
#else
pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
#endif
+ if (pte)
+ pgtable_page_ctor(pte);
return pte;
}
@@ -365,6 +367,7 @@ void check_pgt_cache(void)
void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
+ pgtable_page_dtor(pte);
paravirt_release_pt(page_to_pfn(pte));
tlb_remove_page(tlb, pte);
}
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 65416f843e59..ecd91ea8a8ae 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -488,7 +488,8 @@ void __init srat_reserve_add_area(int nodeid)
printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
"pre-allocated memory.\n", (unsigned long long)total_mb);
reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
- nodes_add[nodeid].end - nodes_add[nodeid].start);
+ nodes_add[nodeid].end - nodes_add[nodeid].start,
+ BOOTMEM_DEFAULT);
}
}