Diffstat (limited to 'arch/x86/mm')
 arch/x86/mm/Makefile  |   2
 arch/x86/mm/fault.c   |  66
 arch/x86/mm/init.c    | 148
 arch/x86/mm/numa.c    |  11
 arch/x86/mm/pgtable.c |  19
 5 files changed, 196 insertions(+), 50 deletions(-)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 23d8e5fecf76..6a19ad9f370d 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -6,6 +6,8 @@ nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_physaddr.o	:= $(nostackp)
 CFLAGS_setup_nx.o	:= $(nostackp)
 
+CFLAGS_fault.o := -I$(src)/../include/asm/trace
+
 obj-$(CONFIG_X86_PAT)	+= pat_rbtree.o
 obj-$(CONFIG_SMP)	+= tlb.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3aaeffcfd67a..9ff85bb8dd69 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,6 +20,9 @@
 #include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/
 #include <asm/fixmap.h>			/* VSYSCALL_START		*/
 
+#define CREATE_TRACE_POINTS
+#include <asm/trace/exceptions.h>
+
 /*
  * Page fault error code bits:
  *
@@ -51,7 +54,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
 	return 0;
 }
 
-static inline int __kprobes notify_page_fault(struct pt_regs *regs)
+static inline int __kprobes kprobes_fault(struct pt_regs *regs)
 {
 	int ret = 0;
 
@@ -596,7 +599,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 
 	printk(KERN_CONT " at %p\n", (void *) address);
 	printk(KERN_ALERT "IP:");
-	printk_address(regs->ip, 1);
+	printk_address(regs->ip);
 
 	dump_pagetable(address);
 }
@@ -1048,7 +1051,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 			return;
 
 		/* kprobes don't want to hook the spurious faults: */
-		if (notify_page_fault(regs))
+		if (kprobes_fault(regs))
 			return;
 		/*
 		 * Don't take the mm semaphore here. If we fixup a prefetch
@@ -1060,23 +1063,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	}
 
 	/* kprobes don't want to hook the spurious faults: */
-	if (unlikely(notify_page_fault(regs)))
+	if (unlikely(kprobes_fault(regs)))
 		return;
-	/*
-	 * It's safe to allow irq's after cr2 has been saved and the
-	 * vmalloc fault has been handled.
-	 *
-	 * User-mode registers count as a user access even for any
-	 * potential system fault or CPU buglet:
-	 */
-	if (user_mode_vm(regs)) {
-		local_irq_enable();
-		error_code |= PF_USER;
-		flags |= FAULT_FLAG_USER;
-	} else {
-		if (regs->flags & X86_EFLAGS_IF)
-			local_irq_enable();
-	}
 
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(regs, error_code, address);
@@ -1088,8 +1076,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 		}
 	}
 
-	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
-
 	/*
 	 * If we're in an interrupt, have no user context or are running
 	 * in an atomic region then we must not take the fault:
@@ -1099,6 +1085,24 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 		return;
 	}
 
+	/*
+	 * It's safe to allow irq's after cr2 has been saved and the
+	 * vmalloc fault has been handled.
+	 *
+	 * User-mode registers count as a user access even for any
+	 * potential system fault or CPU buglet:
+	 */
+	if (user_mode_vm(regs)) {
+		local_irq_enable();
+		error_code |= PF_USER;
+		flags |= FAULT_FLAG_USER;
+	} else {
+		if (regs->flags & X86_EFLAGS_IF)
+			local_irq_enable();
+	}
+
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
 	if (error_code & PF_WRITE)
 		flags |= FAULT_FLAG_WRITE;
 
@@ -1231,3 +1235,23 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	__do_page_fault(regs, error_code);
 	exception_exit(prev_state);
 }
+
+static void trace_page_fault_entries(struct pt_regs *regs,
+				     unsigned long error_code)
+{
+	if (user_mode(regs))
+		trace_page_fault_user(read_cr2(), regs, error_code);
+	else
+		trace_page_fault_kernel(read_cr2(), regs, error_code);
+}
+
+dotraplinkage void __kprobes
+trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+	enum ctx_state prev_state;
+
+	prev_state = exception_enter();
+	trace_page_fault_entries(regs, error_code);
+	__do_page_fault(regs, error_code);
+	exception_exit(prev_state);
+}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 04664cdb7fda..f97130618113 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -53,12 +53,12 @@ __ref void *alloc_low_pages(unsigned int num)
 	if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
 		unsigned long ret;
 		if (min_pfn_mapped >= max_pfn_mapped)
-			panic("alloc_low_page: ran out of memory");
+			panic("alloc_low_pages: ran out of memory");
 		ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
 					max_pfn_mapped << PAGE_SHIFT,
 					PAGE_SIZE * num , PAGE_SIZE);
 		if (!ret)
-			panic("alloc_low_page: can not alloc memory");
+			panic("alloc_low_pages: can not alloc memory");
 		memblock_reserve(ret, PAGE_SIZE * num);
 		pfn = ret >> PAGE_SHIFT;
 	} else {
@@ -399,29 +399,46 @@ static unsigned long __init init_range_memory_mapping(
 	return mapped_ram_size;
 }
 
-/* (PUD_SHIFT-PMD_SHIFT)/2 */
-#define STEP_SIZE_SHIFT 5
-void __init init_mem_mapping(void)
+static unsigned long __init get_new_step_size(unsigned long step_size)
+{
+	/*
+	 * Explain why we shift by 5 and why we don't have to worry about
+	 * 'step_size << 5' overflowing:
+	 *
+	 * initial mapped size is PMD_SIZE (2M).
+	 * We can not set step_size to be PUD_SIZE (1G) yet.
+	 * In worse case, when we cross the 1G boundary, and
+	 * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k)
+	 * to map 1G range with PTE. Use 5 as shift for now.
+	 *
+	 * Don't need to worry about overflow, on 32bit, when step_size
+	 * is 0, round_down() returns 0 for start, and that turns it
+	 * into 0x100000000ULL.
+	 */
+	return step_size << 5;
+}
+
+/**
+ * memory_map_top_down - Map [map_start, map_end) top down
+ * @map_start: start address of the target memory range
+ * @map_end: end address of the target memory range
+ *
+ * This function will setup direct mapping for memory range
+ * [map_start, map_end) in top-down. That said, the page tables
+ * will be allocated at the end of the memory, and we map the
+ * memory in top-down.
+ */
+static void __init memory_map_top_down(unsigned long map_start,
+				       unsigned long map_end)
 {
-	unsigned long end, real_end, start, last_start;
+	unsigned long real_end, start, last_start;
 	unsigned long step_size;
 	unsigned long addr;
 	unsigned long mapped_ram_size = 0;
 	unsigned long new_mapped_ram_size;
 
-	probe_page_size_mask();
-
-#ifdef CONFIG_X86_64
-	end = max_pfn << PAGE_SHIFT;
-#else
-	end = max_low_pfn << PAGE_SHIFT;
-#endif
-
-	/* the ISA range is always mapped regardless of memory holes */
-	init_memory_mapping(0, ISA_END_ADDRESS);
-
 	/* xen has big range in reserved near end of ram, skip it at first.*/
-	addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE, PMD_SIZE);
+	addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
 	real_end = addr + PMD_SIZE;
 
 	/* step_size need to be small so pgt_buf from BRK could cover it */
@@ -436,25 +453,106 @@ void __init init_mem_mapping(void)
 	 * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
 	 * for page table.
 	 */
-	while (last_start > ISA_END_ADDRESS) {
+	while (last_start > map_start) {
 		if (last_start > step_size) {
 			start = round_down(last_start - 1, step_size);
-			if (start < ISA_END_ADDRESS)
-				start = ISA_END_ADDRESS;
+			if (start < map_start)
+				start = map_start;
 		} else
-			start = ISA_END_ADDRESS;
+			start = map_start;
 		new_mapped_ram_size = init_range_memory_mapping(start,
 							last_start);
 		last_start = start;
 		min_pfn_mapped = last_start >> PAGE_SHIFT;
 		/* only increase step_size after big range get mapped */
 		if (new_mapped_ram_size > mapped_ram_size)
-			step_size <<= STEP_SIZE_SHIFT;
+			step_size = get_new_step_size(step_size);
 		mapped_ram_size += new_mapped_ram_size;
 	}
 
-	if (real_end < end)
-		init_range_memory_mapping(real_end, end);
+	if (real_end < map_end)
+		init_range_memory_mapping(real_end, map_end);
+}
+
+/**
+ * memory_map_bottom_up - Map [map_start, map_end) bottom up
+ * @map_start: start address of the target memory range
+ * @map_end: end address of the target memory range
+ *
+ * This function will setup direct mapping for memory range
+ * [map_start, map_end) in bottom-up. Since we have limited the
+ * bottom-up allocation above the kernel, the page tables will
+ * be allocated just above the kernel and we map the memory
+ * in [map_start, map_end) in bottom-up.
+ */
+static void __init memory_map_bottom_up(unsigned long map_start,
+					unsigned long map_end)
+{
+	unsigned long next, new_mapped_ram_size, start;
+	unsigned long mapped_ram_size = 0;
+	/* step_size need to be small so pgt_buf from BRK could cover it */
+	unsigned long step_size = PMD_SIZE;
+
+	start = map_start;
+	min_pfn_mapped = start >> PAGE_SHIFT;
+
+	/*
+	 * We start from the bottom (@map_start) and go to the top (@map_end).
+	 * The memblock_find_in_range() gets us a block of RAM from the
+	 * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
+	 * for page table.
+	 */
+	while (start < map_end) {
+		if (map_end - start > step_size) {
+			next = round_up(start + 1, step_size);
+			if (next > map_end)
+				next = map_end;
+		} else
+			next = map_end;
+
+		new_mapped_ram_size = init_range_memory_mapping(start, next);
+		start = next;
+
+		if (new_mapped_ram_size > mapped_ram_size)
+			step_size = get_new_step_size(step_size);
+		mapped_ram_size += new_mapped_ram_size;
+	}
+}
+
+void __init init_mem_mapping(void)
+{
+	unsigned long end;
+
+	probe_page_size_mask();
+
+#ifdef CONFIG_X86_64
+	end = max_pfn << PAGE_SHIFT;
+#else
+	end = max_low_pfn << PAGE_SHIFT;
+#endif
+
+	/* the ISA range is always mapped regardless of memory holes */
+	init_memory_mapping(0, ISA_END_ADDRESS);
+
+	/*
+	 * If the allocation is in bottom-up direction, we setup direct mapping
+	 * in bottom-up, otherwise we setup direct mapping in top-down.
+	 */
+	if (memblock_bottom_up()) {
+		unsigned long kernel_end = __pa_symbol(_end);
+
+		/*
+		 * we need two separate calls here. This is because we want to
+		 * allocate page tables above the kernel. So we first map
+		 * [kernel_end, end) to make memory above the kernel be mapped
+		 * as soon as possible. And then use page tables allocated above
+		 * the kernel to map [ISA_END_ADDRESS, kernel_end).
+		 */
+		memory_map_bottom_up(kernel_end, end);
+		memory_map_bottom_up(ISA_END_ADDRESS, kernel_end);
+	} else {
+		memory_map_top_down(ISA_END_ADDRESS, end);
+	}
 
 #ifdef CONFIG_X86_64
 	if (max_pfn > max_low_pfn) {
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 8bf93bae1f13..24aec58d6afd 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -567,6 +567,17 @@ static int __init numa_init(int (*init_func)(void))
 	ret = init_func();
 	if (ret < 0)
 		return ret;
+
+	/*
+	 * We reset memblock back to the top-down direction
+	 * here because if we configured ACPI_NUMA, we have
+	 * parsed SRAT in init_func(). It is ok to have the
+	 * reset here even if we did't configure ACPI_NUMA
+	 * or acpi numa init fails and fallbacks to dummy
+	 * numa init.
+	 */
+	memblock_set_bottom_up(false);
+
 	ret = numa_cleanup_meminfo(&numa_meminfo);
 	if (ret < 0)
 		return ret;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index dfa537a03be1..a7cccb6d7fec 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -25,8 +25,12 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
 	struct page *pte;
 
 	pte = alloc_pages(__userpte_alloc_gfp, 0);
-	if (pte)
-		pgtable_page_ctor(pte);
+	if (!pte)
+		return NULL;
+	if (!pgtable_page_ctor(pte)) {
+		__free_page(pte);
+		return NULL;
+	}
 	return pte;
 }
 
@@ -189,8 +193,10 @@ static void free_pmds(pmd_t *pmds[])
 	int i;
 
 	for(i = 0; i < PREALLOCATED_PMDS; i++)
-		if (pmds[i])
+		if (pmds[i]) {
+			pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
 			free_page((unsigned long)pmds[i]);
+		}
 }
 
 static int preallocate_pmds(pmd_t *pmds[])
@@ -200,8 +206,13 @@ static int preallocate_pmds(pmd_t *pmds[])
 {
 	int i;
 	bool failed = false;
 
 	for(i = 0; i < PREALLOCATED_PMDS; i++) {
 		pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
-		if (pmd == NULL)
+		if (!pmd)
 			failed = true;
+		if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
+			free_page((unsigned long)pmds[i]);
+			pmd = NULL;
+			failed = true;
+		}
 		pmds[i] = pmd;
 	}
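For readers skimming the init.c change: the top-down path walks RAM from real_end down toward ISA_END_ADDRESS in windows that grow via get_new_step_size() (a shift by 5) once a window larger than everything mapped so far has been handled, so the first pass stays small enough for the BRK-backed page-table buffer. The user-space sketch below only simulates that loop shape; the pretend 4 GiB of RAM, the round_down_ul() helper and the byte-count stand-in for init_range_memory_mapping() are illustrative assumptions rather than kernel code, and a 64-bit unsigned long is assumed.

/*
 * User-space sketch, not kernel code: it models only the loop structure of
 * memory_map_top_down() from the diff above.  The 4 GiB "RAM" size, the
 * round_down_ul() helper and the byte-count stand-in for
 * init_range_memory_mapping() are assumptions for illustration; a 64-bit
 * unsigned long is assumed so step_size << 5 never overflows here.
 */
#include <stdio.h>

#define PMD_SIZE	(2UL << 20)	/* 2 MiB, initial window */
#define ISA_END_ADDRESS	(1UL << 20)	/* 1 MiB, lower bound of the walk */

static unsigned long round_down_ul(unsigned long v, unsigned long align)
{
	return v & ~(align - 1);	/* align must be a power of two */
}

static unsigned long get_new_step_size(unsigned long step_size)
{
	return step_size << 5;		/* same growth factor as the patch */
}

int main(void)
{
	unsigned long map_start = ISA_END_ADDRESS;
	unsigned long map_end = 4UL << 30;	/* pretend 4 GiB of RAM */
	unsigned long real_end = map_end;	/* ignore the Xen reserve here */
	unsigned long last_start = real_end;
	unsigned long step_size = PMD_SIZE;
	unsigned long mapped_ram_size = 0;

	while (last_start > map_start) {
		unsigned long start, new_mapped_ram_size;

		if (last_start > step_size) {
			start = round_down_ul(last_start - 1, step_size);
			if (start < map_start)
				start = map_start;
		} else
			start = map_start;

		/* stand-in for init_range_memory_mapping(start, last_start) */
		new_mapped_ram_size = last_start - start;

		printf("map [%#lx, %#lx) with step_size %#lx\n",
		       start, last_start, step_size);

		last_start = start;
		/* only grow the window after a bigger range got mapped */
		if (new_mapped_ram_size > mapped_ram_size)
			step_size = get_new_step_size(step_size);
		mapped_ram_size += new_mapped_ram_size;
	}

	printf("total mapped: %#lx bytes\n", mapped_ram_size);
	return 0;
}

On these made-up numbers it prints four shrinking windows while step_size grows 2M, 64M, 2G, 64G, which is the point of the kernel loop: the first, tiny window fits the BRK page-table pool, and each mapped window then provides room for the page tables of the next, larger range below it.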