From 6afb5157b9eba4092e2f0f54d24a3806409bdde5 Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Wed, 19 May 2010 17:42:14 +0800 Subject: x86, mm: Separate x86_64 vmalloc_sync_all() into separate functions No behavior change. Move some of vmalloc_sync_all() code into a new function sync_global_pgds() that will be useful for memory hotplug. Signed-off-by: Haicheng Li LKML-Reference: <4C6E4ECD.1090607@linux.intel.com> Reviewed-by: Wu Fengguang Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_64.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9a6674689a20..61a1b4fdecbf 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -97,6 +97,36 @@ static int __init nonx32_setup(char *str) } __setup("noexec32=", nonx32_setup); +/* + * When memory was added/removed make sure all the processes MM have + * suitable PGD entries in the local PGD level page. + */ +void sync_global_pgds(unsigned long start, unsigned long end) +{ + unsigned long address; + + for (address = start; address <= end; address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) + != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + /* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. -- cgit v1.2.3 From 9b861528a8012e7bc4d1f7bae07395b225331477 Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Fri, 20 Aug 2010 17:50:16 +0800 Subject: x86-64, mem: Update all PGDs for direct mapping and vmemmap mapping changes When memory hotplug-adding happens for a large enough area that a new PGD entry is needed for the direct mapping, the PGDs of other processes would not get updated. This leads to some CPUs oopsing like below when they have to access the unmapped areas. [ 1139.243192] BUG: soft lockup - CPU#0 stuck for 61s! 
[bash:6534] [ 1139.243195] Modules linked in: ipv6 autofs4 rfcomm l2cap crc16 bluetooth rfkill binfmt_misc dm_mirror dm_region_hash dm_log dm_multipath dm_mod video output sbs sbshc fan battery ac parport_pc lp parport joydev usbhid processor thermal thermal_sys container button rtc_cmos rtc_core rtc_lib i2c_i801 i2c_core pcspkr uhci_hcd ohci_hcd ehci_hcd usbcore [ 1139.243229] irq event stamp: 8538759 [ 1139.243230] hardirqs last enabled at (8538759): [] restore_args+0x0/0x30 [ 1139.243236] hardirqs last disabled at (8538757): [] __do_softirq+0x106/0x146 [ 1139.243240] softirqs last enabled at (8538758): [] __do_softirq+0x137/0x146 [ 1139.243245] softirqs last disabled at (8538743): [] call_softirq+0x1c/0x34 [ 1139.243249] CPU 0: [ 1139.243250] Modules linked in: ipv6 autofs4 rfcomm l2cap crc16 bluetooth rfkill binfmt_misc dm_mirror dm_region_hash dm_log dm_multipath dm_mod video output sbs sbshc fan battery ac parport_pc lp parport joydev usbhid processor thermal thermal_sys container button rtc_cmos rtc_core rtc_lib i2c_i801 i2c_core pcspkr uhci_hcd ohci_hcd ehci_hcd usbcore [ 1139.243284] Pid: 6534, comm: bash Tainted: G M 2.6.32-haicheng-cpuhp #7 QSSC-S4R [ 1139.243287] RIP: 0010:[] [] alloc_arraycache+0x35/0x69 [ 1139.243292] RSP: 0018:ffff8802799f9d78 EFLAGS: 00010286 [ 1139.243295] RAX: ffff8884ffc00000 RBX: ffff8802799f9d98 RCX: 0000000000000000 [ 1139.243297] RDX: 0000000000190018 RSI: 0000000000000001 RDI: ffff8884ffc00010 [ 1139.243300] RBP: ffffffff8100c34e R08: 0000000000000002 R09: 0000000000000000 [ 1139.243303] R10: ffffffff8246dda0 R11: 000000d08246dda0 R12: ffff8802599bfff0 [ 1139.243305] R13: ffff88027904c040 R14: ffff8802799f8000 R15: 0000000000000001 [ 1139.243308] FS: 00007fe81bfe86e0(0000) GS:ffff88000d800000(0000) knlGS:0000000000000000 [ 1139.243311] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1139.243313] CR2: ffff8884ffc00000 CR3: 000000026cf2d000 CR4: 00000000000006f0 [ 1139.243316] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1139.243318] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 1139.243321] Call Trace: [ 1139.243324] [] ? alloc_arraycache+0x29/0x69 [ 1139.243328] [] ? cpuup_callback+0x1b0/0x32a [ 1139.243333] [] ? notifier_call_chain+0x33/0x5b [ 1139.243337] [] ? __raw_notifier_call_chain+0x9/0xb [ 1139.243340] [] ? cpu_up+0xb3/0x152 [ 1139.243344] [] ? store_online+0x4d/0x75 [ 1139.243348] [] ? sysdev_store+0x1b/0x1d [ 1139.243351] [] ? sysfs_write_file+0xe5/0x121 [ 1139.243355] [] ? vfs_write+0xae/0x14a [ 1139.243358] [] ? sys_write+0x47/0x6f [ 1139.243362] [] ? system_call_fastpath+0x16/0x1b This patch makes sure to always replicate new direct mapping PGD entries to the PGDs of all processes, as well as ensures corresponding vmemmap mapping gets synced. V1: initial code by Andi Kleen. V2: fix several issues found in testing. V3: as suggested by Wu Fengguang, reuse common code of vmalloc_sync_all(). [ hpa: changed pgd_change from int to bool ] Originally-by: Andi Kleen Signed-off-by: Haicheng Li LKML-Reference: <4C6E4FD8.6080100@linux.intel.com> Reviewed-by: Wu Fengguang Reviewed-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/init_64.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 61a1b4fdecbf..64e7bc2ded93 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -564,8 +564,9 @@ kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) { - + bool pgd_changed = false; unsigned long next, last_map_addr = end; + unsigned long addr; start = (unsigned long)__va(start); end = (unsigned long)__va(end); @@ -593,7 +594,12 @@ kernel_physical_mapping_init(unsigned long start, spin_lock(&init_mm.page_table_lock); pgd_populate(&init_mm, pgd, __va(pud_phys)); spin_unlock(&init_mm.page_table_lock); + pgd_changed = true; } + + if (pgd_changed) + sync_global_pgds(addr, end); + __flush_tlb_all(); return last_map_addr; @@ -1033,6 +1039,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) } } + sync_global_pgds((unsigned long)start_page, end); return 0; } -- cgit v1.2.3 From f88eff74aa848e58b1ea49768c0bbb874b31357f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Aug 2010 13:39:15 -0700 Subject: bootmem, x86: Add weak version of reserve_bootmem_generic It will be used by memblock_x86_to_bootmem() during the conversion. It is a wrapper for reserve_bootmem(), and x86 64-bit uses a special one. Also clean up that version for x86_64: we don't need to take care of the NUMA path there, bootmem can handle it anyhow. Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_32.c | 6 ------ arch/x86/mm/init_64.c | 20 ++------------------ mm/bootmem.c | 6 ++++++ 3 files changed, 8 insertions(+), 24 deletions(-) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bca79091b9d6..90e054589aae 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1069,9 +1069,3 @@ void mark_rodata_ro(void) #endif } #endif - -int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, - int flags) -{ - return reserve_bootmem(phys, len, flags); -} diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ee41bba315d1..634fa0884a41 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -799,13 +799,10 @@ void mark_rodata_ro(void) #endif +#ifndef CONFIG_NO_BOOTMEM int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, int flags) { -#ifdef CONFIG_NUMA - int nid, next_nid; - int ret; -#endif unsigned long pfn = phys >> PAGE_SHIFT; if (pfn >= max_pfn) { @@ -821,21 +818,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, return -EFAULT; } - /* Should check here against the e820 map to avoid double free */ -#ifdef CONFIG_NUMA - nid = phys_to_nid(phys); - next_nid = phys_to_nid(phys + len - 1); - if (nid == next_nid) - ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags); - else - ret = reserve_bootmem(phys, len, flags); - - if (ret != 0) - return ret; - -#else reserve_bootmem(phys, len, flags); -#endif if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { dma_reserve += len / PAGE_SIZE; @@ -844,6 +827,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, return 0; } +#endif int kern_addr_valid(unsigned long addr) { diff --git a/mm/bootmem.c b/mm/bootmem.c index 142c84a54993..bde170dd2fde 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -526,6 +526,12 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, } #ifndef CONFIG_NO_BOOTMEM +int __weak __init
reserve_bootmem_generic(unsigned long phys, unsigned long len, + int flags) +{ + return reserve_bootmem(phys, len, flags); +} + static unsigned long __init align_idx(struct bootmem_data *bdata, unsigned long idx, unsigned long step) { -- cgit v1.2.3 From a9ce6bc15100023b411f8117e53a016d61889800 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Aug 2010 13:39:17 -0700 Subject: x86, memblock: Replace e820_/_early string with memblock_ 1. Include linux/memblock.h directly, so that references to e820.h can later be reduced. 2. This patch is done mainly by sed scripts. -v2: use MEMBLOCK_ERROR instead of -1ULL or -1UL Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/efi.h | 2 +- arch/x86/kernel/acpi/sleep.c | 9 +++++---- arch/x86/kernel/apic/numaq_32.c | 3 ++- arch/x86/kernel/efi.c | 5 +++-- arch/x86/kernel/head32.c | 4 ++-- arch/x86/kernel/head64.c | 4 ++-- arch/x86/kernel/setup.c | 29 ++++++++++++++--------------- arch/x86/kernel/trampoline.c | 10 +++++----- arch/x86/mm/init.c | 10 ++++++---- arch/x86/mm/init_32.c | 14 ++++++++------ arch/x86/mm/init_64.c | 11 ++++++----- arch/x86/mm/k8topology_64.c | 4 +++- arch/x86/mm/memtest.c | 7 +++---- arch/x86/mm/numa_32.c | 25 +++++++++++++------------ arch/x86/mm/numa_64.c | 34 +++++++++++++++++----------------- arch/x86/mm/srat_32.c | 3 ++- arch/x86/mm/srat_64.c | 11 ++++++----- arch/x86/xen/mmu.c | 5 +++-- arch/x86/xen/setup.c | 3 ++- mm/bootmem.c | 4 ++-- 20 files changed, 105 insertions(+), 92 deletions(-) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 8406ed7f9926..8e4a16508d4e 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -90,7 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, #endif /* CONFIG_X86_32 */ extern int add_efi_memmap; -extern void efi_reserve_early(void); +extern void efi_memblock_x86_reserve_range(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index fcc3c61fdecc..d829e75f9687 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -7,6 +7,7 @@ #include #include +#include <linux/memblock.h> #include #include #include @@ -125,7 +126,7 @@ void acpi_restore_state_mem(void) */ void __init acpi_reserve_wakeup_memory(void) { - unsigned long mem; + phys_addr_t mem; if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { printk(KERN_ERR @@ -133,15 +134,15 @@ void __init acpi_reserve_wakeup_memory(void) return; } - mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE); + mem = memblock_find_in_range(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE); - if (mem == -1L) { + if (mem == MEMBLOCK_ERROR) { printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); return; } acpi_realmode = (unsigned long) phys_to_virt(mem); acpi_wakeup_address = mem; - reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP"); + memblock_x86_reserve_range(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP"); } diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 3e28401f161c..960f26ab5c9f 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -26,6 +26,7 @@ #include #include #include +#include <linux/memblock.h> #include #include #include @@ -88,7 +89,7 @@ static inline void numaq_register_node(int node, struct sys_cfg_data *scd) node_end_pfn[node] = MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); - e820_register_active_regions(node, node_start_pfn[node], +
memblock_x86_register_active_regions(node, node_start_pfn[node], node_end_pfn[node]); memory_present(node, node_start_pfn[node], node_end_pfn[node]); diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index c2fa9b8b497e..0fe27d7c6258 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -275,7 +276,7 @@ static void __init do_add_efi_memmap(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } -void __init efi_reserve_early(void) +void __init efi_memblock_x86_reserve_range(void) { unsigned long pmap; @@ -290,7 +291,7 @@ void __init efi_reserve_early(void) boot_params.efi_info.efi_memdesc_size; memmap.desc_version = boot_params.efi_info.efi_memdesc_version; memmap.desc_size = boot_params.efi_info.efi_memdesc_size; - reserve_early(pmap, pmap + memmap.nr_map * memmap.desc_size, + memblock_x86_reserve_range(pmap, pmap + memmap.nr_map * memmap.desc_size, "EFI memmap"); } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index da60aa8a850f..74e4cf65043e 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -42,7 +42,7 @@ void __init i386_start_kernel(void) memblock_x86_reserve_range(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE"); #endif - reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); + memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ @@ -51,7 +51,7 @@ void __init i386_start_kernel(void) u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); + memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 8ee930fdeeb9..97adf9828b95 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -101,7 +101,7 @@ void __init x86_64_start_reservations(char *real_mode_data) memblock_init(); - reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); + memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ @@ -110,7 +110,7 @@ void __init x86_64_start_reservations(char *real_mode_data) unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); + memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bbe0aaf77494..a4f01733e879 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -302,7 +302,7 @@ static inline void init_gbpages(void) static void __init reserve_brk(void) { if (_brk_end > _brk_start) - reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK"); + memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK"); /* Mark brk area as locked down and no longer taking any new allocations */ @@ -324,17 +324,16 @@ static void __init relocate_initrd(void) char *p, *q; /* We need to move the initrd down into lowmem */ - ramdisk_here = find_e820_area(0, end_of_lowmem, area_size, + ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, PAGE_SIZE); - if (ramdisk_here == 
-1ULL) + if (ramdisk_here == MEMBLOCK_ERROR) panic("Cannot find place for new RAMDISK of size %lld\n", ramdisk_size); /* Note: this includes all the lowmem currently occupied by the initrd, we rely on that fact to keep the data intact. */ - reserve_early(ramdisk_here, ramdisk_here + area_size, - "NEW RAMDISK"); + memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK"); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", @@ -390,7 +389,7 @@ static void __init reserve_initrd(void) initrd_start = 0; if (ramdisk_size >= (end_of_lowmem>>1)) { - free_early(ramdisk_image, ramdisk_end); + memblock_x86_free_range(ramdisk_image, ramdisk_end); printk(KERN_ERR "initrd too large to handle, " "disabling initrd\n"); return; @@ -413,7 +412,7 @@ static void __init reserve_initrd(void) relocate_initrd(); - free_early(ramdisk_image, ramdisk_end); + memblock_x86_free_range(ramdisk_image, ramdisk_end); } #else static void __init reserve_initrd(void) @@ -469,7 +468,7 @@ static void __init e820_reserve_setup_data(void) e820_print_map("reserve setup_data"); } -static void __init reserve_early_setup_data(void) +static void __init memblock_x86_reserve_range_setup_data(void) { struct setup_data *data; u64 pa_data; @@ -481,7 +480,7 @@ static void __init reserve_early_setup_data(void) while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); sprintf(buf, "setup data %x", data->type); - reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); + memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf); pa_data = data->next; early_iounmap(data, sizeof(*data)); } @@ -519,23 +518,23 @@ static void __init reserve_crashkernel(void) if (crash_base <= 0) { const unsigned long long alignment = 16<<20; /* 16M */ - crash_base = find_e820_area(alignment, ULONG_MAX, crash_size, + crash_base = memblock_find_in_range(alignment, ULONG_MAX, crash_size, alignment); - if (crash_base == -1ULL) { + if (crash_base == MEMBLOCK_ERROR) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; } } else { unsigned long long start; - start = find_e820_area(crash_base, ULONG_MAX, crash_size, + start = memblock_find_in_range(crash_base, ULONG_MAX, crash_size, 1<<20); if (start != crash_base) { pr_info("crashkernel reservation failed - memory is in use.\n"); return; } } - reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL"); + memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL"); printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " "for crashkernel (System RAM: %ldMB)\n", @@ -786,7 +785,7 @@ void __init setup_arch(char **cmdline_p) #endif 4)) { efi_enabled = 1; - efi_reserve_early(); + efi_memblock_x86_reserve_range(); } #endif @@ -846,7 +845,7 @@ void __init setup_arch(char **cmdline_p) vmi_activate(); /* after early param, so could get panic from serial */ - reserve_early_setup_data(); + memblock_x86_reserve_range_setup_data(); if (acpi_mps_check()) { #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index c652ef62742d..7c2102c2aadf 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -1,7 +1,7 @@ #include +#include #include -#include #if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) #define __trampinit @@ -16,15 +16,15 @@ unsigned char *__trampinitdata trampoline_base; void __init reserve_trampoline_memory(void) { - unsigned long mem; + phys_addr_t 
mem; /* Has to be in very low memory so we can execute real-mode AP code. */ - mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); - if (mem == -1L) + mem = memblock_find_in_range(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); + if (mem == MEMBLOCK_ERROR) panic("Cannot allocate trampoline\n"); trampoline_base = __va(mem); - reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); + memblock_x86_reserve_range(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); } /* diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index b278535b14aa..c0e28a13de7d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -33,6 +34,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse, int use_gbpages) { unsigned long puds, pmds, ptes, tables, start; + phys_addr_t base; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); @@ -75,12 +77,12 @@ static void __init find_early_table_space(unsigned long end, int use_pse, #else start = 0x8000; #endif - e820_table_start = find_e820_area(start, max_pfn_mapped<>= PAGE_SHIFT; + e820_table_start = base >> PAGE_SHIFT; e820_table_end = e820_table_start; e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); @@ -299,7 +301,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __flush_tlb_all(); if (!after_bootmem && e820_table_end > e820_table_start) - reserve_early(e820_table_start << PAGE_SHIFT, + memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT, e820_table_end << PAGE_SHIFT, "PGTABLE"); if (!after_bootmem) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 90e054589aae..63b09bae2509 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -712,14 +713,14 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) highstart_pfn = max_low_pfn; - e820_register_active_regions(0, 0, highend_pfn); + memblock_x86_register_active_regions(0, 0, highend_pfn); sparse_memory_present_with_active_regions(0); printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - e820_register_active_regions(0, 0, max_low_pfn); + memblock_x86_register_active_regions(0, 0, max_low_pfn); sparse_memory_present_with_active_regions(0); num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; @@ -776,16 +777,16 @@ void __init setup_bootmem_allocator(void) { #ifndef CONFIG_NO_BOOTMEM int nodeid; - unsigned long bootmap_size, bootmap; + phys_addr_t bootmap_size, bootmap; /* * Initialize the boot-time allocator (with low memory only): */ bootmap_size = bootmem_bootmap_pages(max_low_pfn)< #include #include +#include #include #include #include @@ -577,18 +578,18 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, unsigned long bootmap_size, bootmap; bootmap_size = bootmem_bootmap_pages(end_pfn)<> PAGE_SHIFT, 0, end_pfn); - e820_register_active_regions(0, start_pfn, end_pfn); + memblock_x86_register_active_regions(0, start_pfn, end_pfn); free_bootmem_with_active_regions(0, end_pfn); #else - e820_register_active_regions(0, start_pfn, end_pfn); + memblock_x86_register_active_regions(0, start_pfn, end_pfn); #endif } #endif diff --git a/arch/x86/mm/k8topology_64.c 
b/arch/x86/mm/k8topology_64.c index 970ed579d4e4..966de9372e8c 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -11,6 +11,8 @@ #include #include #include +#include + #include #include #include @@ -222,7 +224,7 @@ int __init k8_scan_nodes(void) for_each_node_mask(i, node_possible_map) { int j; - e820_register_active_regions(i, + memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); for (j = apicid_base; j < cores + apicid_base; j++) diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 18d244f70205..92faf3a1c53e 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -6,8 +6,7 @@ #include #include #include - -#include +#include static u64 patterns[] __initdata = { 0, @@ -35,7 +34,7 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) (unsigned long long) pattern, (unsigned long long) start_bad, (unsigned long long) end_bad); - reserve_early(start_bad, end_bad, "BAD RAM"); + memblock_x86_reserve_range(start_bad, end_bad, "BAD RAM"); } static void __init memtest(u64 pattern, u64 start_phys, u64 size) @@ -74,7 +73,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end) u64 size = 0; while (start < end) { - start = find_e820_area_size(start, &size, 1); + start = memblock_x86_find_in_range_size(start, &size, 1); /* done ? */ if (start >= end) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 809baaaf48b1..ddf9730b2061 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -120,7 +121,7 @@ int __init get_memcfg_numa_flat(void) node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; - e820_register_active_regions(0, 0, max_pfn); + memblock_x86_register_active_regions(0, 0, max_pfn); memory_present(0, 0, max_pfn); node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); @@ -161,14 +162,14 @@ static void __init allocate_pgdat(int nid) NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; else { unsigned long pgdat_phys; - pgdat_phys = find_e820_area(min_low_pfn<>PAGE_SHIFT)); memset(buf, 0, sizeof(buf)); sprintf(buf, "NODE_DATA %d", nid); - reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); + memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); } printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", nid, (unsigned long)NODE_DATA(nid)); @@ -291,15 +292,15 @@ static __init unsigned long calculate_numa_remap_pages(void) PTRS_PER_PTE); node_kva_target <<= PAGE_SHIFT; do { - node_kva_final = find_e820_area(node_kva_target, + node_kva_final = memblock_find_in_range(node_kva_target, ((u64)node_end_pfn[nid])<>PAGE_SHIFT) > (node_start_pfn[nid])); - if (node_kva_final == -1ULL) + if (node_kva_final == MEMBLOCK_ERROR) panic("Can not get kva ram\n"); node_remap_size[nid] = size; @@ -318,9 +319,9 @@ static __init unsigned long calculate_numa_remap_pages(void) * but we could have some hole in high memory, and it will only * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide * to use it as free. 
- * So reserve_early here, hope we don't run out of that array + * So memblock_x86_reserve_range here, hope we don't run out of that array */ - reserve_early(node_kva_final, + memblock_x86_reserve_range(node_kva_final, node_kva_final+(((u64)size)<> PAGE_SHIFT; kva_target_pfn -= PTRS_PER_PTE; - } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn); + } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn); - if (kva_start_pfn == -1UL) + if (kva_start_pfn == MEMBLOCK_ERROR) panic("Can not get kva space\n"); printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", @@ -382,7 +383,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, printk(KERN_INFO "max_pfn = %lx\n", max_pfn); /* avoid clash with initrd */ - reserve_early(kva_start_pfn< physnodes[i].end) { end = physnodes[i].end; @@ -467,7 +467,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, * this one must extend to the boundary. */ if (end < dma32_end && dma32_end - end - - e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) end = dma32_end; /* @@ -476,7 +476,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, * physical node. */ if (physnodes[i].end - end - - e820_hole_size(end, physnodes[i].end) < size) + memblock_x86_hole_size(end, physnodes[i].end) < size) end = physnodes[i].end; /* @@ -504,7 +504,7 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) { u64 end = start + size; - while (end - start - e820_hole_size(start, end) < size) { + while (end - start - memblock_x86_hole_size(start, end) < size) { end += FAKE_NODE_MIN_SIZE; if (end > max_addr) { end = max_addr; @@ -533,7 +533,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) * creates a uniform distribution of node sizes across the entire * machine (but not necessarily over physical nodes). */ - min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) / + min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / MAX_NUMNODES; min_size = max(min_size, FAKE_NODE_MIN_SIZE); if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) @@ -566,7 +566,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) * this one must extend to the boundary. */ if (end < dma32_end && dma32_end - end - - e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) end = dma32_end; /* @@ -575,7 +575,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) * physical node. 
*/ if (physnodes[i].end - end - - e820_hole_size(end, physnodes[i].end) < size) + memblock_x86_hole_size(end, physnodes[i].end) < size) end = physnodes[i].end; /* @@ -639,7 +639,7 @@ static int __init numa_emulation(unsigned long start_pfn, */ remove_all_active_ranges(); for_each_node_mask(i, node_possible_map) { - e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, + memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); setup_node_bootmem(i, nodes[i].start, nodes[i].end); } @@ -692,7 +692,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, node_set(0, node_possible_map); for (i = 0; i < nr_cpu_ids; i++) numa_set_node(i, 0); - e820_register_active_regions(0, start_pfn, last_pfn); + memblock_x86_register_active_regions(0, start_pfn, last_pfn); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); } diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index 9324f13492d5..a17dffd136c1 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -25,6 +25,7 @@ */ #include #include +#include #include #include #include @@ -264,7 +265,7 @@ int __init get_memcfg_from_srat(void) if (node_read_chunk(chunk->nid, chunk)) continue; - e820_register_active_regions(chunk->nid, chunk->start_pfn, + memblock_x86_register_active_regions(chunk->nid, chunk->start_pfn, min(chunk->end_pfn, max_pfn)); } /* for out of order entries in SRAT */ diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index f9897f7a9ef1..7f44eb62a5e9 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -98,15 +99,15 @@ void __init acpi_numa_slit_init(struct acpi_table_slit *slit) unsigned long phys; length = slit->header.length; - phys = find_e820_area(0, max_pfn_mapped< x2APIC mapping */ @@ -324,7 +325,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) pxmram = 0; } - e820ram = max_pfn - (e820_hole_size(0, max_pfn<>PAGE_SHIFT); + e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<>PAGE_SHIFT); /* We seem to lose 3 pages somewhere. Allow 1M of slack. 
*/ if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) { printk(KERN_ERR @@ -421,7 +422,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) } for_each_node_mask(i, nodes_parsed) - e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, + memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); /* for out of order entries in SRAT */ sort_node_map(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 914f04695ce5..b511f1986911 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -44,6 +44,7 @@ #include #include #include +#include <linux/memblock.h> #include #include @@ -1735,7 +1736,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, __xen_write_cr3(true, __pa(pgd)); xen_mc_issue(PARAVIRT_LAZY_CPU); - reserve_early(__pa(xen_start_info->pt_base), + memblock_x86_reserve_range(__pa(xen_start_info->pt_base), __pa(xen_start_info->pt_base + xen_start_info->nr_pt_frames * PAGE_SIZE), "XEN PAGETABLES"); @@ -1773,7 +1774,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); - reserve_early(__pa(xen_start_info->pt_base), + memblock_x86_reserve_range(__pa(xen_start_info->pt_base), __pa(xen_start_info->pt_base + xen_start_info->nr_pt_frames * PAGE_SIZE), "XEN PAGETABLES"); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index ad0047f47cd4..2ac8f29f89cb 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -8,6 +8,7 @@ #include #include #include +#include <linux/memblock.h> #include #include @@ -61,7 +62,7 @@ char * __init xen_memory_setup(void) * - xen_start_info * See comment above "struct start_info" in */ - reserve_early(__pa(xen_start_info->mfn_list), + memblock_x86_reserve_range(__pa(xen_start_info->mfn_list), __pa(xen_start_info->pt_base), "XEN START INFO"); diff --git a/mm/bootmem.c b/mm/bootmem.c index fda01a2c31af..13b0caa9793c 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -436,7 +436,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, { #ifdef CONFIG_NO_BOOTMEM kmemleak_free_part(__va(physaddr), size); - free_early(physaddr, physaddr + size); + memblock_x86_free_range(physaddr, physaddr + size); #else unsigned long start, end; @@ -462,7 +462,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size) { #ifdef CONFIG_NO_BOOTMEM kmemleak_free_part(__va(addr), size); - free_early(addr, addr + size); + memblock_x86_free_range(addr, addr + size); #else unsigned long start, end; -- cgit v1.2.3 From 6f2a75369e7561e800d86927ecd83c970996b21f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Aug 2010 13:39:18 -0700 Subject: x86, memblock: Use memblock_memory_size()/memblock_free_memory_size() to get correct dma_reserve memblock_memory_size() will return the memory size in memblock.memory.region. memblock_free_memory_size() will return the free memory size in memblock.memory.region. So we can get the exact reserved size in the specified range. Set the size right after initmem_init(), because later the bootmem API will get areas above 16M (except for some fallbacks). Later, after we remove bootmem, we could call it just before paging_init(). Signed-off-by: Yinghai Lu Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/e820.h | 2 ++ arch/x86/kernel/e820.c | 16 ++++++++++++++++ arch/x86/kernel/setup.c | 1 + arch/x86/mm/init_64.c | 7 ------- 4 files changed, 19 insertions(+), 7 deletions(-) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 718646384e03..5be1542fbfaf 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -117,6 +117,8 @@ extern unsigned long e820_end_of_low_ram_pfn(void); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); void memblock_x86_fill(void); +void memblock_find_dma_reserve(void); + extern void finish_e820_parsing(void); extern void e820_reserve_resources(void); extern void e820_reserve_resources_late(void); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d5fd89462d79..0c2b7ef7a34d 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1105,3 +1105,19 @@ void __init memblock_x86_fill(void) memblock_analyze(); memblock_dump_all(); } + +void __init memblock_find_dma_reserve(void) +{ +#ifdef CONFIG_X86_64 + u64 free_size_pfn; + u64 mem_size_pfn; + /* + * need to find out used area below MAX_DMA_PFN + * need to use memblock to get free size in [0, MAX_DMA_PFN] + * at first, and assume boot_mem will not take below MAX_DMA_PFN + */ + mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; + free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; + set_dma_reserve(mem_size_pfn - free_size_pfn); +#endif +} diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a4f01733e879..924c8f78e98e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1013,6 +1013,7 @@ void __init setup_arch(char **cmdline_p) #endif initmem_init(0, max_pfn, acpi, k8); + memblock_find_dma_reserve(); #ifndef CONFIG_NO_BOOTMEM memblock_x86_to_bootmem(0, max_low_pfn< #include -static unsigned long dma_reserve __initdata; - static int __init parse_direct_gbpages_off(char *arg) { direct_gbpages = 0; @@ -821,11 +819,6 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, reserve_bootmem(phys, len, flags); - if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { - dma_reserve += len / PAGE_SIZE; - set_dma_reserve(dma_reserve); - } - return 0; } #endif -- cgit v1.2.3 From 774ea0bcb27f57b6fd521b3b6c43237782fed4b9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Aug 2010 13:39:18 -0700 Subject: x86: Remove old bootmem code Requested by Ingo, Thomas and HPA. The old bootmem code is no longer necessary, and the transition is complete. Remove it. Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 10 +-------- arch/x86/kernel/setup.c | 4 ---- arch/x86/mm/init_32.c | 56 ------------------------------------------------- arch/x86/mm/init_64.c | 41 ------------------------------------ arch/x86/mm/memblock.c | 29 ------------------------- arch/x86/mm/numa_32.c | 3 --- arch/x86/mm/numa_64.c | 47 ----------------------------------------- 7 files changed, 1 insertion(+), 189 deletions(-) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 542bb2610cbb..ce07615f1cde 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -585,15 +585,7 @@ config PARAVIRT_DEBUG a paravirt_op is missing when it is called. config NO_BOOTMEM - default y - bool "Disable Bootmem code" - ---help--- - Use memblock directly instead of bootmem before slab is ready. 
- - allocator (buddy) [generic] - - early allocator (bootmem) [generic] - - very early allocator (memblock) [some generic] - - very very early allocator (early brk model) [x86] - So reduce one layer between early allocator to final allocator + def_bool y config MEMTEST bool "Memtest" diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 924c8f78e98e..1d114ff6a078 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1014,10 +1014,6 @@ void __init setup_arch(char **cmdline_p) initmem_init(0, max_pfn, acpi, k8); memblock_find_dma_reserve(); -#ifndef CONFIG_NO_BOOTMEM - memblock_x86_to_bootmem(0, max_low_pfn<> PAGE_SHIFT, - start_pfn, end_pfn); - printk(KERN_INFO " node %d low ram: %08lx - %08lx\n", - nodeid, start_pfn< max_low_pfn) - continue; - if (end_pfn > max_low_pfn) - end_pfn = max_low_pfn; -#else - start_pfn = 0; - end_pfn = max_low_pfn; -#endif - bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn, - bootmap); - } -#endif - after_bootmem = 1; } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d6d408467c46..690b8d13971d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -572,23 +572,7 @@ kernel_physical_mapping_init(unsigned long start, void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, int acpi, int k8) { -#ifndef CONFIG_NO_BOOTMEM - unsigned long bootmap_size, bootmap; - - bootmap_size = bootmem_bootmap_pages(end_pfn)<> PAGE_SHIFT, - 0, end_pfn); memblock_x86_register_active_regions(0, start_pfn, end_pfn); - free_bootmem_with_active_regions(0, end_pfn); -#else - memblock_x86_register_active_regions(0, start_pfn, end_pfn); -#endif } #endif @@ -798,31 +782,6 @@ void mark_rodata_ro(void) #endif -#ifndef CONFIG_NO_BOOTMEM -int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, - int flags) -{ - unsigned long pfn = phys >> PAGE_SHIFT; - - if (pfn >= max_pfn) { - /* - * This can happen with kdump kernels when accessing - * firmware tables: - */ - if (pfn < max_pfn_mapped) - return -EFAULT; - - printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n", - phys, len); - return -EFAULT; - } - - reserve_bootmem(phys, len, flags); - - return 0; -} -#endif - int kern_addr_valid(unsigned long addr) { unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c index aaff3932588e..50ecbc59757f 100644 --- a/arch/x86/mm/memblock.c +++ b/arch/x86/mm/memblock.c @@ -109,7 +109,6 @@ static __init struct range *find_range_array(int count) return range; } -#ifdef CONFIG_NO_BOOTMEM static void __init memblock_x86_subtract_reserved(struct range *range, int az) { u64 final_start, final_end; @@ -182,34 +181,6 @@ int __init get_free_all_memory_range(struct range **rangep, int nodeid) *rangep = range; return nr_range; } -#else -void __init memblock_x86_to_bootmem(u64 start, u64 end) -{ - int count; - u64 final_start, final_end; - struct memblock_region *r; - - /* Take out region array itself */ - memblock_free_reserved_regions(); - - count = memblock.reserved.cnt; - memblock_dbg("(%d early reservations) ==> bootmem [%#010llx-%#010llx]\n", count, start, end - 1); - for_each_memblock(reserved, r) { - memblock_dbg(" [%#010llx-%#010llx] ", (u64)r->base, (u64)r->base + r->size - 1); - final_start = max(start, r->base); - final_end = min(end, r->base + r->size); - if (final_start >= final_end) { - memblock_dbg("\n"); - continue; - } - memblock_dbg(" ==> [%#010llx-%#010llx]\n", final_start, final_end - 1); - reserve_bootmem_generic(final_start, 
final_end - final_start, BOOTMEM_DEFAULT); - } - - /* Put region array back ? */ - memblock_reserve_reserved_regions(); -} -#endif static u64 __init __memblock_x86_memory_in_range(u64 addr, u64 limit, bool get_free) { diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index ddf9730b2061..70ddeb75ba25 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -420,9 +420,6 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, for_each_online_node(nid) { memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); NODE_DATA(nid)->node_id = nid; -#ifndef CONFIG_NO_BOOTMEM - NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; -#endif } setup_bootmem_allocator(); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 984b1ff7db44..aef0ff74f7dd 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -199,10 +199,6 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) unsigned long start_pfn, last_pfn, nodedata_phys; const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); int nid; -#ifndef CONFIG_NO_BOOTMEM - unsigned long bootmap_start, bootmap_pages, bootmap_size; - void *bootmap; -#endif if (!end) return; @@ -239,47 +235,6 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) NODE_DATA(nodeid)->node_start_pfn = start_pfn; NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; -#ifndef CONFIG_NO_BOOTMEM - NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; - - /* - * Find a place for the bootmem map - * nodedata_phys could be on other nodes by alloc_bootmem, - * so need to sure bootmap_start not to be small, otherwise - * early_node_mem will get that with memblock_find_in_range instead - * of alloc_bootmem, that could clash with reserved range - */ - bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); - bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE); - /* - * SMP_CACHE_BYTES could be enough, but init_bootmem_node like - * to use that to align to PAGE_SIZE - */ - bootmap = early_node_mem(nodeid, bootmap_start, end, - bootmap_pages<> PAGE_SHIFT, - start_pfn, last_pfn); - - printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", - bootmap_start, bootmap_start + bootmap_size - 1, - bootmap_pages); - nid = phys_to_nid(bootmap_start); - if (nid != nodeid) - printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid); - - free_bootmem_with_active_regions(nodeid, end); -#endif - node_set_online(nodeid); } @@ -704,9 +659,7 @@ unsigned long __init numa_free_all_bootmem(void) for_each_online_node(i) pages += free_all_bootmem_node(NODE_DATA(i)); -#ifdef CONFIG_NO_BOOTMEM pages += free_all_memory_core_early(MAX_NUMNODES); -#endif return pages; } -- cgit v1.2.3 From 1c5f50ee347daea013671f718b70cd6bf497bef9 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 3 Sep 2010 17:04:07 +0800 Subject: x86, mm: fix uninitialized addr in kernel_physical_mapping_init() This re-adds the lost chunk in commit 9b861528a80. 
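To see why the lost chunk matters, here is a minimal user-space sketch of the bug pattern (all names are hypothetical stand-ins, not the kernel code): kernel_physical_mapping_init() advances start as it maps, so unless the original start is saved in addr first, the final sync_global_pgds(addr, end) call reads an uninitialized variable:

    /* Simplified model of the fix: addr must capture the original start
     * before the loop consumes it. Illustrative only. */
    #include <stdio.h>

    static void sync_global_range(unsigned long start, unsigned long end)
    {
            printf("syncing PGDs for [%#lx, %#lx)\n", start, end);
    }

    static void map_range(unsigned long start, unsigned long end)
    {
            unsigned long addr = start;   /* the re-added chunk: addr = start; */
            int pgd_changed = 0;

            for (; start < end; start += 0x1000)
                    pgd_changed = 1;      /* stand-in for pgd_populate() */

            if (pgd_changed)
                    sync_global_range(addr, end);   /* needs the saved start */
    }

    int main(void)
    {
            map_range(0x100000, 0x104000);
            return 0;
    }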
Reported-by: Stephen Rothwell Signed-off-by: Wu Fengguang Cc: Peter Zijlstra Cc: Haicheng Li Cc: Andi Kleen LKML-Reference: <20100903090407.GA19771@localhost> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 64e7bc2ded93..74f0f358b07b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -570,6 +570,7 @@ kernel_physical_mapping_init(unsigned long start, start = (unsigned long)__va(start); end = (unsigned long)__va(end); + addr = start; for (; start < end; start = next) { pgd_t *pgd = pgd_offset_k(start); -- cgit v1.2.3 From 234bb549eea16ec7d5207ba747fb8dbf489e64c1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 2 Sep 2010 13:46:34 +0100 Subject: x86, cleanups: Use clear_page/copy_page rather than memset/memcpy When operating on whole pages, use clear_page() and copy_page() in favor of memset() and memcpy(); after all that's what they are intended for. Signed-off-by: Jan Beulich LKML-Reference: <4C7FB8CA0200007800013F51@vpn.id2.novell.com> Cc: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/kernel/machine_kexec_64.c | 4 ++-- arch/x86/kvm/lapic.c | 3 +-- arch/x86/mm/init_32.c | 4 ++-- arch/x86/mm/init_64.c | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 035c8c529181..b3ea9db39db6 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -36,7 +36,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd, if (!page) goto out; pud = (pud_t *)page_address(page); - memset(pud, 0, PAGE_SIZE); + clear_page(pud); set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); } pud = pud_offset(pgd, addr); @@ -45,7 +45,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd, if (!page) goto out; pmd = (pmd_t *)page_address(page); - memset(pmd, 0, PAGE_SIZE); + clear_page(pmd); set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); } pmd = pmd_offset(pud, addr); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 77d8c0f4817d..22b06f7660f4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1056,14 +1056,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) vcpu->arch.apic = apic; - apic->regs_page = alloc_page(GFP_KERNEL); + apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO); if (apic->regs_page == NULL) { printk(KERN_ERR "malloc apic regs error for vcpu %x\n", vcpu->vcpu_id); goto nomem_free_apic; } apic->regs = page_address(apic->regs_page); - memset(apic->regs, 0, PAGE_SIZE); apic->vcpu = vcpu; hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bca79091b9d6..558f2d332076 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -67,7 +67,7 @@ static __init void *alloc_low_page(void) panic("alloc_low_page: ran out of memory"); adr = __va(pfn * PAGE_SIZE); - memset(adr, 0, PAGE_SIZE); + clear_page(adr); return adr; } @@ -558,7 +558,7 @@ char swsusp_pg_dir[PAGE_SIZE] static inline void save_pg_dir(void) { - memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); + copy_page(swsusp_pg_dir, swapper_pg_dir); } #else /* !CONFIG_ACPI_SLEEP */ static inline void save_pg_dir(void) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9a6674689a20..7c48ad4faca3 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -293,7 +293,7 @@ static __ref void 
*alloc_low_page(unsigned long *phys) panic("alloc_low_page: ran out of memory"); adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); - memset(adr, 0, PAGE_SIZE); + clear_page(adr); *phys = pfn * PAGE_SIZE; return adr; } -- cgit v1.2.3 From 44235dcde416104b8e1db7606c283f4c0149c760 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 14 Oct 2010 17:04:59 -0700 Subject: x86, mm: Fix bogus whitespace in sync_global_pgds() Whitespace cleanup only. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_64.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 74f0f358b07b..1ad7c0ff5d2b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -103,28 +103,28 @@ __setup("noexec32=", nonx32_setup); */ void sync_global_pgds(unsigned long start, unsigned long end) { - unsigned long address; - - for (address = start; address <= end; address += PGDIR_SIZE) { - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) - != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock_irqrestore(&pgd_lock, flags); - } + unsigned long address; + + for (address = start; address <= end; address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) + != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } } /* -- cgit v1.2.3 From 617d34d9e5d8326ec8f188c616aa06ac59d083fe Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 21 Sep 2010 12:01:51 -0700 Subject: x86, mm: Hold mm->page_table_lock while doing vmalloc_sync Take mm->page_table_lock while syncing the vmalloc region. This prevents a race with the Xen pagetable pin/unpin code, which expects that the page_table_lock is already held. If this race occurs, then Xen can see an inconsistent page type (a page can either be read/write or a pagetable page, and pin/unpin converts it between them), which will cause either the pin or the set_p[gm]d to fail; either will crash the kernel. vmalloc_sync_all() should be called rarely, so this extra use of page_table_lock should not interfere with its normal users. The mm pointer is stashed in the pgd page's index field, as that won't be otherwise used for pgds. Reported-by: Ian Campbell Originally-by: Jan Beulich LKML-Reference: <4CB88A4C.1080305@goop.org> Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/pgtable.h | 2 ++ arch/x86/mm/fault.c | 11 ++++++++++- arch/x86/mm/init_64.c | 7 +++++++ arch/x86/mm/pgtable.c | 20 +++++++++++++++++--- 4 files changed, 36 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm/init_64.c') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2d0a33bd2971..ada823a13c7c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -28,6 +28,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; extern spinlock_t pgd_lock; extern struct list_head pgd_list; +extern struct mm_struct *pgd_page_get_mm(struct page *page); + #ifdef CONFIG_PARAVIRT #include #else /* !CONFIG_PARAVIRT */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index caec22906d7c..6c27c39f8a37 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -229,7 +229,16 @@ void vmalloc_sync_all(void) spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), address)) + spinlock_t *pgt_lock; + int ret; + + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + + spin_lock(pgt_lock); + ret = vmalloc_sync_one(page_address(page), address); + spin_unlock(pgt_lock); + + if (!ret) break; } spin_unlock_irqrestore(&pgd_lock, flags); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1ad7c0ff5d2b..4d323fb770c2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -116,12 +116,19 @@ void sync_global_pgds(unsigned long start, unsigned long end) spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; + spinlock_t *pgt_lock; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + spin_lock(pgt_lock); + if (pgd_none(*pgd)) set_pgd(pgd, *pgd_ref); else BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + spin_unlock(pgt_lock); } spin_unlock_irqrestore(&pgd_lock, flags); } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5c4ee422590e..c70e57dbb491 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd) #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) -static void pgd_ctor(pgd_t *pgd) + +static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) +{ + BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm)); + virt_to_page(pgd)->index = (pgoff_t)mm; +} + +struct mm_struct *pgd_page_get_mm(struct page *page) +{ + return (struct mm_struct *)page->index; +} + +static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { /* If the pgd points to a shared pagetable level (either the ptes in non-PAE, or shared PMD in PAE), then just copy the @@ -105,8 +117,10 @@ static void pgd_ctor(pgd_t *pgd) } /* list required to sync kernel mapping updates */ - if (!SHARED_KERNEL_PMD) + if (!SHARED_KERNEL_PMD) { + pgd_set_mm(pgd, mm); pgd_list_add(pgd); + } } static void pgd_dtor(pgd_t *pgd) @@ -272,7 +286,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) */ spin_lock_irqsave(&pgd_lock, flags); - pgd_ctor(pgd); + pgd_ctor(mm, pgd); pgd_prepopulate_pmd(mm, pgd, pmds); spin_unlock_irqrestore(&pgd_lock, flags); -- cgit v1.2.3
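The locking scheme in this last patch can be modeled with a short user-space sketch (pthread mutexes stand in for the kernel's locks; the names are illustrative, not the kernel API): the owning mm is stashed in the pgd page's otherwise-unused index field, so the sync path can recover the owner and hold its page_table_lock across the update, which is what keeps the Xen pin/unpin code from seeing a half-converted pagetable page:

    #include <pthread.h>

    /* Minimal stand-ins for struct mm_struct and struct page. */
    struct mm {
            pthread_mutex_t page_table_lock;
    };

    struct page {
            unsigned long index;  /* unused for pgd pages, so it can carry the mm */
    };

    /* Models pgd_set_mm(): stash the owner in page->index. */
    static void page_set_mm(struct page *pg, struct mm *mm)
    {
            pg->index = (unsigned long)mm;
    }

    /* Models pgd_page_get_mm(): recover the owner. */
    static struct mm *page_get_mm(struct page *pg)
    {
            return (struct mm *)pg->index;
    }

    /* Models the sync path: take the owner's page_table_lock around
     * the entry update. */
    static void sync_one_pgd(struct page *pgd_page)
    {
            struct mm *mm = page_get_mm(pgd_page);

            pthread_mutex_lock(&mm->page_table_lock);
            /* ... set_pgd(pgd, *pgd_ref) would happen here ... */
            pthread_mutex_unlock(&mm->page_table_lock);
    }

    int main(void)
    {
            struct mm mm = { .page_table_lock = PTHREAD_MUTEX_INITIALIZER };
            struct page pgd_page;

            page_set_mm(&pgd_page, &mm);
            sync_one_pgd(&pgd_page);
            return 0;
    }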