From 1842f90cc98625d4d9bf8f8b927f17705ceb4e9c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:15 -0800 Subject: x86: Call early_res_to_bootmem one time Simplify setup_node_mem: don't use bootmem from other node, instead just find_e820_area in early_node_mem. This keeps the boundary between early_res and boot mem more clear, and lets us only call early_res_to_bootmem() one time instead of for all nodes. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-12-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_64.c | 62 +++++++++++++++++---------------------------------- 1 file changed, 20 insertions(+), 42 deletions(-) (limited to 'arch/x86/mm/numa_64.c') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 83bbc70d11bb..3232148756ce 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -164,18 +164,21 @@ static void * __init early_node_mem(int nodeid, unsigned long start, unsigned long align) { unsigned long mem = find_e820_area(start, end, size, align); - void *ptr; if (mem != -1L) return __va(mem); - ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); - if (ptr == NULL) { - printk(KERN_ERR "Cannot find %lu bytes in node %d\n", + + start = __pa(MAX_DMA_ADDRESS); + end = max_low_pfn_mapped << PAGE_SHIFT; + mem = find_e820_area(start, end, size, align); + if (mem != -1L) + return __va(mem); + + printk(KERN_ERR "Cannot find %lu bytes in node %d\n", size, nodeid); - return NULL; - } - return ptr; + + return NULL; } /* Initialize bootmem allocator for a node */ @@ -211,8 +214,12 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) if (node_data[nodeid] == NULL) return; nodedata_phys = __pa(node_data[nodeid]); + reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA"); printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, nodedata_phys + pgdat_size - 1); + nid = phys_to_nid(nodedata_phys); + if (nid != nodeid) + printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; @@ -227,11 +234,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) * of alloc_bootmem, that could clash with reserved range */ bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); - nid = phys_to_nid(nodedata_phys); - if (nid == nodeid) - bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE); - else - bootmap_start = roundup(start, PAGE_SIZE); + bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE); /* * SMP_CACHE_BYTES could be enough, but init_bootmem_node like * to use that to align to PAGE_SIZE @@ -239,18 +242,13 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) bootmap = early_node_mem(nodeid, bootmap_start, end, bootmap_pages<= end) { - /* - * only need to free it if it is from other node - * bootmem - */ - if (nid != nodeid) - free_bootmem(nodedata_phys, pgdat_size); - } + free_early(nodedata_phys, nodedata_phys + pgdat_size); node_data[nodeid] = NULL; return; } bootmap_start = __pa(bootmap); + reserve_early(bootmap_start, bootmap_start+(bootmap_pages<> PAGE_SHIFT, @@ -259,31 +257,11 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", bootmap_start, bootmap_start + bootmap_size - 1, bootmap_pages); - - free_bootmem_with_active_regions(nodeid, end); - - /* - * convert early reserve to bootmem reserve earlier - * otherwise early_node_mem could use early reserved mem - * on previous node - */ - early_res_to_bootmem(start, end); - - /* - * in some case early_node_mem could use alloc_bootmem - * to get range on other node, don't reserve that again - */ - if (nid != nodeid) - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); - else - reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, - pgdat_size, BOOTMEM_DEFAULT); nid = phys_to_nid(bootmap_start); if (nid != nodeid) printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid); - else - reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, - bootmap_pages< Date: Wed, 10 Feb 2010 01:20:18 -0800 Subject: x86: Make early_node_mem get mem > 4 GB if possible So we could put pgdata for the node high, and later sparse vmmap will get the section nr that need. With this patch will make <4 GB ram not use a sparse vmmap. before this patch, will get, before swiotlb try get bootmem [ 0.000000] nid=1 start=0 end=2080000 aligned=1 [ 0.000000] free [10 - 96] [ 0.000000] free [b12 - 1000] [ 0.000000] free [359f - 38a3] [ 0.000000] free [38b5 - 3a00] [ 0.000000] free [41e01 - 42000] [ 0.000000] free [73dde - 73e00] [ 0.000000] free [73fdd - 74000] [ 0.000000] free [741dd - 74200] [ 0.000000] free [743dd - 74400] [ 0.000000] free [745dd - 74600] [ 0.000000] free [747dd - 74800] [ 0.000000] free [749dd - 74a00] [ 0.000000] free [74bdd - 74c00] [ 0.000000] free [74ddd - 74e00] [ 0.000000] free [74fdd - 75000] [ 0.000000] free [751dd - 75200] [ 0.000000] free [753dd - 75400] [ 0.000000] free [755dd - 75600] [ 0.000000] free [757dd - 75800] [ 0.000000] free [759dd - 75a00] [ 0.000000] free [75bdd - 7bf5f] [ 0.000000] free [7f730 - 7f750] [ 0.000000] free [100000 - 2080000] [ 0.000000] total free 1f87170 [ 93.301474] Placing 64MB software IO TLB between ffff880075bdd000 - ffff880079bdd000 [ 93.311814] software IO TLB at phys 0x75bdd000 - 0x79bdd000 with this patch will get: before swiotlb try get bootmem [ 0.000000] nid=1 start=0 end=2080000 aligned=1 [ 0.000000] free [a - 96] [ 0.000000] free [702 - 1000] [ 0.000000] free [359f - 3600] [ 0.000000] free [37de - 3800] [ 0.000000] free [39dd - 3a00] [ 0.000000] free [3bdd - 3c00] [ 0.000000] free [3ddd - 3e00] [ 0.000000] free [3fdd - 4000] [ 0.000000] free [41dd - 4200] [ 0.000000] free [43dd - 4400] [ 0.000000] free [45dd - 4600] [ 0.000000] free [47dd - 4800] [ 0.000000] free [49dd - 4a00] [ 0.000000] free [4bdd - 4c00] [ 0.000000] free [4ddd - 4e00] [ 0.000000] free [4fdd - 5000] [ 0.000000] free [51dd - 5200] [ 0.000000] free [53dd - 5400] [ 0.000000] free [55dd - 7bf5f] [ 0.000000] free [7f730 - 7f750] [ 0.000000] free [100428 - 100600] [ 0.000000] free [13ea01 - 13ec00] [ 0.000000] free [170800 - 2080000] [ 0.000000] total free 1f87170 [ 92.689485] PCI-DMA: Using software bounce buffering for IO (SWIOTLB) [ 92.699799] Placing 64MB software IO TLB between ffff8800055dd000 - ffff8800095dd000 [ 92.710916] software IO TLB at phys 0x55dd000 - 0x95dd000 so will get enough space below 4G, aka pfn 0x100000 Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-15-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_64.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm/numa_64.c') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 3232148756ce..02f13cb99bc2 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -163,14 +163,27 @@ static void * __init early_node_mem(int nodeid, unsigned long start, unsigned long end, unsigned long size, unsigned long align) { - unsigned long mem = find_e820_area(start, end, size, align); + unsigned long mem; + /* + * put it on high as possible + * something will go with NODE_DATA + */ + if (start < (MAX_DMA_PFN< (MAX_DMA32_PFN< (MAX_DMA32_PFN< Date: Wed, 10 Feb 2010 01:20:20 -0800 Subject: x86: Make 64 bit use early_res instead of bootmem before slab Finally we can use early_res to replace bootmem for x86_64 now. Still can use CONFIG_NO_BOOTMEM to enable it or not. -v2: fix 32bit compiling about MAX_DMA32_PFN -v3: folded bug fix from LKML message below Signed-off-by: Yinghai Lu LKML-Reference: <4B747239.4070907@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 13 +++ arch/x86/include/asm/e820.h | 6 ++ arch/x86/kernel/e820.c | 159 +++++++++++++++++++++++++++++++++--- arch/x86/kernel/setup.c | 2 + arch/x86/mm/init_64.c | 4 + arch/x86/mm/numa_64.c | 20 +++-- include/linux/bootmem.h | 7 ++ include/linux/mm.h | 5 ++ include/linux/mmzone.h | 2 + mm/bootmem.c | 195 +++++++++++++++++++++++++++++++++++++++++++- mm/page_alloc.c | 59 +++++++++++++- mm/percpu.c | 3 + mm/sparse-vmemmap.c | 2 +- 13 files changed, 454 insertions(+), 23 deletions(-) (limited to 'arch/x86/mm/numa_64.c') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index eb4092568f9e..95439843cebc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -568,6 +568,19 @@ config PARAVIRT_DEBUG Enable to debug paravirt_ops internals. Specifically, BUG if a paravirt_op is missing when it is called. +config NO_BOOTMEM + default y + bool "Disable Bootmem code" + depends on X86_64 + ---help--- + Use early_res directly instead of bootmem before slab is ready. + - allocator (buddy) [generic] + - early allocator (bootmem) [generic] + - very early allocator (reserve_early*()) [x86] + - very very early allocator (early brk model) [x86] + So reduce one layer between early allocator to final allocator + + config MEMTEST bool "Memtest" ---help--- diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 761249e396fe..7d72e5fb7008 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -117,6 +117,12 @@ extern void free_early(u64 start, u64 end); extern void early_res_to_bootmem(u64 start, u64 end); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); +void reserve_early_without_check(u64 start, u64 end, char *name); +u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, + u64 size, u64 align); +#include +int get_free_all_memory_range(struct range **rangep, int nodeid); + extern unsigned long e820_end_of_ram_pfn(void); extern unsigned long e820_end_of_low_ram_pfn(void); extern int e820_find_active_region(const struct e820entry *ei, diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index e09c18c8f3c1..90a85295f332 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -977,6 +977,25 @@ void __init reserve_early(u64 start, u64 end, char *name) __reserve_early(start, end, name, 0); } +void __init reserve_early_without_check(u64 start, u64 end, char *name) +{ + struct early_res *r; + + if (start >= end) + return; + + __check_and_double_early_res(end); + + r = &early_res[early_res_count]; + + r->start = start; + r->end = end; + r->overlap_ok = 0; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); + early_res_count++; +} + void __init free_early(u64 start, u64 end) { struct early_res *r; @@ -991,6 +1010,94 @@ void __init free_early(u64 start, u64 end) drop_range(i); } +#ifdef CONFIG_NO_BOOTMEM +static void __init subtract_early_res(struct range *range, int az) +{ + int i, count; + u64 final_start, final_end; + int idx = 0; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + /* need to skip first one ?*/ + if (early_res != early_res_x) + idx = 1; + +#if 1 + printk(KERN_INFO "Subtract (%d early reservations)\n", count); +#endif + for (i = idx; i < count; i++) { + struct early_res *r = &early_res[i]; +#if 0 + printk(KERN_INFO " #%d [%010llx - %010llx] %15s", i, + r->start, r->end, r->name); +#endif + final_start = PFN_DOWN(r->start); + final_end = PFN_UP(r->end); + if (final_start >= final_end) { +#if 0 + printk(KERN_CONT "\n"); +#endif + continue; + } +#if 0 + printk(KERN_CONT " subtract pfn [%010llx - %010llx]\n", + final_start, final_end); +#endif + subtract_range(range, az, final_start, final_end); + } + +} + +int __init get_free_all_memory_range(struct range **rangep, int nodeid) +{ + int i, count; + u64 start = 0, end; + u64 size; + u64 mem; + struct range *range; + int nr_range; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + count *= 2; + + size = sizeof(struct range) * count; +#ifdef MAX_DMA32_PFN + if (max_pfn_mapped > MAX_DMA32_PFN) + start = MAX_DMA32_PFN << PAGE_SHIFT; +#endif + end = max_pfn_mapped << PAGE_SHIFT; + mem = find_e820_area(start, end, size, sizeof(struct range)); + if (mem == -1ULL) + panic("can not find more space for range free"); + + range = __va(mem); + /* use early_node_map[] and early_res to get range array at first */ + memset(range, 0, size); + nr_range = 0; + + /* need to go over early_node_map to find out good range for node */ + nr_range = add_from_early_node_map(range, count, nr_range, nodeid); + subtract_early_res(range, count); + nr_range = clean_sort_range(range, count); + + /* need to clear it ? */ + if (nodeid == MAX_NUMNODES) { + memset(&early_res[0], 0, + sizeof(struct early_res) * max_early_res); + early_res = NULL; + max_early_res = 0; + } + + *rangep = range; + return nr_range; +} +#else void __init early_res_to_bootmem(u64 start, u64 end) { int i, count; @@ -1028,6 +1135,7 @@ void __init early_res_to_bootmem(u64 start, u64 end) max_early_res = 0; early_res_count = 0; } +#endif /* Check for already reserved areas */ static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) @@ -1081,6 +1189,35 @@ again: return changed; } +/* + * Find a free area with specified alignment in a specific range. + * only with the area.between start to end is active range from early_node_map + * so they are good as RAM + */ +u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, + u64 size, u64 align) +{ + u64 addr, last; + + addr = round_up(ei_start, align); + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + goto out; + while (bad_addr(&addr, size, align) && addr+size <= ei_last) + ; + last = addr + size; + if (last > ei_last) + goto out; + if (last > end) + goto out; + + return addr; + +out: + return -1ULL; +} + /* * Find a free area with specified alignment in a specific range. */ @@ -1090,24 +1227,20 @@ u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; - u64 addr, last; - u64 ei_last; + u64 addr; + u64 ei_start, ei_last; if (ei->type != E820_RAM) continue; - addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - continue; - while (bad_addr(&addr, size, align) && addr+size <= ei_last) - ; - last = addr + size; - if (last > ei_last) - continue; - if (last > end) + ei_start = ei->addr; + addr = find_early_area(ei_start, ei_last, start, end, + size, align); + + if (addr == -1ULL) continue; + return addr; } return -1ULL; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ea4141b48518..d49e168bda8c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -967,7 +967,9 @@ void __init setup_arch(char **cmdline_p) #endif initmem_init(0, max_pfn, acpi, k8); +#ifndef CONFIG_NO_BOOTMEM early_res_to_bootmem(0, max_low_pfn<> PAGE_SHIFT; @@ -235,10 +237,13 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); - NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; + NODE_DATA(nodeid)->node_id = nodeid; NODE_DATA(nodeid)->node_start_pfn = start_pfn; NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; +#ifndef CONFIG_NO_BOOTMEM + NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; + /* * Find a place for the bootmem map * nodedata_phys could be on other nodes by alloc_bootmem, @@ -275,6 +280,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid); free_bootmem_with_active_regions(nodeid, end); +#endif node_set_online(nodeid); } @@ -733,6 +739,10 @@ unsigned long __init numa_free_all_bootmem(void) for_each_online_node(i) pages += free_all_bootmem_node(NODE_DATA(i)); +#ifdef CONFIG_NO_BOOTMEM + pages += free_all_memory_core_early(MAX_NUMNODES); +#endif + return pages; } diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index b10ec49ee2dd..266ab9291232 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -23,6 +23,7 @@ extern unsigned long max_pfn; extern unsigned long saved_max_pfn; #endif +#ifndef CONFIG_NO_BOOTMEM /* * node_bootmem_map is a map pointer - the bits represent all physical * memory pages (including holes) on the node. @@ -37,6 +38,7 @@ typedef struct bootmem_data { } bootmem_data_t; extern bootmem_data_t bootmem_node_data[]; +#endif extern unsigned long bootmem_bootmap_pages(unsigned long); @@ -46,6 +48,7 @@ extern unsigned long init_bootmem_node(pg_data_t *pgdat, unsigned long endpfn); extern unsigned long init_bootmem(unsigned long addr, unsigned long memend); +unsigned long free_all_memory_core_early(int nodeid); extern unsigned long free_all_bootmem_node(pg_data_t *pgdat); extern unsigned long free_all_bootmem(void); @@ -84,6 +87,10 @@ extern void *__alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); +void *__alloc_bootmem_node_high(pg_data_t *pgdat, + unsigned long size, + unsigned long align, + unsigned long goal); extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, diff --git a/include/linux/mm.h b/include/linux/mm.h index 8b2fa8593c61..f2c5b3cee8a1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -12,6 +12,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -1049,6 +1050,10 @@ extern void get_pfn_range_for_nid(unsigned int nid, extern unsigned long find_min_pfn_with_active_regions(void); extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); +int add_from_early_node_map(struct range *range, int az, + int nr_range, int nid); +void *__alloc_memory_core_early(int nodeid, u64 size, u64 align, + u64 goal, u64 limit); typedef int (*work_fn_t)(unsigned long, unsigned long, void *); extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 30fe668c2542..eae8387b6007 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -620,7 +620,9 @@ typedef struct pglist_data { struct page_cgroup *node_page_cgroup; #endif #endif +#ifndef CONFIG_NO_BOOTMEM struct bootmem_data *bdata; +#endif #ifdef CONFIG_MEMORY_HOTPLUG /* * Must be held any time you expect node_start_pfn, node_present_pages diff --git a/mm/bootmem.c b/mm/bootmem.c index 7d1486875e1c..d7c791ef0036 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -32,6 +33,7 @@ unsigned long max_pfn; unsigned long saved_max_pfn; #endif +#ifndef CONFIG_NO_BOOTMEM bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); @@ -142,7 +144,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) min_low_pfn = start; return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); } - +#endif /* * free_bootmem_late - free bootmem pages directly to page allocator * @addr: starting address of the range @@ -167,6 +169,60 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) } } +#ifdef CONFIG_NO_BOOTMEM +static void __init __free_pages_memory(unsigned long start, unsigned long end) +{ + int i; + unsigned long start_aligned, end_aligned; + int order = ilog2(BITS_PER_LONG); + + start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); + end_aligned = end & ~(BITS_PER_LONG - 1); + + if (end_aligned <= start_aligned) { +#if 1 + printk(KERN_DEBUG " %lx - %lx\n", start, end); +#endif + for (i = start; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + return; + } + +#if 1 + printk(KERN_DEBUG " %lx %lx - %lx %lx\n", + start, start_aligned, end_aligned, end); +#endif + for (i = start; i < start_aligned; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) + __free_pages_bootmem(pfn_to_page(i), order); + + for (i = end_aligned; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); +} + +unsigned long __init free_all_memory_core_early(int nodeid) +{ + int i; + u64 start, end; + unsigned long count = 0; + struct range *range = NULL; + int nr_range; + + nr_range = get_free_all_memory_range(&range, nodeid); + + for (i = 0; i < nr_range; i++) { + start = range[i].start; + end = range[i].end; + count += end - start; + __free_pages_memory(start, end); + } + + return count; +} +#else static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { int aligned; @@ -227,6 +283,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) return count; } +#endif /** * free_all_bootmem_node - release a node's free pages to the buddy allocator @@ -237,7 +294,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); +#ifdef CONFIG_NO_BOOTMEM + /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ + return 0; +#else return free_all_bootmem_core(pgdat->bdata); +#endif } /** @@ -247,9 +309,14 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) */ unsigned long __init free_all_bootmem(void) { +#ifdef CONFIG_NO_BOOTMEM + return free_all_memory_core_early(NODE_DATA(0)->node_id); +#else return free_all_bootmem_core(NODE_DATA(0)->bdata); +#endif } +#ifndef CONFIG_NO_BOOTMEM static void __init __free(bootmem_data_t *bdata, unsigned long sidx, unsigned long eidx) { @@ -344,6 +411,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, } BUG(); } +#endif /** * free_bootmem_node - mark a page range as usable @@ -358,6 +426,12 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { +#ifdef CONFIG_NO_BOOTMEM + free_early(physaddr, physaddr + size); +#if 0 + printk(KERN_DEBUG "free %lx %lx\n", physaddr, size); +#endif +#else unsigned long start, end; kmemleak_free_part(__va(physaddr), size); @@ -366,6 +440,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, end = PFN_DOWN(physaddr + size); mark_bootmem_node(pgdat->bdata, start, end, 0, 0); +#endif } /** @@ -379,6 +454,12 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, */ void __init free_bootmem(unsigned long addr, unsigned long size) { +#ifdef CONFIG_NO_BOOTMEM + free_early(addr, addr + size); +#if 0 + printk(KERN_DEBUG "free %lx %lx\n", addr, size); +#endif +#else unsigned long start, end; kmemleak_free_part(__va(addr), size); @@ -387,6 +468,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size) end = PFN_DOWN(addr + size); mark_bootmem(start, end, 0, 0); +#endif } /** @@ -403,12 +485,17 @@ void __init free_bootmem(unsigned long addr, unsigned long size) int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) { +#ifdef CONFIG_NO_BOOTMEM + panic("no bootmem"); + return 0; +#else unsigned long start, end; start = PFN_DOWN(physaddr); end = PFN_UP(physaddr + size); return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); +#endif } /** @@ -424,14 +511,20 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, int __init reserve_bootmem(unsigned long addr, unsigned long size, int flags) { +#ifdef CONFIG_NO_BOOTMEM + panic("no bootmem"); + return 0; +#else unsigned long start, end; start = PFN_DOWN(addr); end = PFN_UP(addr + size); return mark_bootmem(start, end, 1, flags); +#endif } +#ifndef CONFIG_NO_BOOTMEM static unsigned long __init align_idx(struct bootmem_data *bdata, unsigned long idx, unsigned long step) { @@ -582,12 +675,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, #endif return NULL; } +#endif static void * __init ___alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { +#ifdef CONFIG_NO_BOOTMEM + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); + +restart: + + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); + + if (ptr) + return ptr; + + if (goal != 0) { + goal = 0; + goto restart; + } + + return NULL; +#else bootmem_data_t *bdata; void *region; @@ -613,6 +727,7 @@ restart: } return NULL; +#endif } /** @@ -631,7 +746,13 @@ restart: void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) { - return ___alloc_bootmem_nopanic(size, align, goal, 0); + unsigned long limit = 0; + +#ifdef CONFIG_NO_BOOTMEM + limit = -1UL; +#endif + + return ___alloc_bootmem_nopanic(size, align, goal, limit); } static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, @@ -665,9 +786,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) { - return ___alloc_bootmem(size, align, goal, 0); + unsigned long limit = 0; + +#ifdef CONFIG_NO_BOOTMEM + limit = -1UL; +#endif + + return ___alloc_bootmem(size, align, goal, limit); } +#ifndef CONFIG_NO_BOOTMEM static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) @@ -684,6 +812,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, return ___alloc_bootmem(size, align, goal, limit); } +#endif /** * __alloc_bootmem_node - allocate boot memory from a specific node @@ -706,7 +835,46 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); +#ifdef CONFIG_NO_BOOTMEM + return __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); +#else return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); +#endif +} + +void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ +#ifdef MAX_DMA32_PFN + unsigned long end_pfn; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + /* update goal according ...MAX_DMA32_PFN */ + end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; + + if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && + (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { + void *ptr; + unsigned long new_goal; + + new_goal = MAX_DMA32_PFN << PAGE_SHIFT; +#ifdef CONFIG_NO_BOOTMEM + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + new_goal, -1ULL); +#else + ptr = alloc_bootmem_core(pgdat->bdata, size, align, + new_goal, 0); +#endif + if (ptr) + return ptr; + } +#endif + + return __alloc_bootmem_node(pgdat, size, align, goal); + } #ifdef CONFIG_SPARSEMEM @@ -720,6 +888,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, void * __init alloc_bootmem_section(unsigned long size, unsigned long section_nr) { +#ifdef CONFIG_NO_BOOTMEM + unsigned long pfn, goal, limit; + + pfn = section_nr_to_pfn(section_nr); + goal = pfn << PAGE_SHIFT; + limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; + + return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, + SMP_CACHE_BYTES, goal, limit); +#else bootmem_data_t *bdata; unsigned long pfn, goal, limit; @@ -729,6 +907,7 @@ void * __init alloc_bootmem_section(unsigned long size, bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); +#endif } #endif @@ -740,11 +919,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); +#ifdef CONFIG_NO_BOOTMEM + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); +#else ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); +#endif if (ptr) return ptr; @@ -795,6 +979,11 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); +#ifdef CONFIG_NO_BOOTMEM + return __alloc_memory_core_early(pgdat->node_id, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); +#else return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, ARCH_LOW_ADDRESS_LIMIT); +#endif } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8deb9d0fd5b1..78821a28e394 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3435,6 +3435,59 @@ void __init free_bootmem_with_active_regions(int nid, } } +int __init add_from_early_node_map(struct range *range, int az, + int nr_range, int nid) +{ + int i; + u64 start, end; + + /* need to go over early_node_map to find out good range for node */ + for_each_active_range_index_in_nid(i, nid) { + start = early_node_map[i].start_pfn; + end = early_node_map[i].end_pfn; + nr_range = add_range(range, az, nr_range, start, end); + } + return nr_range; +} + +void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, + u64 goal, u64 limit) +{ + int i; + void *ptr; + + /* need to go over early_node_map to find out good range for node */ + for_each_active_range_index_in_nid(i, nid) { + u64 addr; + u64 ei_start, ei_last; + + ei_last = early_node_map[i].end_pfn; + ei_last <<= PAGE_SHIFT; + ei_start = early_node_map[i].start_pfn; + ei_start <<= PAGE_SHIFT; + addr = find_early_area(ei_start, ei_last, + goal, limit, size, align); + + if (addr == -1ULL) + continue; + +#if 0 + printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n", + nid, + ei_start, ei_last, goal, limit, size, + align, addr); +#endif + + ptr = phys_to_virt(addr); + memset(ptr, 0, size); + reserve_early_without_check(addr, addr + size, "BOOTMEM"); + return ptr; + } + + return NULL; +} + + void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) { int i; @@ -4467,7 +4520,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) } #ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; +struct pglist_data __refdata contig_page_data = { +#ifndef CONFIG_NO_BOOTMEM + .bdata = &bootmem_node_data[0] +#endif + }; EXPORT_SYMBOL(contig_page_data); #endif diff --git a/mm/percpu.c b/mm/percpu.c index 083e7c91e5f6..841defeeef86 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1929,7 +1929,10 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, } /* copy and return the unused part */ memcpy(ptr, __per_cpu_load, ai->static_size); +#ifndef CONFIG_NO_BOOTMEM + /* fix partial free ! */ free_fn(ptr + size_sum, ai->unit_size - size_sum); +#endif } } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index d9714bdcb4a3..9506c39942f6 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -40,7 +40,7 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return __alloc_bootmem_node(NODE_DATA(node), size, align, goal); + return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); } -- cgit v1.2.3 From 68fd111e02b979876359c7b471a8bcbca0628b75 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 15 Feb 2010 13:43:25 -0800 Subject: x86, numa: Fix numa emulation calculation of big nodes numa=fake=N uses split_nodes_interleave() to partition the system into N fake nodes. Each node size must have be a multiple of FAKE_NODE_MIN_SIZE, otherwise it is possible to get strange alignments. Because of this, the remaining memory from each node when rounded to FAKE_NODE_MIN_SIZE is consolidated into a number of "big nodes" that are bigger than the rest. The calculation of the number of big nodes is incorrect since it is using a logical AND operator when it should be multiplying the rounded-off portion of each node with N. Signed-off-by: David Rientjes LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm/numa_64.c') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 83bbc70d11bb..2ecbe0ca0dfc 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -427,7 +427,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, * Calculate the number of big nodes that can be allocated as a result * of consolidating the remainder. */ - big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) / + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / FAKE_NODE_MIN_SIZE; size &= FAKE_NODE_MIN_HASH_MASK; -- cgit v1.2.3 From 8df5bb34defd685fe86f60746bbf3d47d1c6f033 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 15 Feb 2010 13:43:30 -0800 Subject: x86, numa: Add fixed node size option for numa emulation numa=fake=N specifies the number of fake nodes, N, to partition the system into and then allocates them by interleaving over physical nodes. This requires knowledge of the system capacity when attempting to allocate nodes of a certain size: either very large nodes to benchmark scalability of code that operates on individual nodes, or very small nodes to find bugs in the VM. This patch introduces numa=fake=[MG] so it is possible to specify the size of each node to allocate. When used, nodes of the size specified will be allocated and interleaved over the set of physical nodes. FAKE_NODE_MIN_SIZE was also moved to the more-appropriate include/asm/numa_64.h. Signed-off-by: David Rientjes LKML-Reference: Signed-off-by: H. Peter Anvin --- Documentation/x86/x86_64/boot-options.txt | 4 + arch/x86/include/asm/mmzone_64.h | 6 -- arch/x86/include/asm/numa_64.h | 5 ++ arch/x86/mm/numa_64.c | 117 ++++++++++++++++++++++++++++-- 4 files changed, 118 insertions(+), 14 deletions(-) (limited to 'arch/x86/mm/numa_64.c') diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 29a6ff8bc7d3..01150c64aa73 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -166,6 +166,10 @@ NUMA numa=noacpi Don't parse the SRAT table for NUMA setup + numa=fake=[MG] + If given as a memory unit, fills all system RAM with nodes of + size interleaved over physical nodes. + numa=fake=CMDLINE If a number, fakes CMDLINE nodes and ignores NUMA setup of the actual machine. Otherwise, system memory is configured diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h index a29f48c2a322..288b96f815a6 100644 --- a/arch/x86/include/asm/mmzone_64.h +++ b/arch/x86/include/asm/mmzone_64.h @@ -39,11 +39,5 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ NODE_DATA(nid)->node_spanned_pages) - -#ifdef CONFIG_NUMA_EMU -#define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024) -#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -#endif - #endif #endif /* _ASM_X86_MMZONE_64_H */ diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index c4ae822e415f..823e070e7c26 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -36,6 +36,11 @@ extern void __cpuinit numa_set_node(int cpu, int node); extern void __cpuinit numa_clear_node(int cpu); extern void __cpuinit numa_add_cpu(int cpu); extern void __cpuinit numa_remove_cpu(int cpu); + +#ifdef CONFIG_NUMA_EMU +#define FAKE_NODE_MIN_SIZE ((u64)64 << 20) +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) +#endif /* CONFIG_NUMA_EMU */ #else static inline void init_cpu_to_node(void) { } static inline void numa_set_node(int cpu, int node) { } diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 2ecbe0ca0dfc..c47c78ba3aca 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -501,6 +501,102 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, return ret; } +/* + * Returns the end address of a node so that there is at least `size' amount of + * non-reserved memory or `max_addr' is reached. + */ +static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) +{ + u64 end = start + size; + + while (end - start - e820_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > max_addr) { + end = max_addr; + break; + } + } + return end; +} + +/* + * Sets up fake nodes of `size' interleaved over physical nodes ranging from + * `addr' to `max_addr'. The return value is the number of nodes allocated. + */ +static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) +{ + nodemask_t physnode_mask = NODE_MASK_NONE; + u64 min_size; + int ret = 0; + int i; + + if (!size) + return -1; + /* + * The limit on emulated nodes is MAX_NUMNODES, so the size per node is + * increased accordingly if the requested size is too small. This + * creates a uniform distribution of node sizes across the entire + * machine (but not necessarily over physical nodes). + */ + min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) / + MAX_NUMNODES; + min_size = max(min_size, FAKE_NODE_MIN_SIZE); + if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) + min_size = (min_size + FAKE_NODE_MIN_SIZE) & + FAKE_NODE_MIN_HASH_MASK; + if (size < min_size) { + pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", + size >> 20, min_size >> 20); + size = min_size; + } + size &= FAKE_NODE_MIN_HASH_MASK; + + for (i = 0; i < MAX_NUMNODES; i++) + if (physnodes[i].start != physnodes[i].end) + node_set(i, physnode_mask); + /* + * Fill physical nodes with fake nodes of size until there is no memory + * left on any of them. + */ + while (nodes_weight(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; + u64 end; + + end = find_end_of_node(physnodes[i].start, + physnodes[i].end, size); + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (physnodes[i].end - end - + e820_hole_size(end, physnodes[i].end) < size) + end = physnodes[i].end; + + /* + * Setup the fake node that will be allocated as bootmem + * later. If setup_node_range() returns non-zero, there + * is no more memory available on this physical node. + */ + if (setup_node_range(ret++, &physnodes[i].start, + end - physnodes[i].start, + physnodes[i].end) < 0) + node_clear(i, physnode_mask); + } + } + return ret; +} + /* * Splits num_nodes nodes up equally starting at node_start. The return value * is the number of nodes split up and addr is adjusted to be at the end of the @@ -546,14 +642,7 @@ static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, if (i == num_nodes + node_start - 1) end = max_addr; else - while (end - *addr - e820_hole_size(*addr, end) < - size) { - end += FAKE_NODE_MIN_SIZE; - if (end > max_addr) { - end = max_addr; - break; - } - } + end = find_end_of_node(*addr, max_addr, size); if (setup_node_range(i, addr, end - *addr, max_addr) < 0) break; } @@ -588,6 +677,18 @@ static int __init numa_emulation(unsigned long start_pfn, int num_phys_nodes; num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); + /* + * If the numa=fake command-line contains a 'M' or 'G', it represents + * the fixed node size. + */ + if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) { + size = memparse(cmdline, &cmdline); + num_nodes = split_nodes_size_interleave(addr, max_addr, size); + if (num_nodes < 0) + return num_nodes; + goto out; + } + /* * If the numa=fake command-line is just a single number N, split the * system RAM into N fake nodes. -- cgit v1.2.3 From ca2107c9d6cf44fb915402d6f12b9d9ff3925cd7 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 15 Feb 2010 13:43:33 -0800 Subject: x86, numa: Remove configurable node size support for numa emulation Now that numa=fake=[MG] is implemented, it is possible to remove configurable node size support. The command-line parsing was already broken (numa=fake=*128, for example, would not work) and since fake nodes are now interleaved over physical nodes, this support is no longer required. Signed-off-by: David Rientjes LKML-Reference: Signed-off-by: H. Peter Anvin --- Documentation/x86/x86_64/boot-options.txt | 16 +-- arch/x86/mm/numa_64.c | 160 +++--------------------------- 2 files changed, 16 insertions(+), 160 deletions(-) (limited to 'arch/x86/mm/numa_64.c') diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 01150c64aa73..7fbbaf85f5b7 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -170,19 +170,9 @@ NUMA If given as a memory unit, fills all system RAM with nodes of size interleaved over physical nodes. - numa=fake=CMDLINE - If a number, fakes CMDLINE nodes and ignores NUMA setup of the - actual machine. Otherwise, system memory is configured - depending on the sizes and coefficients listed. For example: - numa=fake=2*512,1024,4*256,*128 - gives two 512M nodes, a 1024M node, four 256M nodes, and the - rest split into 128M chunks. If the last character of CMDLINE - is a *, the remaining memory is divided up equally among its - coefficient: - numa=fake=2*512,2* - gives two 512M nodes and the rest split into two nodes. - Otherwise, the remaining system RAM is allocated to an - additional node. + numa=fake= + If given as an integer, fills all system RAM with N fake nodes + interleaved over physical nodes. ACPI diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index c47c78ba3aca..3307ea8bd43a 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -597,73 +597,6 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) return ret; } -/* - * Splits num_nodes nodes up equally starting at node_start. The return value - * is the number of nodes split up and addr is adjusted to be at the end of the - * last node allocated. - */ -static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, - int num_nodes) -{ - unsigned int big; - u64 size; - int i; - - if (num_nodes <= 0) - return -1; - if (num_nodes > MAX_NUMNODES) - num_nodes = MAX_NUMNODES; - size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) / - num_nodes; - /* - * Calculate the number of big nodes that can be allocated as a result - * of consolidating the leftovers. - */ - big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) / - FAKE_NODE_MIN_SIZE; - - /* Round down to nearest FAKE_NODE_MIN_SIZE. */ - size &= FAKE_NODE_MIN_HASH_MASK; - if (!size) { - printk(KERN_ERR "Not enough memory for each node. " - "NUMA emulation disabled.\n"); - return -1; - } - - for (i = node_start; i < num_nodes + node_start; i++) { - u64 end = *addr + size; - - if (i < big) - end += FAKE_NODE_MIN_SIZE; - /* - * The final node can have the remaining system RAM. Other - * nodes receive roughly the same amount of available pages. - */ - if (i == num_nodes + node_start - 1) - end = max_addr; - else - end = find_end_of_node(*addr, max_addr, size); - if (setup_node_range(i, addr, end - *addr, max_addr) < 0) - break; - } - return i - node_start + 1; -} - -/* - * Splits the remaining system RAM into chunks of size. The remaining memory is - * always assigned to a final node and can be asymmetric. Returns the number of - * nodes split. - */ -static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, - u64 size) -{ - int i = node_start; - size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; - while (!setup_node_range(i++, addr, size, max_addr)) - ; - return i - node_start; -} - /* * Sets up the system RAM area from start_pfn to last_pfn according to the * numa=fake command-line option. @@ -671,99 +604,32 @@ static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn, int acpi, int k8) { - u64 size, addr = start_pfn << PAGE_SHIFT; + u64 addr = start_pfn << PAGE_SHIFT; u64 max_addr = last_pfn << PAGE_SHIFT; - int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; int num_phys_nodes; + int num_nodes; + int i; num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); /* * If the numa=fake command-line contains a 'M' or 'G', it represents - * the fixed node size. + * the fixed node size. Otherwise, if it is just a single number N, + * split the system RAM into N fake nodes. */ if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) { + u64 size; + size = memparse(cmdline, &cmdline); num_nodes = split_nodes_size_interleave(addr, max_addr, size); - if (num_nodes < 0) - return num_nodes; - goto out; - } + } else { + unsigned long n; - /* - * If the numa=fake command-line is just a single number N, split the - * system RAM into N fake nodes. - */ - if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { - long n = simple_strtol(cmdline, NULL, 0); - - num_nodes = split_nodes_interleave(addr, max_addr, - num_phys_nodes, n); - if (num_nodes < 0) - return num_nodes; - goto out; + n = simple_strtoul(cmdline, NULL, 0); + num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n); } - /* Parse the command line. */ - for (coeff_flag = 0; ; cmdline++) { - if (*cmdline && isdigit(*cmdline)) { - num = num * 10 + *cmdline - '0'; - continue; - } - if (*cmdline == '*') { - if (num > 0) - coeff = num; - coeff_flag = 1; - } - if (!*cmdline || *cmdline == ',') { - if (!coeff_flag) - coeff = 1; - /* - * Round down to the nearest FAKE_NODE_MIN_SIZE. - * Command-line coefficients are in megabytes. - */ - size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; - if (size) - for (i = 0; i < coeff; i++, num_nodes++) - if (setup_node_range(num_nodes, &addr, - size, max_addr) < 0) - goto done; - if (!*cmdline) - break; - coeff_flag = 0; - coeff = -1; - } - num = 0; - } -done: - if (!num_nodes) - return -1; - /* Fill remainder of system RAM, if appropriate. */ - if (addr < max_addr) { - if (coeff_flag && coeff < 0) { - /* Split remaining nodes into num-sized chunks */ - num_nodes += split_nodes_by_size(&addr, max_addr, - num_nodes, num); - goto out; - } - switch (*(cmdline - 1)) { - case '*': - /* Split remaining nodes into coeff chunks */ - if (coeff <= 0) - break; - num_nodes += split_nodes_equally(&addr, max_addr, - num_nodes, coeff); - break; - case ',': - /* Do not allocate remaining system RAM */ - break; - default: - /* Give one final node */ - setup_node_range(num_nodes, &addr, max_addr - addr, - max_addr); - num_nodes++; - } - } -out: + if (num_nodes < 0) + return num_nodes; memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); if (memnode_shift < 0) { memnode_shift = 0; -- cgit v1.2.3