diff options
author | Yinghai Lu <yinghai@kernel.org> | 2013-03-01 23:51:27 +0100 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-03-02 18:34:39 +0100 |
commit | 20e6926dcbafa1b361f1c29d967688be14b6ca4b (patch) | |
tree | c5ea7011124c5c1a476c43484a6072702c178edc /arch | |
parent | Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/vir... (diff) | |
download | linux-20e6926dcbafa1b361f1c29d967688be14b6ca4b.tar.xz linux-20e6926dcbafa1b361f1c29d967688be14b6ca4b.zip |
x86, ACPI, mm: Revert movablemem_map support
Tim found:
WARNING: at arch/x86/kernel/smpboot.c:324 topology_sane.isra.2+0x6f/0x80()
Hardware name: S2600CP
sched: CPU #1's llc-sibling CPU #0 is not on the same node! [node: 1 != 0]. Ignoring dependency.
smpboot: Booting Node 1, Processors #1
Modules linked in:
Pid: 0, comm: swapper/1 Not tainted 3.9.0-0-generic #1
Call Trace:
set_cpu_sibling_map+0x279/0x449
start_secondary+0x11d/0x1e5
Don Morris reproduced on a HP z620 workstation, and bisected it to
commit e8d195525809 ("acpi, memory-hotplug: parse SRAT before memblock
is ready")
It turns out movable_map has some problems, and it breaks several things
1. numa_init is called several times, NOT just for srat. so those
nodes_clear(numa_nodes_parsed)
memset(&numa_meminfo, 0, sizeof(numa_meminfo))
can not be just removed. Need to consider sequence is: numaq, srat, amd, dummy.
and make fall back path working.
2. simply split acpi_numa_init to early_parse_srat.
a. that early_parse_srat is NOT called for ia64, so you break ia64.
b. for (i = 0; i < MAX_LOCAL_APIC; i++)
set_apicid_to_node(i, NUMA_NO_NODE)
still left in numa_init. So it will just clear result from early_parse_srat.
it should be moved before that....
c. it breaks ACPI_TABLE_OVERIDE...as the acpi table scan is moved
early before override from INITRD is settled.
3. that patch TITLE is total misleading, there is NO x86 in the title,
but it changes critical x86 code. It caused x86 guys did not
pay attention to find the problem early. Those patches really should
be routed via tip/x86/mm.
4. after that commit, following range can not use movable ram:
a. real_mode code.... well..funny, legacy Node0 [0,1M) could be hot-removed?
b. initrd... it will be freed after booting, so it could be on movable...
c. crashkernel for kdump...: looks like we can not put kdump kernel above 4G
anymore.
d. init_mem_mapping: can not put page table high anymore.
e. initmem_init: vmemmap can not be high local node anymore. That is
not good.
If node is hotplugable, the mem related range like page table and
vmemmap could be on the that node without problem and should be on that
node.
We have workaround patch that could fix some problems, but some can not
be fixed.
So just remove that offending commit and related ones including:
f7210e6c4ac7 ("mm/memblock.c: use CONFIG_HAVE_MEMBLOCK_NODE_MAP to
protect movablecore_map in memblock_overlaps_region().")
01a178a94e8e ("acpi, memory-hotplug: support getting hotplug info from
SRAT")
27168d38fa20 ("acpi, memory-hotplug: extend movablemem_map ranges to
the end of node")
e8d195525809 ("acpi, memory-hotplug: parse SRAT before memblock is
ready")
fb06bc8e5f42 ("page_alloc: bootmem limit with movablecore_map")
42f47e27e761 ("page_alloc: make movablemem_map have higher priority")
6981ec31146c ("page_alloc: introduce zone_movable_limit[] to keep
movable limit for nodes")
34b71f1e04fc ("page_alloc: add movable_memmap kernel parameter")
4d59a75125d5 ("x86: get pg_data_t's memory from other node")
Later we should have patches that will make sure kernel put page table
and vmemmap on local node ram instead of push them down to node0. Also
need to find way to put other kernel used ram to local node ram.
Reported-by: Tim Gardner <tim.gardner@canonical.com>
Reported-by: Don Morris <don.morris@hp.com>
Bisected-by: Don Morris <don.morris@hp.com>
Tested-by: Don Morris <don.morris@hp.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch')
-rw-r--r-- | arch/x86/kernel/setup.c | 13 | ||||
-rw-r--r-- | arch/x86/mm/numa.c | 11 | ||||
-rw-r--r-- | arch/x86/mm/srat.c | 125 |
3 files changed, 12 insertions, 137 deletions
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e89acdf6b77b..84d32855f65c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1056,15 +1056,6 @@ void __init setup_arch(char **cmdline_p) setup_bios_corruption_check(); #endif - /* - * In the memory hotplug case, the kernel needs info from SRAT to - * determine which memory is hotpluggable before allocating memory - * using memblock. - */ - acpi_boot_table_init(); - early_acpi_boot_init(); - early_parse_srat(); - #ifdef CONFIG_X86_32 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", (max_pfn_mapped<<PAGE_SHIFT) - 1); @@ -1110,6 +1101,10 @@ void __init setup_arch(char **cmdline_p) /* * Parse the ACPI tables for possible boot-time SMP configuration. */ + acpi_boot_table_init(); + + early_acpi_boot_init(); + initmem_init(); memblock_find_dma_reserve(); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index ff3633c794c6..72fe01e9e414 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -212,9 +212,10 @@ static void __init setup_node_data(int nid, u64 start, u64 end) * Allocate node data. Try node-local memory and then any node. * Never allocate in DMA zone. */ - nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); + nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); if (!nd_pa) { - pr_err("Cannot find %zu bytes in any node\n", nd_size); + pr_err("Cannot find %zu bytes in node %d\n", + nd_size, nid); return; } nd = __va(nd_pa); @@ -559,12 +560,10 @@ static int __init numa_init(int (*init_func)(void)) for (i = 0; i < MAX_LOCAL_APIC; i++) set_apicid_to_node(i, NUMA_NO_NODE); - /* - * Do not clear numa_nodes_parsed or zero numa_meminfo here, because - * SRAT was parsed earlier in early_parse_srat(). - */ + nodes_clear(numa_nodes_parsed); nodes_clear(node_possible_map); nodes_clear(node_online_map); + memset(&numa_meminfo, 0, sizeof(numa_meminfo)); WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); numa_reset_distance(); diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 79836d01f789..cdd0da9dd530 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -141,126 +141,11 @@ static inline int save_add_info(void) {return 1;} static inline int save_add_info(void) {return 0;} #endif -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -static void __init -handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable) -{ - int overlap, i; - unsigned long start_pfn, end_pfn; - - start_pfn = PFN_DOWN(start); - end_pfn = PFN_UP(end); - - /* - * For movablemem_map=acpi: - * - * SRAT: |_____| |_____| |_________| |_________| ...... - * node id: 0 1 1 2 - * hotpluggable: n y y n - * movablemem_map: |_____| |_________| - * - * Using movablemem_map, we can prevent memblock from allocating memory - * on ZONE_MOVABLE at boot time. - * - * Before parsing SRAT, memblock has already reserve some memory ranges - * for other purposes, such as for kernel image. We cannot prevent - * kernel from using these memory, so we need to exclude these memory - * even if it is hotpluggable. - * Furthermore, to ensure the kernel has enough memory to boot, we make - * all the memory on the node which the kernel resides in - * un-hotpluggable. - */ - if (hotpluggable && movablemem_map.acpi) { - /* Exclude ranges reserved by memblock. */ - struct memblock_type *rgn = &memblock.reserved; - - for (i = 0; i < rgn->cnt; i++) { - if (end <= rgn->regions[i].base || - start >= rgn->regions[i].base + - rgn->regions[i].size) - continue; - - /* - * If the memory range overlaps the memory reserved by - * memblock, then the kernel resides in this node. - */ - node_set(node, movablemem_map.numa_nodes_kernel); - - goto out; - } - - /* - * If the kernel resides in this node, then the whole node - * should not be hotpluggable. - */ - if (node_isset(node, movablemem_map.numa_nodes_kernel)) - goto out; - - insert_movablemem_map(start_pfn, end_pfn); - - /* - * numa_nodes_hotplug nodemask represents which nodes are put - * into movablemem_map.map[]. - */ - node_set(node, movablemem_map.numa_nodes_hotplug); - goto out; - } - - /* - * For movablemem_map=nn[KMG]@ss[KMG]: - * - * SRAT: |_____| |_____| |_________| |_________| ...... - * node id: 0 1 1 2 - * user specified: |__| |___| - * movablemem_map: |___| |_________| |______| ...... - * - * Using movablemem_map, we can prevent memblock from allocating memory - * on ZONE_MOVABLE at boot time. - * - * NOTE: In this case, SRAT info will be ingored. - */ - overlap = movablemem_map_overlap(start_pfn, end_pfn); - if (overlap >= 0) { - /* - * If part of this range is in movablemem_map, we need to - * add the range after it to extend the range to the end - * of the node, because from the min address specified to - * the end of the node will be ZONE_MOVABLE. - */ - start_pfn = max(start_pfn, - movablemem_map.map[overlap].start_pfn); - insert_movablemem_map(start_pfn, end_pfn); - - /* - * Set the nodemask, so that if the address range on one node - * is not continuse, we can add the subsequent ranges on the - * same node into movablemem_map. - */ - node_set(node, movablemem_map.numa_nodes_hotplug); - } else { - if (node_isset(node, movablemem_map.numa_nodes_hotplug)) - /* - * Insert the range if we already have movable ranges - * on the same node. - */ - insert_movablemem_map(start_pfn, end_pfn); - } -out: - return; -} -#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -static inline void -handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable) -{ -} -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ int __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { u64 start, end; - u32 hotpluggable; int node, pxm; if (srat_disabled()) @@ -269,8 +154,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) goto out_err_bad_srat; if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) goto out_err; - hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE; - if (hotpluggable && !save_add_info()) + if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) goto out_err; start = ma->base_address; @@ -290,12 +174,9 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n", + printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", node, pxm, - (unsigned long long) start, (unsigned long long) end - 1, - hotpluggable ? "Hot Pluggable": ""); - - handle_movablemem(node, start, end, hotpluggable); + (unsigned long long) start, (unsigned long long) end - 1); return 0; out_err_bad_srat: |