From 0812a579c92fefa57506821fa08e90f47cb6dbdd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:19 +0100 Subject: [PATCH] x86-64: Add __copy_from_user_nocache This does user copies in fs write() into the page cache with write combining. This pushes the destination out of the CPU's cache, but allows higher bandwidth in some case. The theory is that the page cache data is usually not touched by the CPU again and it's better to not pollute the cache with it. Also it is a little faster. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/x8664_ksyms.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index 6d77e4797a47..23a7da312f30 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -26,6 +26,7 @@ EXPORT_SYMBOL(__put_user_4); EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(copy_user_generic); +EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(copy_from_user); EXPORT_SYMBOL(copy_to_user); EXPORT_SYMBOL(__copy_from_user_inatomic); -- cgit v1.2.3 From 076422d2af7e3d8e72c6e70843f6ea377714b082 Mon Sep 17 00:00:00 2001 From: Amul Shah Date: Tue, 13 Feb 2007 13:26:19 +0100 Subject: [PATCH] x86-64: Allocate the NUMA hash function nodemap dynamically Remove the statically allocated memory to NUMA node hash map in favor of a dynamically allocated memory to node hash map (it is cache aligned). This patch has the nice side effect in that it allows the hash map to grow for systems with large amounts of memory (256GB - 1TB), but suffer from having small PCI space tacked onto the boot node (which is somewhere between 192MB to 512MB on the ES7000). Signed-off-by: Amul Shah Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Rohit Seth Signed-off-by: Andrew Morton --- arch/x86_64/kernel/e820.c | 7 +++++ arch/x86_64/kernel/setup.c | 5 +++ arch/x86_64/mm/numa.c | 74 +++++++++++++++++++++++++++++++++++++++------ include/asm-x86_64/e820.h | 1 + include/asm-x86_64/mmzone.h | 13 ++++---- 5 files changed, 85 insertions(+), 15 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 6fe191c58084..9d67955bbc31 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -83,6 +83,13 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) return 1; } +#ifdef CONFIG_NUMA + /* NUMA memory to node map */ + if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) { + *addrp = nodemap_addr + nodemap_size; + return 1; + } +#endif /* XXX ramdisk image here? */ return 0; } diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 60477244d1a3..f330f8285499 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -444,6 +444,11 @@ void __init setup_arch(char **cmdline_p) /* reserve ebda region */ if (ebda_addr) reserve_bootmem_generic(ebda_addr, ebda_size); +#ifdef CONFIG_NUMA + /* reserve nodemap region */ + if (nodemap_addr) + reserve_bootmem_generic(nodemap_addr, nodemap_size); +#endif #ifdef CONFIG_SMP /* diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 2ee2e003606c..7d9c428f4094 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -36,6 +36,8 @@ unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; int numa_off __initdata; +unsigned long __initdata nodemap_addr; +unsigned long __initdata nodemap_size; /* @@ -52,34 +54,87 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) int res = -1; unsigned long addr, end; - if (shift >= 64) - return -1; - memset(memnodemap, 0xff, sizeof(memnodemap)); + memset(memnodemap, 0xff, memnodemapsize); for (i = 0; i < numnodes; i++) { addr = nodes[i].start; end = nodes[i].end; if (addr >= end) continue; - if ((end >> shift) >= NODEMAPSIZE) + if ((end >> shift) >= memnodemapsize) return 0; do { if (memnodemap[addr >> shift] != 0xff) return -1; memnodemap[addr >> shift] = i; - addr += (1UL << shift); + addr += (1UL << shift); } while (addr < end); res = 1; } return res; } -int __init compute_hash_shift(struct bootnode *nodes, int numnodes) +static int __init allocate_cachealigned_memnodemap(void) +{ + unsigned long pad, pad_addr; + + memnodemap = memnode.embedded_map; + if (memnodemapsize <= 48) { + printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", + nodemap_addr, nodemap_addr + nodemap_size); + return 0; + } + + pad = L1_CACHE_BYTES - 1; + pad_addr = 0x8000; + nodemap_size = pad + memnodemapsize; + nodemap_addr = find_e820_area(pad_addr, end_pfn<= 0) - shift++; + for (i = 0; i < numnodes; i++) { + start = nodes[i].start; + end = nodes[i].end; + if (start >= end) + continue; + bitfield |= start | end; + if (end > memtop) + memtop = end; + } + i = find_first_bit(&bitfield, sizeof(unsigned long)*8); + memnodemapsize = (memtop >> i)+1; + return i; +} + +int __init compute_hash_shift(struct bootnode *nodes, int numnodes) +{ + int shift; + shift = extract_lsb_from_nodes(nodes, numnodes); + if (allocate_cachealigned_memnodemap()) + return -1; printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift); @@ -290,6 +345,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) end_pfn << PAGE_SHIFT); /* setup dummy node covering all memory */ memnode_shift = 63; + memnodemap = memnode.embedded_map; memnodemap[0] = 0; nodes_clear(node_online_map); node_set_online(0); diff --git a/include/asm-x86_64/e820.h b/include/asm-x86_64/e820.h index fa2086774105..855fb4a454b6 100644 --- a/include/asm-x86_64/e820.h +++ b/include/asm-x86_64/e820.h @@ -56,6 +56,7 @@ extern void finish_e820_parsing(void); extern struct e820map e820; extern unsigned ebda_addr, ebda_size; +extern unsigned long nodemap_addr, nodemap_size; #endif/*!__ASSEMBLY__*/ #endif/*__E820_HEADER*/ diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h index c38ebdf6f426..39ef106986eb 100644 --- a/include/asm-x86_64/mmzone.h +++ b/include/asm-x86_64/mmzone.h @@ -11,24 +11,25 @@ #include -/* Should really switch to dynamic allocation at some point */ -#define NODEMAPSIZE 0x4fff - /* Simple perfect hash to map physical addresses to node numbers */ struct memnode { int shift; - u8 map[NODEMAPSIZE]; -} ____cacheline_aligned; + unsigned int mapsize; + u8 *map; + u8 embedded_map[64-16]; +} ____cacheline_aligned; /* total size = 64 bytes */ extern struct memnode memnode; #define memnode_shift memnode.shift #define memnodemap memnode.map +#define memnodemapsize memnode.mapsize extern struct pglist_data *node_data[]; static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) { unsigned nid; - VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE); + VIRTUAL_BUG_ON(!memnodemap); + VIRTUAL_BUG_ON((addr >> memnode_shift) >= memnodemapsize); nid = memnodemap[addr >> memnode_shift]; VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); return nid; -- cgit v1.2.3 From 5558870bfbcca10cfc7b13ab866687012ea3c9af Mon Sep 17 00:00:00 2001 From: Karsten Weiss Date: Tue, 13 Feb 2007 13:26:21 +0100 Subject: [PATCH] x86-64: improved iommu documentation - add SWIOTLB config help text - mention Documentation/x86_64/boot-options.txt in Documentation/kernel-parameters.txt - remove the duplication of the iommu kernel parameter documentation. - Better explanation of some of the iommu kernel parameter options. - "32MB< Signed-off-by: Andi Kleen Acked-by: Muli Ben-Yehuda Cc: Andi Kleen Signed-off-by: Andrew Morton --- Documentation/kernel-parameters.txt | 3 + Documentation/x86_64/boot-options.txt | 109 +++++++++++++++++++++++----------- arch/x86_64/Kconfig | 10 +++- arch/x86_64/kernel/pci-dma.c | 28 ++------- 4 files changed, 90 insertions(+), 60 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index d25acd51e181..733a736bc6c8 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -104,6 +104,9 @@ loader, and have no meaning to the kernel directly. Do not modify the syntax of boot loader parameters without extreme need or coordination with . +There are also arch-specific kernel-parameters not documented here. +See for example . + Note that ALL kernel parameters listed below are CASE SENSITIVE, and that a trailing = on the name of any parameter states that that parameter will be entered as an environment variable, whereas its absence indicates that diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 5c86ed6f0448..0d653993f361 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt @@ -180,40 +180,81 @@ PCI pci=lastbus=NUMBER Scan upto NUMBER busses, no matter what the mptable says. pci=noacpi Don't use ACPI to set up PCI interrupt routing. -IOMMU - - iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge] - [,forcesac][,fullflush][,nomerge][,noaperture][,calgary] - size set size of iommu (in bytes) - noagp don't initialize the AGP driver and use full aperture. - off don't use the IOMMU - leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on) - memaper[=order] allocate an own aperture over RAM with size 32MB^order. - noforce don't force IOMMU usage. Default. - force Force IOMMU. - merge Do SG merging. Implies force (experimental) - nomerge Don't do SG merging. - forcesac For SAC mode for masks <40bits (experimental) - fullflush Flush IOMMU on each allocation (default) - nofullflush Don't use IOMMU fullflush - allowed overwrite iommu off workarounds for specific chipsets. - soft Use software bounce buffering (default for Intel machines) - noaperture Don't touch the aperture for AGP. - allowdac Allow DMA >4GB - When off all DMA over >4GB is forced through an IOMMU or bounce - buffering. - nodac Forbid DMA >4GB - panic Always panic when IOMMU overflows - calgary Use the Calgary IOMMU if it is available - - swiotlb=pages[,force] - - pages Prereserve that many 128K pages for the software IO bounce buffering. - force Force all IO through the software TLB. - - calgary=[64k,128k,256k,512k,1M,2M,4M,8M] - calgary=[translate_empty_slots] - calgary=[disable=] +IOMMU (input/output memory management unit) + + Currently four x86-64 PCI-DMA mapping implementations exist: + + 1. : use no hardware/software IOMMU at all + (e.g. because you have < 3 GB memory). + Kernel boot message: "PCI-DMA: Disabling IOMMU" + + 2. : AMD GART based hardware IOMMU. + Kernel boot message: "PCI-DMA: using GART IOMMU" + + 3. : Software IOMMU implementation. Used + e.g. if there is no hardware IOMMU in the system and it is need because + you have >3GB memory or told the kernel to us it (iommu=soft)) + Kernel boot message: "PCI-DMA: Using software bounce buffering + for IO (SWIOTLB)" + + 4. : IBM Calgary hardware IOMMU. Used in IBM + pSeries and xSeries servers. This hardware IOMMU supports DMA address + mapping with memory protection, etc. + Kernel boot message: "PCI-DMA: Using Calgary IOMMU" + + iommu=[][,noagp][,off][,force][,noforce][,leak[=] + [,memaper[=]][,merge][,forcesac][,fullflush][,nomerge] + [,noaperture][,calgary] + + General iommu options: + off Don't initialize and use any kind of IOMMU. + noforce Don't force hardware IOMMU usage when it is not needed. + (default). + force Force the use of the hardware IOMMU even when it is + not actually needed (e.g. because < 3 GB memory). + soft Use software bounce buffering (SWIOTLB) (default for + Intel machines). This can be used to prevent the usage + of an available hardware IOMMU. + + iommu options only relevant to the AMD GART hardware IOMMU: + Set the size of the remapping area in bytes. + allowed Overwrite iommu off workarounds for specific chipsets. + fullflush Flush IOMMU on each allocation (default). + nofullflush Don't use IOMMU fullflush. + leak Turn on simple iommu leak tracing (only when + CONFIG_IOMMU_LEAK is on). Default number of leak pages + is 20. + memaper[=] Allocate an own aperture over RAM with size 32MB<4GB. + DAC is used with 32-bit PCI to push a 64-bit address in + two cycles. When off all DMA over >4GB is forced through + an IOMMU or software bounce buffering. + nodac Forbid DAC mode, i.e. DMA >4GB. + panic Always panic when IOMMU overflows. + calgary Use the Calgary IOMMU if it is available + + iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU + implementation: + swiotlb=[,force] + Prereserve that many 128K pages for the software IO + bounce buffering. + force Force all IO through the software TLB. + + Settings for the IBM Calgary hardware IOMMU currently found in IBM + pSeries and xSeries machines: + + calgary=[64k,128k,256k,512k,1M,2M,4M,8M] + calgary=[translate_empty_slots] + calgary=[disable=] + panic Always panic when IOMMU overflows 64k,...,8M - Set the size of each PCI slot's translation table when using the Calgary IOMMU. This is the size of the translation diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 02dd39457bcf..a55382a1bb46 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -458,8 +458,8 @@ config IOMMU on systems with more than 3GB. This is usually needed for USB, sound, many IDE/SATA chipsets and some other devices. Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART - based IOMMU and a software bounce buffer based IOMMU used on Intel - systems and as fallback. + based hardware IOMMU and a software bounce buffer based IOMMU used + on Intel systems and as fallback. The code is only active when needed (enough memory and limited device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified too. @@ -496,6 +496,12 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT # need this always selected by IOMMU for the VIA workaround config SWIOTLB bool + help + Support for software bounce buffers used on x86-64 systems + which don't have a hardware IOMMU (e.g. the current generation + of Intel's x86-64 CPUs). Using this PCI devices which can only + access 32-bits of memory can be used on systems with more than + 3 GB of memory. If unsure, say Y. config X86_MCE bool "Machine check support" if EMBEDDED diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index 683b7a5c1ab3..651ccfb06697 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -223,30 +223,10 @@ int dma_set_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_mask); -/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge] - [,forcesac][,fullflush][,nomerge][,biomerge] - size set size of iommu (in bytes) - noagp don't initialize the AGP driver and use full aperture. - off don't use the IOMMU - leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on) - memaper[=order] allocate an own aperture over RAM with size 32MB^order. - noforce don't force IOMMU usage. Default. - force Force IOMMU. - merge Do lazy merging. This may improve performance on some block devices. - Implies force (experimental) - biomerge Do merging at the BIO layer. This is more efficient than merge, - but should be only done with very big IOMMUs. Implies merge,force. - nomerge Don't do SG merging. - forcesac For SAC mode for masks <40bits (experimental) - fullflush Flush IOMMU on each allocation (default) - nofullflush Don't use IOMMU fullflush - allowed overwrite iommu off workarounds for specific chipsets. - soft Use software bounce buffering (default for Intel machines) - noaperture Don't touch the aperture for AGP. - allowdac Allow DMA >4GB - nodac Forbid DMA >4GB - panic Force panic when IOMMU overflows -*/ +/* + * See for the iommu kernel parameter + * documentation. + */ __init int iommu_setup(char *p) { iommu_merge = 1; -- cgit v1.2.3 From 006e84ee3a54e393ec6bef2a9bc891dc5bde2843 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 13 Feb 2007 13:26:21 +0100 Subject: [PATCH] x86-64: do not always end the stack trace with ULONG_MAX It makes more sense to end the stack trace with ULONG_MAX only if nr_entries < max_entries. Otherwise, we lose one entry in the long stack traces and cannot know whether the trace was complete or not. Signed-off-by: Catalin Marinas Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Jan Beulich Signed-off-by: Andrew Morton --- arch/x86_64/kernel/stacktrace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c index 6026b31d037e..65ac2c6b34a6 100644 --- a/arch/x86_64/kernel/stacktrace.c +++ b/arch/x86_64/kernel/stacktrace.c @@ -32,7 +32,7 @@ static void save_stack_address(void *data, unsigned long addr) trace->skip--; return; } - if (trace->nr_entries < trace->max_entries - 1) + if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = addr; } @@ -49,7 +49,8 @@ static struct stacktrace_ops save_stack_ops = { void save_stack_trace(struct stack_trace *trace, struct task_struct *task) { dump_trace(task, NULL, NULL, &save_stack_ops, trace); - trace->entries[trace->nr_entries++] = ULONG_MAX; + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL(save_stack_trace); -- cgit v1.2.3 From 53fee04f318222a3179ca5933d8bda82c1eef17a Mon Sep 17 00:00:00 2001 From: Rohit Seth Date: Tue, 13 Feb 2007 13:26:22 +0100 Subject: [PATCH] x86-64: Fix fake numa for x86_64 machines with big IO hole This patch resolves the issue of running with numa=fake=X on kernel command line on x86_64 machines that have big IO hole. While calculating the size of each node now we look at the total hole size in that range. Previously there were nodes that only had IO holes in them causing kernel boot problems. We now use the NODE_MIN_SIZE (64MB) as the minimum size of memory that any node must have. We reduce the number of allocated nodes if the number of nodes specified on kernel command line results in any node getting memory smaller than NODE_MIN_SIZE. This change allows the extra memory to be incremented in NODE_MIN_SIZE granule and uniformly distribute among as many nodes (called big nodes) as possible. [akpm@osdl.org: build fix] Signed-off-by: David Rientjes Signed-off-by: Paul Menage Signed-off-by: Rohit Seth Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/kernel/e820.c | 31 +++++++++++++ arch/x86_64/mm/numa.c | 110 ++++++++++++++++++++++++++++++++++++++------ include/asm-x86_64/e820.h | 1 + include/asm-x86_64/mmzone.h | 5 ++ 4 files changed, 133 insertions(+), 14 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 9d67955bbc31..4651fd22b213 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -190,6 +190,37 @@ unsigned long __init e820_end_of_ram(void) return end_pfn; } +/* + * Find the hole size in the range. + */ +unsigned long __init e820_hole_size(unsigned long start, unsigned long end) +{ + unsigned long ram = 0; + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long last, addr; + + if (ei->type != E820_RAM || + ei->addr+ei->size <= start || + ei->addr >= end) + continue; + + addr = round_up(ei->addr, PAGE_SIZE); + if (addr < start) + addr = start; + + last = round_down(ei->addr + ei->size, PAGE_SIZE); + if (last >= end) + last = end; + + if (last > addr) + ram += last - addr; + } + return ((end - start) - ram); +} + /* * Mark e820 reserved areas as busy for the resource manager. */ diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 1ec16ea97519..d3f747dd61d3 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -272,31 +272,113 @@ void __init numa_init_array(void) } #ifdef CONFIG_NUMA_EMU +/* Numa emulation */ int numa_fake __initdata = 0; -/* Numa emulation */ +/* + * This function is used to find out if the start and end correspond to + * different zones. + */ +int zone_cross_over(unsigned long start, unsigned long end) +{ + if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) && + (end >= (MAX_DMA32_PFN << PAGE_SHIFT))) + return 1; + return 0; +} + static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) { - int i; + int i, big; struct bootnode nodes[MAX_NUMNODES]; - unsigned long sz = ((end_pfn - start_pfn)< 1) { - unsigned long x = 1; - while ((x << 1) < sz) - x <<= 1; - if (x < sz/2) - printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n"); - sz = x; - } + old_sz = sz; + /* + * Round down to the nearest FAKE_NODE_MIN_SIZE. + */ + sz &= FAKE_NODE_MIN_HASH_MASK; + + /* + * We ensure that each node is at least 64MB big. Smaller than this + * size can cause VM hiccups. + */ + if (sz == 0) { + printk(KERN_INFO "Not enough memory for %d nodes. Reducing " + "the number of nodes\n", numa_fake); + numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE; + printk(KERN_INFO "Number of fake nodes will be = %d\n", + numa_fake); + sz = FAKE_NODE_MIN_SIZE; + } + /* + * Find out how many nodes can get an extra NODE_MIN_SIZE granule. + * This logic ensures the extra memory gets distributed among as many + * nodes as possible (as compared to one single node getting all that + * extra memory. + */ + big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE; + printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: " + "%d\n", + (sz >> 20), (hole_size >> 20), big); memset(&nodes,0,sizeof(nodes)); + end = start; for (i = 0; i < numa_fake; i++) { - nodes[i].start = (start_pfn<= max_addr) { + numa_fake = i - 1; + break; + } + start = nodes[i].start = end; + /* + * Final node can have all the remaining memory. + */ if (i == numa_fake-1) - sz = (end_pfn<= max_addr) + break; + } + /* + * Look at the next node to make sure there is some real memory + * to map. Bad things happen when the only memory present + * in a zone on a fake node is IO hole. + */ + while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) { + if (zone_cross_over(start, end + sz)) { + end = (MAX_DMA32_PFN << PAGE_SHIFT); + break; + } + if (end >= max_addr) + break; + end += FAKE_NODE_MIN_SIZE; + } + if (end > max_addr) + end = max_addr; + nodes[i].end = end; printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", i, nodes[i].start, nodes[i].end, diff --git a/include/asm-x86_64/e820.h b/include/asm-x86_64/e820.h index 855fb4a454b6..6216fa3f2802 100644 --- a/include/asm-x86_64/e820.h +++ b/include/asm-x86_64/e820.h @@ -46,6 +46,7 @@ extern void e820_mark_nosave_regions(void); extern void e820_print_map(char *who); extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type); extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type); +extern unsigned long e820_hole_size(unsigned long start, unsigned long end); extern void e820_setup_gap(void); extern void e820_register_active_regions(int nid, diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h index 39ef106986eb..fb558fb1d211 100644 --- a/include/asm-x86_64/mmzone.h +++ b/include/asm-x86_64/mmzone.h @@ -47,5 +47,10 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) extern int pfn_valid(unsigned long pfn); #endif +#ifdef CONFIG_NUMA_EMU +#define FAKE_NODE_MIN_SIZE (64*1024*1024) +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1ul)) +#endif + #endif #endif -- cgit v1.2.3 From c49c5330c9592f29a69bb2ea8f6e7fd5d9c151e8 Mon Sep 17 00:00:00 2001 From: Glauber de Oliveira Costa Date: Tue, 13 Feb 2007 13:26:22 +0100 Subject: [PATCH] x86-64: Remove fastcall references in x86_64 code Unlike x86, x86_64 already passes arguments in registers. The use of regparm attribute makes no difference in produced code, and the use of fastcall just bloats the code. Signed-off-by: Glauber de Oliveira Costa Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/kernel/acpi/sleep.c | 2 +- arch/x86_64/kernel/x8664_ksyms.c | 4 ++-- include/asm-x86_64/hw_irq.h | 2 +- include/asm-x86_64/mutex.h | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c index 5ebf62c7a3d2..23178ce6c783 100644 --- a/arch/x86_64/kernel/acpi/sleep.c +++ b/arch/x86_64/kernel/acpi/sleep.c @@ -58,7 +58,7 @@ unsigned long acpi_wakeup_address = 0; unsigned long acpi_video_flags; extern char wakeup_start, wakeup_end; -extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); +extern unsigned long acpi_copy_wakeup_routine(unsigned long); static pgd_t low_ptr; diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index 23a7da312f30..0dffae69f4ad 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -35,8 +35,8 @@ EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); #ifdef CONFIG_SMP -extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); -extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); +extern void __write_lock_failed(rwlock_t *rw); +extern void __read_lock_failed(rwlock_t *rw); EXPORT_SYMBOL(__write_lock_failed); EXPORT_SYMBOL(__read_lock_failed); #endif diff --git a/include/asm-x86_64/hw_irq.h b/include/asm-x86_64/hw_irq.h index 179cce755aa7..552df5f10a6d 100644 --- a/include/asm-x86_64/hw_irq.h +++ b/include/asm-x86_64/hw_irq.h @@ -91,7 +91,7 @@ extern void enable_8259A_irq(unsigned int irq); extern int i8259A_irq_pending(unsigned int irq); extern void make_8259A_irq(unsigned int irq); extern void init_8259A(int aeoi); -extern void FASTCALL(send_IPI_self(int vector)); +extern void send_IPI_self(int vector); extern void init_VISWS_APIC_irqs(void); extern void setup_IO_APIC(void); extern void disable_IO_APIC(void); diff --git a/include/asm-x86_64/mutex.h b/include/asm-x86_64/mutex.h index 16396b1de3e4..6c2949a3c677 100644 --- a/include/asm-x86_64/mutex.h +++ b/include/asm-x86_64/mutex.h @@ -21,7 +21,7 @@ do { \ unsigned long dummy; \ \ typecheck(atomic_t *, v); \ - typecheck_fn(fastcall void (*)(atomic_t *), fail_fn); \ + typecheck_fn(void (*)(atomic_t *), fail_fn); \ \ __asm__ __volatile__( \ LOCK_PREFIX " decl (%%rdi) \n" \ @@ -47,7 +47,7 @@ do { \ */ static inline int __mutex_fastpath_lock_retval(atomic_t *count, - int fastcall (*fail_fn)(atomic_t *)) + int (*fail_fn)(atomic_t *)) { if (unlikely(atomic_dec_return(count) < 0)) return fail_fn(count); @@ -67,7 +67,7 @@ do { \ unsigned long dummy; \ \ typecheck(atomic_t *, v); \ - typecheck_fn(fastcall void (*)(atomic_t *), fail_fn); \ + typecheck_fn(void (*)(atomic_t *), fail_fn); \ \ __asm__ __volatile__( \ LOCK_PREFIX " incl (%%rdi) \n" \ -- cgit v1.2.3 From 4c3cbf75b262433afc90b5c35510d1e5744d3b94 Mon Sep 17 00:00:00 2001 From: Glauber de Oliveira Costa Date: Tue, 13 Feb 2007 13:26:22 +0100 Subject: [PATCH] x86-64: Use constant instead of raw number in x86_64 ioperm.c This is a tiny cleanup to increase readability Signed-off-by: Glauber de Oliveira Costa Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/kernel/ioport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c index fe063d3cfe42..745b1f0f494e 100644 --- a/arch/x86_64/kernel/ioport.c +++ b/arch/x86_64/kernel/ioport.c @@ -114,6 +114,6 @@ asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) if (!capable(CAP_SYS_RAWIO)) return -EPERM; } - regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12); + regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12); return 0; } -- cgit v1.2.3 From 1676193937a538fdb92a2916a86a705093cfd613 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 13 Feb 2007 13:26:22 +0100 Subject: [PATCH] x86-64: Handle 32 bit PerfMon Counter writes cleanly in x86_64 nmi_watchdog P6 CPUs and Core/Core 2 CPUs which has 'architectural perf mon' feature, only supports write of low 32 bits in Performance Monitoring Counters. Bits 32..39 are sign extended based on bit 31 and bits 40..63 are reserved and should be zero. This patch: Change x86_64 nmi handler to handle this case cleanly. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Andi Kleen --- arch/x86_64/kernel/nmi.c | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 9cb42ecb7f89..e59cda134166 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -214,6 +214,23 @@ static __init void nmi_cpu_busy(void *data) } #endif +static unsigned int adjust_for_32bit_ctr(unsigned int hz) +{ + unsigned int retval = hz; + + /* + * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter + * are writable, with higher bits sign extending from bit 31. + * So, we can only program the counter with 31 bit values and + * 32nd bit should be 1, for 33.. to be 1. + * Find the appropriate nmi_hz + */ + if ((((u64)cpu_khz * 1000) / retval) > 0x7fffffffULL) { + retval = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1; + } + return retval; +} + int __init check_nmi_watchdog (void) { int *counts; @@ -268,17 +285,8 @@ int __init check_nmi_watchdog (void) struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); nmi_hz = 1; - /* - * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter - * are writable, with higher bits sign extending from bit 31. - * So, we can only program the counter with 31 bit values and - * 32nd bit should be 1, for 33.. to be 1. - * Find the appropriate nmi_hz - */ - if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 && - ((u64)cpu_khz * 1000) > 0x7fffffffULL) { - nmi_hz = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1; - } + if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) + nmi_hz = adjust_for_32bit_ctr(nmi_hz); } kfree(counts); @@ -634,7 +642,9 @@ static int setup_intel_arch_watchdog(void) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); - wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); + + nmi_hz = adjust_for_32bit_ctr(nmi_hz); + wrmsr(perfctr_msr, (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -855,15 +865,23 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) dummy &= ~P4_CCCR_OVF; wrmsrl(wd->cccr_msr, dummy); apic_write(APIC_LVTPC, APIC_DM_NMI); + /* start the cycle over again */ + wrmsrl(wd->perfctr_msr, + -((u64)cpu_khz * 1000 / nmi_hz)); } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { /* * ArchPerfom/Core Duo needs to re-unmask * the apic vector */ apic_write(APIC_LVTPC, APIC_DM_NMI); + /* ARCH_PERFMON has 32 bit counter writes */ + wrmsr(wd->perfctr_msr, + (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0); + } else { + /* start the cycle over again */ + wrmsrl(wd->perfctr_msr, + -((u64)cpu_khz * 1000 / nmi_hz)); } - /* start the cycle over again */ - wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); rc = 1; } else if (nmi_watchdog == NMI_IO_APIC) { /* don't know how to accurately check for this. -- cgit v1.2.3 From 24ce0e96f2dea558762c994d054ea2f3c01fa95a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 13 Feb 2007 13:26:23 +0100 Subject: [PATCH] x86-64: Tighten mce_amd driver MSR reads while debugging an unrelated problem in Xen, I noticed odd reads from non-existent MSRs. Having now found time to look why these happen, I came up with below patch, which - prevents accessing MCi_MISCj with j > 0 when the block pointer in MCi_MISC0 is zero - accesses only contiguous MCi_MISCj until a non-implemented one is found - doesn't touch unimplemented blocks in mce_threshold_interrupt at all - gives names to two bits previously derived from MASK_VALID_HI (it took me some time to understand the code without this) The first three items, besides being apparently closer to the spec, should namely help cutting down on the time mce_threshold_interrupt() takes. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/mce_amd.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index 93c707257637..cd8dbe57b33a 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -37,6 +37,8 @@ #define THRESHOLD_MAX 0xFFF #define INT_TYPE_APIC 0x00020000 #define MASK_VALID_HI 0x80000000 +#define MASK_CNTP_HI 0x40000000 +#define MASK_LOCKED_HI 0x20000000 #define MASK_LVTOFF_HI 0x00F00000 #define MASK_COUNT_EN_HI 0x00080000 #define MASK_INT_TYPE_HI 0x00060000 @@ -122,14 +124,17 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) for (block = 0; block < NR_BLOCKS; ++block) { if (block == 0) address = MSR_IA32_MC0_MISC + bank * 4; - else if (block == 1) - address = MCG_XBLK_ADDR - + ((low & MASK_BLKPTR_LO) >> 21); + else if (block == 1) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + break; + address += MCG_XBLK_ADDR; + } else ++address; if (rdmsr_safe(address, &low, &high)) - continue; + break; if (!(high & MASK_VALID_HI)) { if (block) @@ -138,8 +143,8 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) break; } - if (!(high & MASK_VALID_HI >> 1) || - (high & MASK_VALID_HI >> 2)) + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) continue; if (!block) @@ -187,17 +192,22 @@ asmlinkage void mce_threshold_interrupt(void) /* assume first bank caused it */ for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) + continue; for (block = 0; block < NR_BLOCKS; ++block) { if (block == 0) address = MSR_IA32_MC0_MISC + bank * 4; - else if (block == 1) - address = MCG_XBLK_ADDR - + ((low & MASK_BLKPTR_LO) >> 21); + else if (block == 1) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + break; + address += MCG_XBLK_ADDR; + } else ++address; if (rdmsr_safe(address, &low, &high)) - continue; + break; if (!(high & MASK_VALID_HI)) { if (block) @@ -206,8 +216,8 @@ asmlinkage void mce_threshold_interrupt(void) break; } - if (!(high & MASK_VALID_HI >> 1) || - (high & MASK_VALID_HI >> 2)) + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) continue; if (high & MASK_OVERFLOW_HI) { @@ -385,7 +395,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, return 0; if (rdmsr_safe(address, &low, &high)) - goto recurse; + return 0; if (!(high & MASK_VALID_HI)) { if (block) @@ -394,8 +404,8 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, return 0; } - if (!(high & MASK_VALID_HI >> 1) || - (high & MASK_VALID_HI >> 2)) + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) goto recurse; b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); -- cgit v1.2.3 From a98f0dd34d94ea0b5f3816196bea5dba467827bb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:23 +0100 Subject: [PATCH] x86-64: Allow to run a program when a machine check event is detected When a machine check event is detected (including a AMD RevF threshold overflow event) allow to run a "trigger" program. This allows user space to react to such events sooner. The trigger is configured using a new trigger entry in the machinecheck sysfs interface. It is currently shared between all CPUs. I also fixed the AMD threshold handler to run the machine check polling code immediately to actually log any events that might have caused the threshold interrupt. Also added some documentation for the mce sysfs interface. Signed-off-by: Andi Kleen --- Documentation/x86_64/machinecheck | 70 +++++++++++++++++++++++++++++++++++++++ arch/x86_64/kernel/mce.c | 66 +++++++++++++++++++++++++++++------- arch/x86_64/kernel/mce_amd.c | 4 +++ include/asm-x86_64/mce.h | 2 ++ kernel/kmod.c | 44 ++++++++++++++++-------- 5 files changed, 160 insertions(+), 26 deletions(-) create mode 100644 Documentation/x86_64/machinecheck (limited to 'arch/x86_64/kernel') diff --git a/Documentation/x86_64/machinecheck b/Documentation/x86_64/machinecheck new file mode 100644 index 000000000000..068a6d9904b9 --- /dev/null +++ b/Documentation/x86_64/machinecheck @@ -0,0 +1,70 @@ + +Configurable sysfs parameters for the x86-64 machine check code. + +Machine checks report internal hardware error conditions detected +by the CPU. Uncorrected errors typically cause a machine check +(often with panic), corrected ones cause a machine check log entry. + +Machine checks are organized in banks (normally associated with +a hardware subsystem) and subevents in a bank. The exact meaning +of the banks and subevent is CPU specific. + +mcelog knows how to decode them. + +When you see the "Machine check errors logged" message in the system +log then mcelog should run to collect and decode machine check entries +from /dev/mcelog. Normally mcelog should be run regularly from a cronjob. + +Each CPU has a directory in /sys/devices/system/machinecheck/machinecheckN +(N = CPU number) + +The directory contains some configurable entries: + +Entries: + +bankNctl +(N bank number) + 64bit Hex bitmask enabling/disabling specific subevents for bank N + When a bit in the bitmask is zero then the respective + subevent will not be reported. + By default all events are enabled. + Note that BIOS maintain another mask to disable specific events + per bank. This is not visible here + +The following entries appear for each CPU, but they are truly shared +between all CPUs. + +check_interval + How often to poll for corrected machine check errors, in seconds + (Note output is hexademical). Default 5 minutes. + +tolerant + Tolerance level. When a machine check exception occurs for a non + corrected machine check the kernel can take different actions. + Since machine check exceptions can happen any time it is sometimes + risky for the kernel to kill a process because it defies + normal kernel locking rules. The tolerance level configures + how hard the kernel tries to recover even at some risk of deadlock. + + 0: always panic, + 1: panic if deadlock possible, + 2: try to avoid panic, + 3: never panic or exit (for testing only) + + Default: 1 + + Note this only makes a difference if the CPU allows recovery + from a machine check exception. Current x86 CPUs generally do not. + +trigger + Program to run when a machine check event is detected. + This is an alternative to running mcelog regularly from cron + and allows to detect events faster. + +TBD document entries for AMD threshold interrupt configuration + +For more details about the x86 machine check architecture +see the Intel and AMD architecture manuals from their developer websites. + +For more details about the architecture see +see http://one.firstfloor.org/~andi/mce.pdf diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index bdb54a2c9f18..8011a8e1c7d4 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +43,10 @@ static unsigned long console_logged; static int notify_user; static int rip_msr; static int mce_bootlog = 1; +static atomic_t mce_events; + +static char trigger[128]; +static char *trigger_argv[2] = { trigger, NULL }; /* * Lockless MCE logging infrastructure. @@ -57,6 +62,7 @@ struct mce_log mcelog = { void mce_log(struct mce *mce) { unsigned next, entry; + atomic_inc(&mce_events); mce->finished = 0; wmb(); for (;;) { @@ -161,6 +167,17 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } } +static void do_mce_trigger(void) +{ + static atomic_t mce_logged; + int events = atomic_read(&mce_events); + if (events != atomic_read(&mce_logged) && trigger[0]) { + /* Small race window, but should be harmless. */ + atomic_set(&mce_logged, events); + call_usermodehelper(trigger, trigger_argv, NULL, -1); + } +} + /* * The actual machine check handler */ @@ -234,8 +251,12 @@ void do_machine_check(struct pt_regs * regs, long error_code) } /* Never do anything final in the polling timer */ - if (!regs) + if (!regs) { + /* Normal interrupt context here. Call trigger for any new + events. */ + do_mce_trigger(); goto out; + } /* If we didn't find an uncorrectable error, pick the last one (shouldn't happen, just being safe). */ @@ -606,17 +627,42 @@ DEFINE_PER_CPU(struct sys_device, device_mce); } \ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); +/* TBD should generate these dynamically based on number of available banks */ ACCESSOR(bank0ctl,bank[0],mce_restart()) ACCESSOR(bank1ctl,bank[1],mce_restart()) ACCESSOR(bank2ctl,bank[2],mce_restart()) ACCESSOR(bank3ctl,bank[3],mce_restart()) ACCESSOR(bank4ctl,bank[4],mce_restart()) ACCESSOR(bank5ctl,bank[5],mce_restart()) -static struct sysdev_attribute * bank_attributes[NR_BANKS] = { - &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, - &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl}; + +static ssize_t show_trigger(struct sys_device *s, char *buf) +{ + strcpy(buf, trigger); + strcat(buf, "\n"); + return strlen(trigger) + 1; +} + +static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) +{ + char *p; + int len; + strncpy(trigger, buf, sizeof(trigger)); + trigger[sizeof(trigger)-1] = 0; + len = strlen(trigger); + p = strchr(trigger, '\n'); + if (*p) *p = 0; + return len; +} + +static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); ACCESSOR(tolerant,tolerant,) ACCESSOR(check_interval,check_interval,mce_restart()) +static struct sysdev_attribute *mce_attributes[] = { + &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, + &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, + &attr_tolerant, &attr_check_interval, &attr_trigger, + NULL +}; /* Per cpu sysdev init. All of the cpus still share the same ctl bank */ static __cpuinit int mce_create_device(unsigned int cpu) @@ -632,11 +678,9 @@ static __cpuinit int mce_create_device(unsigned int cpu) err = sysdev_register(&per_cpu(device_mce,cpu)); if (!err) { - for (i = 0; i < banks; i++) + for (i = 0; mce_attributes[i]; i++) sysdev_create_file(&per_cpu(device_mce,cpu), - bank_attributes[i]); - sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant); - sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval); + mce_attributes[i]); } return err; } @@ -645,11 +689,9 @@ static void mce_remove_device(unsigned int cpu) { int i; - for (i = 0; i < banks; i++) + for (i = 0; mce_attributes[i]; i++) sysdev_remove_file(&per_cpu(device_mce,cpu), - bank_attributes[i]); - sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant); - sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval); + mce_attributes[i]); sysdev_unregister(&per_cpu(device_mce,cpu)); memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); } diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index cd8dbe57b33a..d0bd5d66e103 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -220,6 +220,10 @@ asmlinkage void mce_threshold_interrupt(void) (high & MASK_LOCKED_HI)) continue; + /* Log the machine check that caused the threshold + event. */ + do_machine_check(NULL, 0); + if (high & MASK_OVERFLOW_HI) { rdmsrl(address, m.misc); rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, diff --git a/include/asm-x86_64/mce.h b/include/asm-x86_64/mce.h index 5a11146d6d9c..177e92b4019b 100644 --- a/include/asm-x86_64/mce.h +++ b/include/asm-x86_64/mce.h @@ -103,6 +103,8 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status); extern atomic_t mce_entry; +extern void do_machine_check(struct pt_regs *, long); + #endif #endif diff --git a/kernel/kmod.c b/kernel/kmod.c index 3a7379aa31ca..796276141e51 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -217,7 +217,10 @@ static int wait_for_helper(void *data) sub_info->retval = ret; } - complete(sub_info->complete); + if (sub_info->wait < 0) + kfree(sub_info); + else + complete(sub_info->complete); return 0; } @@ -239,6 +242,9 @@ static void __call_usermodehelper(struct work_struct *work) pid = kernel_thread(____call_usermodehelper, sub_info, CLONE_VFORK | SIGCHLD); + if (wait < 0) + return; + if (pid < 0) { sub_info->retval = pid; complete(sub_info->complete); @@ -253,6 +259,9 @@ static void __call_usermodehelper(struct work_struct *work) * @envp: null-terminated environment list * @session_keyring: session keyring for process (NULL for an empty keyring) * @wait: wait for the application to finish and return status. + * when -1 don't wait at all, but you get no useful error back when + * the program couldn't be exec'ed. This makes it safe to call + * from interrupt context. * * Runs a user-space application. The application is started * asynchronously if wait is not set, and runs as a child of keventd. @@ -265,17 +274,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, struct key *session_keyring, int wait) { DECLARE_COMPLETION_ONSTACK(done); - struct subprocess_info sub_info = { - .work = __WORK_INITIALIZER(sub_info.work, - __call_usermodehelper), - .complete = &done, - .path = path, - .argv = argv, - .envp = envp, - .ring = session_keyring, - .wait = wait, - .retval = 0, - }; + struct subprocess_info *sub_info; + int retval; if (!khelper_wq) return -EBUSY; @@ -283,9 +283,25 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, if (path[0] == '\0') return 0; - queue_work(khelper_wq, &sub_info.work); + sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); + if (!sub_info) + return -ENOMEM; + + INIT_WORK(&sub_info->work, __call_usermodehelper); + sub_info->complete = &done; + sub_info->path = path; + sub_info->argv = argv; + sub_info->envp = envp; + sub_info->ring = session_keyring; + sub_info->wait = wait; + + queue_work(khelper_wq, &sub_info->work); + if (wait < 0) /* task has freed sub_info */ + return 0; wait_for_completion(&done); - return sub_info.retval; + retval = sub_info->retval; + kfree(sub_info); + return retval; } EXPORT_SYMBOL(call_usermodehelper_keys); -- cgit v1.2.3 From 2f7a2a79c3ebb44f8b1b7d9b4fd3a650eb69e544 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH] x86-64: - Ignore long SMI interrupts in clock calibration code - update 1 Add failsafe mechanism to HPET/TSC clock calibration. Signed-off-by: Jack Steiner Updated to include failsafe mechanism & additional community feedback. Patch built on latest 2.6.20-rc4-mm1 tree. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/time.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 335cc91c49b7..9c7fba30dac1 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -657,6 +657,7 @@ core_initcall(cpufreq_tsc); #define TICK_COUNT 100000000 #define TICK_MIN 5000 +#define MAX_READ_RETRIES 5 /* * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none @@ -664,13 +665,17 @@ core_initcall(cpufreq_tsc); */ static void __init read_hpet_tsc(int *hpet, int *tsc) { - int tsc1, tsc2, hpet1; + int tsc1, tsc2, hpet1, retries = 0; + static int msg; do { tsc1 = get_cycles_sync(); hpet1 = hpet_readl(HPET_COUNTER); tsc2 = get_cycles_sync(); - } while (tsc2 - tsc1 > TICK_MIN); + } while (tsc2 - tsc1 > TICK_MIN && retries++ < MAX_READ_RETRIES); + if (retries >= MAX_READ_RETRIES && !msg++) + printk(KERN_WARNING + "hpet.c: exceeded max retries to read HPET & TSC\n"); *hpet = hpet1; *tsc = tsc2; } -- cgit v1.2.3 From f49481bc50fce428521497977861b8115666dbe7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH] x86-64: Check return value of putreg in PTRACE_SETREGS This means if an illegal value is set for the segment registers there ptrace will error out now with an errno instead of silently ignoring it. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/ptrace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index addc14af0c56..4326a690a509 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -536,8 +536,12 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) } ret = 0; for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { - ret |= __get_user(tmp, (unsigned long __user *) data); - putreg(child, ui, tmp); + ret = __get_user(tmp, (unsigned long __user *) data); + if (ret) + break; + ret = putreg(child, ui, tmp); + if (ret) + break; data += sizeof(long); } break; -- cgit v1.2.3 From 9a11ff68273f440b1d33fcc4d550ffc881e6a0b4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH] x86-64: Unexport __supported_pte_mask The symbol is needed to manipulate page tables, and modules shouldn't do that. Leftover from 2.4, but no in tree module should need it now. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup64.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 8c4b80fe71a1..6a70b55f719d 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -37,7 +37,6 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); unsigned long __supported_pte_mask __read_mostly = ~0UL; -EXPORT_SYMBOL(__supported_pte_mask); static int do_not_nx __cpuinitdata = 0; /* noexec=on|off -- cgit v1.2.3 From ffb6017563aa15f9a8cff9a30b861d42c2695894 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH] x86-64: x86_64 - Fix FS/GS registers for VT execution Initialize FS and GS to __KERNEL_DS as well. The actual value of them is not important, but it is important to reload them in protected mode. At this time, they still retain the real mode values from initial boot. VT disallows execution of code under such conditions, which means hardware virtualization can not be used to boot the kernel on Intel platforms, making the boot time painfully slow. This requires moving the GS load before the load of GS_BASE, so just move all the segments loads there to keep them together in the code. Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 1e6f80870679..598a4d0351fc 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -163,6 +163,20 @@ startup_64: */ lgdt cpu_gdt_descr + /* set up data segments. actually 0 would do too */ + movl $__KERNEL_DS,%eax + movl %eax,%ds + movl %eax,%ss + movl %eax,%es + + /* + * We don't really need to load %fs or %gs, but load them anyway + * to kill any stale realmode selectors. This allows execution + * under VT hardware. + */ + movl %eax,%fs + movl %eax,%gs + /* * Setup up a dummy PDA. this is just for some early bootup code * that does in_interrupt() @@ -173,12 +187,6 @@ startup_64: shrq $32,%rdx wrmsr - /* set up data segments. actually 0 would do too */ - movl $__KERNEL_DS,%eax - movl %eax,%ds - movl %eax,%ss - movl %eax,%es - /* esi is pointer to real mode structure with interesting info. pass it to C */ movl %esi, %edi -- cgit v1.2.3 From 00edefae050c2c2d1e26fa9984f8f529fbc45989 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH] x86-64: Fix off by one error in IOMMU boundary checking Should be harmless because there is normally no memory there, but technically it was incorrect. Pointed out by Leo Duran Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-gart.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index fc1960f1f243..030eb3753358 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -185,7 +185,7 @@ static void iommu_full(struct device *dev, size_t size, int dir) static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) { u64 mask = *dev->dma_mask; - int high = addr + size >= mask; + int high = addr + size > mask; int mmu = high; if (force_iommu) mmu = 1; @@ -195,7 +195,7 @@ static inline int need_iommu(struct device *dev, unsigned long addr, size_t size static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) { u64 mask = *dev->dma_mask; - int high = addr + size >= mask; + int high = addr + size > mask; int mmu = high; return mmu; } -- cgit v1.2.3 From fc986db4fc1e773e240a19bc8b407ead88982cea Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH] x86-64: Don't reserve ROMs We trust the e820 table, so explicitely reserving ROMs shouldn't be needed. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup.c | 130 +-------------------------------------------- 1 file changed, 2 insertions(+), 128 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index f330f8285499..e82c1a155af5 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -138,128 +138,6 @@ struct resource code_resource = { .flags = IORESOURCE_RAM, }; -#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM) - -static struct resource system_rom_resource = { - .name = "System ROM", - .start = 0xf0000, - .end = 0xfffff, - .flags = IORESOURCE_ROM, -}; - -static struct resource extension_rom_resource = { - .name = "Extension ROM", - .start = 0xe0000, - .end = 0xeffff, - .flags = IORESOURCE_ROM, -}; - -static struct resource adapter_rom_resources[] = { - { .name = "Adapter ROM", .start = 0xc8000, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM } -}; - -static struct resource video_rom_resource = { - .name = "Video ROM", - .start = 0xc0000, - .end = 0xc7fff, - .flags = IORESOURCE_ROM, -}; - -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_RAM, -}; - -#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) - -static int __init romchecksum(unsigned char *rom, unsigned long length) -{ - unsigned char *p, sum = 0; - - for (p = rom; p < rom + length; p++) - sum += *p; - return sum == 0; -} - -static void __init probe_roms(void) -{ - unsigned long start, length, upper; - unsigned char *rom; - int i; - - /* video rom */ - upper = adapter_rom_resources[0].start; - for (start = video_rom_resource.start; start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - video_rom_resource.start = start; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* if checksum okay, trust length byte */ - if (length && romchecksum(rom, length)) - video_rom_resource.end = start + length - 1; - - request_resource(&iomem_resource, &video_rom_resource); - break; - } - - start = (video_rom_resource.end + 1 + 2047) & ~2047UL; - if (start < upper) - start = upper; - - /* system rom */ - request_resource(&iomem_resource, &system_rom_resource); - upper = system_rom_resource.start; - - /* check for extension rom (ignore length byte!) */ - rom = isa_bus_to_virt(extension_rom_resource.start); - if (romsignature(rom)) { - length = extension_rom_resource.end - extension_rom_resource.start + 1; - if (romchecksum(rom, length)) { - request_resource(&iomem_resource, &extension_rom_resource); - upper = extension_rom_resource.start; - } - } - - /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; - start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* but accept any length that fits if checksum okay */ - if (!length || start + length > upper || !romchecksum(rom, length)) - continue; - - adapter_rom_resources[i].start = start; - adapter_rom_resources[i].end = start + length - 1; - request_resource(&iomem_resource, &adapter_rom_resources[i]); - - start = adapter_rom_resources[i++].end & ~2047UL; - } -} - #ifdef CONFIG_PROC_VMCORE /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. This option will be passed @@ -524,15 +402,11 @@ void __init setup_arch(char **cmdline_p) init_apic_mappings(); /* - * Request address space for all standard RAM and ROM resources - * and also for regions reported as reserved by the e820. - */ - probe_roms(); + * We trust e820 completely. No explicit ROM probing in memory. + */ e820_reserve_resources(); e820_mark_nosave_regions(); - request_resource(&iomem_resource, &video_ram_resource); - { unsigned i; /* request I/O space for devices used on all i[345]86 PCs */ -- cgit v1.2.3 From 310adfdd9153f6ae818981a38a48dd2330990d8d Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH] x86-64: robustify bad_dma_address handling - set bad_dma_address explicitly to 0x0 - reserve 32 pages from bad_dma_address and up - WARN_ON() a driver feeding us bad_dma_address Thanks to Leo Duran for the suggestion. Signed-off-by: Muli Ben-Yehuda Signed-off-by: Andi Kleen Cc: Leo Duran Cc: Job Mason --- arch/x86_64/kernel/pci-calgary.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 3d65b1d4c2b3..04480c3b68f5 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -138,6 +138,8 @@ static const unsigned long phb_debug_offsets[] = { #define PHB_DEBUG_STUFF_OFFSET 0x0020 +#define EMERGENCY_PAGES 32 /* = 128KB */ + unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; static int translate_empty_slots __read_mostly = 0; static int calgary_detected __read_mostly = 0; @@ -296,6 +298,16 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, { unsigned long entry; unsigned long badbit; + unsigned long badend; + + /* were we called with bad_dma_address? */ + badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); + if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { + printk(KERN_ERR "Calgary: driver tried unmapping bad DMA " + "address 0x%Lx\n", dma_addr); + WARN_ON(1); + return; + } entry = dma_addr >> PAGE_SHIFT; @@ -656,8 +668,8 @@ static void __init calgary_reserve_regions(struct pci_dev *dev) u64 start; struct iommu_table *tbl = dev->sysdata; - /* reserve bad_dma_address in case it's a legal address */ - iommu_range_reserve(tbl, bad_dma_address, 1); + /* reserve EMERGENCY_PAGES from bad_dma_address and up */ + iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); /* avoid the BIOS/VGA first 640KB-1MB region */ start = (640 * 1024); @@ -1176,6 +1188,7 @@ int __init calgary_iommu_init(void) } force_iommu = 1; + bad_dma_address = 0x0; dma_ops = &calgary_dma_ops; return 0; -- cgit v1.2.3 From 5d0e600d903caa09e790824cc5812f0d97113b23 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH] x86: fix laptop bootup hang in init_acpi() During kernel bootup, a new T60 laptop (CoreDuo, 32-bit) hangs about 10%-20% of the time in acpi_init(): Calling initcall 0xc055ce1a: topology_init+0x0/0x2f() Calling initcall 0xc055d75e: mtrr_init_finialize+0x0/0x2c() Calling initcall 0xc05664f3: param_sysfs_init+0x0/0x175() Calling initcall 0xc014cb65: pm_sysrq_init+0x0/0x17() Calling initcall 0xc0569f99: init_bio+0x0/0xf4() Calling initcall 0xc056b865: genhd_device_init+0x0/0x50() Calling initcall 0xc056c4bd: fbmem_init+0x0/0x87() Calling initcall 0xc056dd74: acpi_init+0x0/0x1ee() It's a hard hang that not even an NMI could punch through! Frustratingly, adding printks or function tracing to the ACPI code made the hangs go away ... After some time an additional detail emerged: disabling the NMI watchdog made these occasional hangs go away. So i spent the better part of today trying to debug this and trying out various theories when i finally found the likely reason for the hang: if acpi_ns_initialize_devices() executes an _INI AML method and an NMI happens to hit that AML execution in the wrong moment, the machine would hang. (my theory is that this must be some sort of chipset setup method doing stores to chipset mmio registers?) Unfortunately given the characteristics of the hang it was sheer impossible to figure out which of the numerous AML methods is impacted by this problem. As a workaround i wrote an interface to disable chipset-based NMIs while executing _INI sections - and indeed this fixed the hang. I did a boot-loop of 100 separate reboots and none hung - while without the patch it would hang every 5-10 attempts. Out of caution i did not touch the nmi_watchdog=2 case (it's not related to the chipset anyway and didnt hang). I implemented this for both x86_64 and i686, tested the i686 laptop both with nmi_watchdog=1 [which triggered the hangs] and nmi_watchdog=2, and tested an Athlon64 box with the 64-bit kernel as well. Everything builds and works with the patch applied. Signed-off-by: Ingo Molnar Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Len Brown Signed-off-by: Andrew Morton --- arch/i386/kernel/nmi.c | 28 ++++++++++++++++++++++++++++ arch/x86_64/kernel/nmi.c | 27 +++++++++++++++++++++++++++ drivers/acpi/namespace/nsinit.c | 9 +++++++++ include/linux/nmi.h | 9 ++++++++- 4 files changed, 72 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 7d3f4e22d6fb..b11abacc5cfd 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -383,6 +383,34 @@ void enable_timer_nmi_watchdog(void) } } +static void __acpi_nmi_disable(void *__unused) +{ + apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +} + +/* + * Disable timer based NMIs on all CPUs: + */ +void acpi_nmi_disable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); +} + +static void __acpi_nmi_enable(void *__unused) +{ + apic_write_around(APIC_LVT0, APIC_DM_NMI); +} + +/* + * Enable timer based NMIs on all CPUs: + */ +void acpi_nmi_enable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); +} + #ifdef CONFIG_PM static int nmi_pm_active; /* nmi_active before suspend */ diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index e59cda134166..269fe585e71e 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -368,6 +368,33 @@ void enable_timer_nmi_watchdog(void) } } +static void __acpi_nmi_disable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +} + +/* + * Disable timer based NMIs on all CPUs: + */ +void acpi_nmi_disable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); +} + +static void __acpi_nmi_enable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI); +} + +/* + * Enable timer based NMIs on all CPUs: + */ +void acpi_nmi_enable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); +} #ifdef CONFIG_PM static int nmi_pm_active; /* nmi_active before suspend */ diff --git a/drivers/acpi/namespace/nsinit.c b/drivers/acpi/namespace/nsinit.c index 326af8fc0ce7..33db2241044e 100644 --- a/drivers/acpi/namespace/nsinit.c +++ b/drivers/acpi/namespace/nsinit.c @@ -45,6 +45,7 @@ #include #include #include +#include #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsinit") @@ -534,7 +535,15 @@ acpi_ns_init_one_device(acpi_handle obj_handle, info->parameter_type = ACPI_PARAM_ARGS; info->flags = ACPI_IGNORE_RETURN_VALUE; + /* + * Some hardware relies on this being executed as atomically + * as possible (without an NMI being received in the middle of + * this) - so disable NMIs and initialize the device: + */ + acpi_nmi_disable(); status = acpi_ns_evaluate(info); + acpi_nmi_enable(); + if (ACPI_SUCCESS(status)) { walk_info->num_INI++; diff --git a/include/linux/nmi.h b/include/linux/nmi.h index acb4ed130247..29af2d5df097 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -17,8 +17,15 @@ #ifdef ARCH_HAS_NMI_WATCHDOG #include extern void touch_nmi_watchdog(void); +extern void acpi_nmi_disable(void); +extern void acpi_nmi_enable(void); #else -# define touch_nmi_watchdog() touch_softlockup_watchdog() +static inline void touch_nmi_watchdog(void) +{ + touch_softlockup_watchdog(); +} +static inline void acpi_nmi_disable(void) { } +static inline void acpi_nmi_enable(void) { } #endif #ifndef trigger_all_cpu_backtrace -- cgit v1.2.3 From ee4eff6ff6cbfc8ce38131058a18802bf6206879 Mon Sep 17 00:00:00 2001 From: Benjamin Romer Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH] x86-64: update IO-APIC dest field to 8-bit for xAPIC On the Unisys ES7000/ONE system, we encountered a problem where performing a kexec reboot or dump on any cell other than cell 0 causes the system timer to stop working, resulting in a hang during timer calibration in the new kernel. We traced the problem to one line of code in disable_IO_APIC(), which needs to restore the timer's IO-APIC configuration before rebooting. The code is currently using the 4-bit physical destination field, rather than using the 8-bit logical destination field, and it cuts off the upper 4 bits of the timer's APIC ID. If we change this to use the logical destination field, the timer works and we can kexec on the upper cells. This was tested on two different cells (0 and 2) in an ES7000/ONE system. For reference, the relevant Intel xAPIC spec is kept at ftp://download.intel.com/design/chipsets/e8501/datashts/30962001.pdf, specifically on page 334. Signed-off-by: Benjamin M Romer Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: "Eric W. Biederman" Cc: Vivek Goyal Signed-off-by: Andrew Morton --- arch/x86_64/kernel/io_apic.c | 24 +++++++++++------------- include/asm-x86_64/io_apic.h | 14 ++------------ 2 files changed, 13 insertions(+), 25 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 6be6730acb5c..566e64d966c4 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -831,7 +831,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq) entry.delivery_mode = INT_DELIVERY_MODE; entry.dest_mode = INT_DEST_MODE; entry.mask = 0; /* enable IRQ */ - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.dest = cpu_mask_to_apicid(TARGET_CPUS); entry.trigger = irq_trigger(idx); entry.polarity = irq_polarity(idx); @@ -839,7 +839,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq) if (irq_trigger(idx)) { entry.trigger = 1; entry.mask = 1; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.dest = cpu_mask_to_apicid(TARGET_CPUS); } if (!apic && !IO_APIC_IRQ(irq)) @@ -851,7 +851,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq) if (vector < 0) return; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); + entry.dest = cpu_mask_to_apicid(mask); entry.vector = vector; ioapic_register_intr(irq, vector, IOAPIC_AUTO); @@ -920,7 +920,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in */ entry.dest_mode = INT_DEST_MODE; entry.mask = 0; /* unmask IRQ now */ - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.dest = cpu_mask_to_apicid(TARGET_CPUS); entry.delivery_mode = INT_DELIVERY_MODE; entry.polarity = 0; entry.trigger = 0; @@ -1020,18 +1020,17 @@ void __apicdebuginit print_IO_APIC(void) printk(KERN_DEBUG ".... IRQ redirection table:\n"); - printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" - " Stat Dest Deli Vect: \n"); + printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" + " Stat Dmod Deli Vect: \n"); for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; entry = ioapic_read_entry(apic, i); - printk(KERN_DEBUG " %02x %03X %02X ", + printk(KERN_DEBUG " %02x %03X ", i, - entry.dest.logical.logical_dest, - entry.dest.physical.physical_dest + entry.dest ); printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", @@ -1293,8 +1292,7 @@ void disable_IO_APIC(void) entry.dest_mode = 0; /* Physical */ entry.delivery_mode = dest_ExtINT; /* ExtInt */ entry.vector = 0; - entry.dest.physical.physical_dest = - GET_APIC_ID(apic_read(APIC_ID)); + entry.dest = GET_APIC_ID(apic_read(APIC_ID)); /* * Add it to the IO-APIC irq-routing table: @@ -1556,7 +1554,7 @@ static inline void unlock_ExtINT_logic(void) entry1.dest_mode = 0; /* physical delivery */ entry1.mask = 0; /* unmask IRQ now */ - entry1.dest.physical.physical_dest = hard_smp_processor_id(); + entry1.dest = hard_smp_processor_id(); entry1.delivery_mode = dest_ExtINT; entry1.polarity = entry0.polarity; entry1.trigger = 0; @@ -2131,7 +2129,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p entry.delivery_mode = INT_DELIVERY_MODE; entry.dest_mode = INT_DEST_MODE; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); + entry.dest = cpu_mask_to_apicid(mask); entry.trigger = triggering; entry.polarity = polarity; entry.mask = 1; /* Disabled (masked) */ diff --git a/include/asm-x86_64/io_apic.h b/include/asm-x86_64/io_apic.h index 561ecbfd4cb5..f4fb238c89f1 100644 --- a/include/asm-x86_64/io_apic.h +++ b/include/asm-x86_64/io_apic.h @@ -85,18 +85,8 @@ struct IO_APIC_route_entry { mask : 1, /* 0: enabled, 1: disabled */ __reserved_2 : 15; - union { struct { __u32 - __reserved_1 : 24, - physical_dest : 4, - __reserved_2 : 4; - } physical; - - struct { __u32 - __reserved_1 : 24, - logical_dest : 8; - } logical; - } dest; - + __u32 __reserved_3 : 24, + dest : 8; } __attribute__ ((packed)); /* -- cgit v1.2.3 From 3e94fb8f54c5305ed472e0867cd67d53e05bfb64 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH] x86-64: avoid warning message livelock I've seen my box paralyzed by an endless spew of rtc: lost some interrupts at 1024Hz. messages on the serial console. What seems to be happening is that something real causes an interrupt to be lost and triggers the message. But then printing the message to the serial console (from the hpet interrupt handler) takes more than 1/1024th of a second, and then some more interrupts are lost, so the message triggers again.... Fix this by adding a printk_ratelimit() before printing the warning. Signed-off-by: Roland Dreier Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/x86_64/kernel/time.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 9c7fba30dac1..3cc6886f1fb7 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -1226,8 +1226,9 @@ static void hpet_rtc_timer_reinit(void) if (PIE_on) PIE_count += lost_ints; - printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", - hpet_rtc_int_freq); + if (printk_ratelimit()) + printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", + hpet_rtc_int_freq); } } -- cgit v1.2.3 From 2fb12a9bca5ad9aa6dcd2c639b4a7656a8843ef8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH] x86-64: survive having no irq mapping for a vector Occasionally the kernel has bugs that result in no irq being found for a given cpu vector. If we acknowledge the irq the system has a good chance of continuing even though we dropped an irq message. If we continue to simply print a message and not acknowledge the irq the system is likely to become non-responsive shortly there after. AK: Fixed compilation for UP kernels Signed-off-by: Eric W. Biederman Signed-off-by: Andi Kleen Cc: "Luigi Genoni" Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/kernel/irq.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index 0c06af6c13bc..3bc30d2c13d3 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -18,6 +18,7 @@ #include #include #include +#include atomic_t irq_err_count; @@ -120,9 +121,14 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) if (likely(irq < NR_IRQS)) generic_handle_irq(irq); - else if (printk_ratelimit()) - printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n", - __func__, smp_processor_id(), vector); + else { + if (!disable_apic) + ack_APIC_irq(); + + if (printk_ratelimit()) + printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n", + __func__, smp_processor_id(), vector); + } irq_exit(); -- cgit v1.2.3 From f790cd30d002949a12623b2a8cec4d4e5a8887ef Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH] x86: Add new CPUID bits for AMD Family 10 CPUs in /proc/cpuinfo Just various new acronyms. The new popcnt bit is in the middle of Intel space. This looks a little weird, but I've been assured it's ok. Also I fixed RDTSCP for i386 which was at the wrong place. For i386 and x86-64. Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/proc.c | 14 +++++++++----- arch/x86_64/kernel/setup.c | 14 ++++++++++---- 2 files changed, 19 insertions(+), 9 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c index 6624d8583c42..47e3ebbfb28d 100644 --- a/arch/i386/kernel/cpu/proc.c +++ b/arch/i386/kernel/cpu/proc.c @@ -29,7 +29,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow", + NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", "3dnowext", "3dnow", /* Transmeta-defined */ "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, @@ -47,7 +47,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* Intel-defined (#2) */ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL, + NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* VIA/Cyrix/Centaur-defined */ @@ -57,8 +57,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", NULL, "cr8legacy", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8legacy", "abm", + "sse4a", "misalignsse", + "3dnowprefetch", "osvw", "ibs", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; @@ -69,8 +70,11 @@ static int show_cpuinfo(struct seq_file *m, void *v) "ttp", /* thermal trip */ "tm", "stc", + "100mhzsteps", + "hwpstate", NULL, - /* nothing */ /* constant_tsc - moved to flags */ + NULL, /* constant_tsc - moved to flags */ + /* nothing */ }; struct cpuinfo_x86 *c = v; int i, n = c - cpu_data; diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index e82c1a155af5..13daaf359d21 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -942,7 +942,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow", + NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", + "3dnowext", "3dnow", /* Transmeta-defined */ "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, @@ -960,7 +961,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* Intel-defined (#2) */ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL, + NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* VIA/Cyrix/Centaur-defined */ @@ -970,8 +971,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy", + "altmovcr8", "abm", "sse4a", + "misalignsse", "3dnowprefetch", + "osvw", "ibs", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; @@ -982,6 +985,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) "ttp", /* thermal trip */ "tm", "stc", + "100mhzsteps", + "hwpstate", + NULL, /* tsc invariant mapped to constant_tsc */ NULL, /* nothing */ /* constant_tsc - moved to flags */ }; -- cgit v1.2.3 From 0a4599c894d880763eec6cb93f6c246dac6c3269 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH] x86: Enable NMI watchdog for AMD Family 0x10 CPUs For i386/x86-64. Straight forward -- just reuse the Family 0xf code. Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 6 ++++-- arch/x86_64/kernel/nmi.c | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index b11abacc5cfd..5d8a07c20281 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -185,7 +185,8 @@ static __cpuinit inline int nmi_known_cpu(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); + return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6) + || (boot_cpu_data.x86 == 16)); case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return 1; @@ -817,7 +818,8 @@ void setup_apic_nmi_watchdog (void *unused) if (nmi_watchdog == NMI_LOCAL_APIC) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15) + if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && + boot_cpu_data.x86 != 16) return; if (!setup_k7_watchdog()) return; diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 269fe585e71e..486f4c61a948 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -172,7 +172,7 @@ static __cpuinit inline int nmi_known_cpu(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - return boot_cpu_data.x86 == 15; + return boot_cpu_data.x86 == 15 || boot_cpu_data.x86 == 16; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return 1; -- cgit v1.2.3 From 62cc49396e593dd71c6595302bb10b085aefbfa5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:26 +0100 Subject: [PATCH] x86: Unify pcspeaker platform device code between i386/x86-64 Trivial cleanup. Only change is that it is always compiled in now on x86-64 like on i386. Signed-off-by: Andi Kleen --- arch/i386/kernel/Makefile | 1 + arch/i386/kernel/pcspeaker.c | 20 ++++++++++++++++++++ arch/i386/kernel/setup.c | 26 -------------------------- arch/x86_64/kernel/Makefile | 2 ++ arch/x86_64/kernel/setup.c | 20 -------------------- 5 files changed, 23 insertions(+), 46 deletions(-) create mode 100644 arch/i386/kernel/pcspeaker.c (limited to 'arch/x86_64/kernel') diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 223969377de4..cbe4e601885c 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_VMI) += vmi.o vmitime.o obj-$(CONFIG_PARAVIRT) += paravirt.o +obj-y += pcspeaker.o EXTRA_AFLAGS := -traditional diff --git a/arch/i386/kernel/pcspeaker.c b/arch/i386/kernel/pcspeaker.c new file mode 100644 index 000000000000..bc1f2d3ea277 --- /dev/null +++ b/arch/i386/kernel/pcspeaker.c @@ -0,0 +1,20 @@ +#include +#include +#include + +static __init int add_pcspkr(void) +{ + struct platform_device *pd; + int ret; + + pd = platform_device_alloc("pcspkr", -1); + if (!pd) + return -ENOMEM; + + ret = platform_device_add(pd); + if (ret) + platform_device_put(pd); + + return ret; +} +device_initcall(add_pcspkr); diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index bd8c218d94af..122623dcc6e1 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -660,28 +659,3 @@ void __init setup_arch(char **cmdline_p) #endif tsc_init(); } - -static __init int add_pcspkr(void) -{ - struct platform_device *pd; - int ret; - - pd = platform_device_alloc("pcspkr", -1); - if (!pd) - return -ENOMEM; - - ret = platform_device_add(pd); - if (ret) - platform_device_put(pd); - - return ret; -} -device_initcall(add_pcspkr); - -/* - * Local Variables: - * mode:c - * c-file-style:"k&r" - * c-basic-offset:8 - * End: - */ diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 3c7cbff04d3d..ae399458024b 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_PCI) += early-quirks.o obj-y += topology.o obj-y += intel_cacheinfo.o +obj-y += pcspeaker.o CFLAGS_vsyscall.o := $(PROFILING) -g0 @@ -56,3 +57,4 @@ quirks-y += ../../i386/kernel/quirks.o i8237-y += ../../i386/kernel/i8237.o msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o alternative-y += ../../i386/kernel/alternative.o +pcspeaker-y += ../../i386/kernel/pcspeaker.o diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 13daaf359d21..3d98b696881d 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -1104,23 +1104,3 @@ struct seq_operations cpuinfo_op = { .stop = c_stop, .show = show_cpuinfo, }; - -#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE) -#include -static __init int add_pcspkr(void) -{ - struct platform_device *pd; - int ret; - - pd = platform_device_alloc("pcspkr", -1); - if (!pd) - return -ENOMEM; - - ret = platform_device_add(pd); - if (ret) - platform_device_put(pd); - - return ret; -} -device_initcall(add_pcspkr); -#endif -- cgit v1.2.3