From 0812a579c92fefa57506821fa08e90f47cb6dbdd Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 13 Feb 2007 13:26:19 +0100
Subject: [PATCH] x86-64: Add __copy_from_user_nocache

This does user copies in fs write() into the page cache with write combining.
This pushes the destination out of the CPU's cache, but allows higher bandwidth
in some case.

The theory is that the page cache data is usually not touched by the
CPU again and it's better to not pollute the cache with it. Also it is a little
faster.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/x8664_ksyms.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index 6d77e4797a47..23a7da312f30 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -26,6 +26,7 @@ EXPORT_SYMBOL(__put_user_4);
 EXPORT_SYMBOL(__put_user_8);
 
 EXPORT_SYMBOL(copy_user_generic);
+EXPORT_SYMBOL(__copy_user_nocache);
 EXPORT_SYMBOL(copy_from_user);
 EXPORT_SYMBOL(copy_to_user);
 EXPORT_SYMBOL(__copy_from_user_inatomic);
-- 
cgit v1.2.3


From 076422d2af7e3d8e72c6e70843f6ea377714b082 Mon Sep 17 00:00:00 2001
From: Amul Shah <amul.shah@unisys.com>
Date: Tue, 13 Feb 2007 13:26:19 +0100
Subject: [PATCH] x86-64: Allocate the NUMA hash function nodemap dynamically

Remove the statically allocated memory to NUMA node hash map in favor of a
dynamically allocated memory to node hash map (it is cache aligned).

This patch has the nice side effect in that it allows the hash map to grow
for systems with large amounts of memory (256GB - 1TB), but suffer from
having small PCI space tacked onto the boot node (which is somewhere
between 192MB to 512MB on the ES7000).

Signed-off-by: Amul Shah <amul.shah@unisys.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Rohit Seth <rohitseth@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 arch/x86_64/kernel/e820.c   |  7 +++++
 arch/x86_64/kernel/setup.c  |  5 +++
 arch/x86_64/mm/numa.c       | 74 +++++++++++++++++++++++++++++++++++++++------
 include/asm-x86_64/e820.h   |  1 +
 include/asm-x86_64/mmzone.h | 13 ++++----
 5 files changed, 85 insertions(+), 15 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 6fe191c58084..9d67955bbc31 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -83,6 +83,13 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
 		return 1;
 	}
 
+#ifdef CONFIG_NUMA
+	/* NUMA memory to node map */
+	if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
+		*addrp = nodemap_addr + nodemap_size;
+		return 1;
+	}
+#endif
 	/* XXX ramdisk image here? */ 
 	return 0;
 } 
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 60477244d1a3..f330f8285499 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -444,6 +444,11 @@ void __init setup_arch(char **cmdline_p)
 	/* reserve ebda region */
 	if (ebda_addr)
 		reserve_bootmem_generic(ebda_addr, ebda_size);
+#ifdef CONFIG_NUMA
+	/* reserve nodemap region */
+	if (nodemap_addr)
+		reserve_bootmem_generic(nodemap_addr, nodemap_size);
+#endif
 
 #ifdef CONFIG_SMP
 	/*
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 2ee2e003606c..7d9c428f4094 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -36,6 +36,8 @@ unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
 cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
 
 int numa_off __initdata;
+unsigned long __initdata nodemap_addr;
+unsigned long __initdata nodemap_size;
 
 
 /*
@@ -52,34 +54,87 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
 	int res = -1;
 	unsigned long addr, end;
 
-	if (shift >= 64)
-		return -1;
-	memset(memnodemap, 0xff, sizeof(memnodemap));
+	memset(memnodemap, 0xff, memnodemapsize);
 	for (i = 0; i < numnodes; i++) {
 		addr = nodes[i].start;
 		end = nodes[i].end;
 		if (addr >= end)
 			continue;
-		if ((end >> shift) >= NODEMAPSIZE)
+		if ((end >> shift) >= memnodemapsize)
 			return 0;
 		do {
 			if (memnodemap[addr >> shift] != 0xff)
 				return -1;
 			memnodemap[addr >> shift] = i;
-                       addr += (1UL << shift);
+			addr += (1UL << shift);
 		} while (addr < end);
 		res = 1;
 	} 
 	return res;
 }
 
-int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+static int __init allocate_cachealigned_memnodemap(void)
+{
+	unsigned long pad, pad_addr;
+
+	memnodemap = memnode.embedded_map;
+	if (memnodemapsize <= 48) {
+		printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
+		       nodemap_addr, nodemap_addr + nodemap_size);
+		return 0;
+	}
+
+	pad = L1_CACHE_BYTES - 1;
+	pad_addr = 0x8000;
+	nodemap_size = pad + memnodemapsize;
+	nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
+				      nodemap_size);
+	if (nodemap_addr == -1UL) {
+		printk(KERN_ERR
+		       "NUMA: Unable to allocate Memory to Node hash map\n");
+		nodemap_addr = nodemap_size = 0;
+		return -1;
+	}
+	pad_addr = (nodemap_addr + pad) & ~pad;
+	memnodemap = phys_to_virt(pad_addr);
+
+	printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
+	       nodemap_addr, nodemap_addr + nodemap_size);
+	return 0;
+}
+
+/*
+ * The LSB of all start and end addresses in the node map is the value of the
+ * maximum possible shift.
+ */
+static int __init
+extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
 {
-	int shift = 20;
+	int i;
+	unsigned long start, end;
+	unsigned long bitfield = 0, memtop = 0;
 
-	while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
-		shift++;
+	for (i = 0; i < numnodes; i++) {
+		start = nodes[i].start;
+		end = nodes[i].end;
+		if (start >= end)
+			continue;
+		bitfield |= start | end;
+		if (end > memtop)
+			memtop = end;
+	}
+	i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
+	memnodemapsize = (memtop >> i)+1;
+	return i;
+}
+
+int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+{
+	int shift;
 
+	shift = extract_lsb_from_nodes(nodes, numnodes);
+	if (allocate_cachealigned_memnodemap())
+		return -1;
 	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
 		shift);
 
@@ -290,6 +345,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 	       end_pfn << PAGE_SHIFT); 
 		/* setup dummy node covering all memory */ 
 	memnode_shift = 63; 
+	memnodemap = memnode.embedded_map;
 	memnodemap[0] = 0;
 	nodes_clear(node_online_map);
 	node_set_online(0);
diff --git a/include/asm-x86_64/e820.h b/include/asm-x86_64/e820.h
index fa2086774105..855fb4a454b6 100644
--- a/include/asm-x86_64/e820.h
+++ b/include/asm-x86_64/e820.h
@@ -56,6 +56,7 @@ extern void finish_e820_parsing(void);
 extern struct e820map e820;
 
 extern unsigned ebda_addr, ebda_size;
+extern unsigned long nodemap_addr, nodemap_size;
 #endif/*!__ASSEMBLY__*/
 
 #endif/*__E820_HEADER*/
diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
index c38ebdf6f426..39ef106986eb 100644
--- a/include/asm-x86_64/mmzone.h
+++ b/include/asm-x86_64/mmzone.h
@@ -11,24 +11,25 @@
 
 #include <asm/smp.h>
 
-/* Should really switch to dynamic allocation at some point */
-#define NODEMAPSIZE 0x4fff
-
 /* Simple perfect hash to map physical addresses to node numbers */
 struct memnode {
 	int shift;
-	u8 map[NODEMAPSIZE];
-} ____cacheline_aligned;
+	unsigned int mapsize;
+	u8 *map;
+	u8 embedded_map[64-16];
+} ____cacheline_aligned; /* total size = 64 bytes */
 extern struct memnode memnode;
 #define memnode_shift memnode.shift
 #define memnodemap memnode.map
+#define memnodemapsize memnode.mapsize
 
 extern struct pglist_data *node_data[];
 
 static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) 
 { 
 	unsigned nid; 
-	VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE);
+	VIRTUAL_BUG_ON(!memnodemap);
+	VIRTUAL_BUG_ON((addr >> memnode_shift) >= memnodemapsize);
 	nid = memnodemap[addr >> memnode_shift]; 
 	VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); 
 	return nid; 
-- 
cgit v1.2.3


From 5558870bfbcca10cfc7b13ab866687012ea3c9af Mon Sep 17 00:00:00 2001
From: Karsten Weiss <K.Weiss@science-computing.de>
Date: Tue, 13 Feb 2007 13:26:21 +0100
Subject: [PATCH] x86-64: improved iommu documentation

- add SWIOTLB config help text
- mention Documentation/x86_64/boot-options.txt in
  Documentation/kernel-parameters.txt
- remove the duplication of the iommu kernel parameter documentation.
- Better explanation of some of the iommu kernel parameter options.
- "32MB<<order" instead of "32MB^order".
- Mention the default "order" value.
- list the four existing PCI-DMA mapping implementations of arch x86_64
- group the iommu= option keywords by PCI-DMA mapping implementation.
- Distinguish iommu= option keywords from number arguments.
- Explain the meaning of DAC and SAC.

Signed-off-by: Karsten Weiss <knweiss@science-computing.de>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: Muli Ben-Yehuda <muli@il.ibm.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 Documentation/kernel-parameters.txt   |   3 +
 Documentation/x86_64/boot-options.txt | 109 +++++++++++++++++++++++-----------
 arch/x86_64/Kconfig                   |  10 +++-
 arch/x86_64/kernel/pci-dma.c          |  28 ++-------
 4 files changed, 90 insertions(+), 60 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index d25acd51e181..733a736bc6c8 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -104,6 +104,9 @@ loader, and have no meaning to the kernel directly.
 Do not modify the syntax of boot loader parameters without extreme
 need or coordination with <Documentation/i386/boot.txt>.
 
+There are also arch-specific kernel-parameters not documented here.
+See for example <Documentation/x86_64/boot-options.txt>.
+
 Note that ALL kernel parameters listed below are CASE SENSITIVE, and that
 a trailing = on the name of any parameter states that that parameter will
 be entered as an environment variable, whereas its absence indicates that
diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index 5c86ed6f0448..0d653993f361 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -180,40 +180,81 @@ PCI
   pci=lastbus=NUMBER	       Scan upto NUMBER busses, no matter what the mptable says.
   pci=noacpi		Don't use ACPI to set up PCI interrupt routing.
 
-IOMMU
-
- iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
-         [,forcesac][,fullflush][,nomerge][,noaperture][,calgary]
-   size  set size of iommu (in bytes)
-   noagp don't initialize the AGP driver and use full aperture.
-   off   don't use the IOMMU
-   leak  turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
-   memaper[=order] allocate an own aperture over RAM with size 32MB^order.
-   noforce don't force IOMMU usage. Default.
-   force  Force IOMMU.
-   merge  Do SG merging. Implies force (experimental)
-   nomerge Don't do SG merging.
-   forcesac For SAC mode for masks <40bits  (experimental)
-   fullflush Flush IOMMU on each allocation (default)
-   nofullflush Don't use IOMMU fullflush
-   allowed  overwrite iommu off workarounds for specific chipsets.
-   soft	 Use software bounce buffering (default for Intel machines)
-   noaperture Don't touch the aperture for AGP.
-   allowdac Allow DMA >4GB
-	    When off all DMA over >4GB is forced through an IOMMU or bounce
-	    buffering.
-   nodac    Forbid DMA >4GB
-   panic    Always panic when IOMMU overflows
-   calgary  Use the Calgary IOMMU if it is available
-
-  swiotlb=pages[,force]
-
-  pages  Prereserve that many 128K pages for the software IO bounce buffering.
-  force  Force all IO through the software TLB.
-
-  calgary=[64k,128k,256k,512k,1M,2M,4M,8M]
-  calgary=[translate_empty_slots]
-  calgary=[disable=<PCI bus number>]
+IOMMU (input/output memory management unit)
+
+ Currently four x86-64 PCI-DMA mapping implementations exist:
+
+   1. <arch/x86_64/kernel/pci-nommu.c>: use no hardware/software IOMMU at all
+      (e.g. because you have < 3 GB memory).
+      Kernel boot message: "PCI-DMA: Disabling IOMMU"
+
+   2. <arch/x86_64/kernel/pci-gart.c>: AMD GART based hardware IOMMU.
+      Kernel boot message: "PCI-DMA: using GART IOMMU"
+
+   3. <arch/x86_64/kernel/pci-swiotlb.c> : Software IOMMU implementation. Used
+      e.g. if there is no hardware IOMMU in the system and it is need because
+      you have >3GB memory or told the kernel to us it (iommu=soft))
+      Kernel boot message: "PCI-DMA: Using software bounce buffering
+      for IO (SWIOTLB)"
+
+   4. <arch/x86_64/pci-calgary.c> : IBM Calgary hardware IOMMU. Used in IBM
+      pSeries and xSeries servers. This hardware IOMMU supports DMA address
+      mapping with memory protection, etc.
+      Kernel boot message: "PCI-DMA: Using Calgary IOMMU"
+
+ iommu=[<size>][,noagp][,off][,force][,noforce][,leak[=<nr_of_leak_pages>]
+	[,memaper[=<order>]][,merge][,forcesac][,fullflush][,nomerge]
+	[,noaperture][,calgary]
+
+  General iommu options:
+    off                Don't initialize and use any kind of IOMMU.
+    noforce            Don't force hardware IOMMU usage when it is not needed.
+                       (default).
+    force              Force the use of the hardware IOMMU even when it is
+                       not actually needed (e.g. because < 3 GB memory).
+    soft               Use software bounce buffering (SWIOTLB) (default for
+                       Intel machines). This can be used to prevent the usage
+                       of an available hardware IOMMU.
+
+  iommu options only relevant to the AMD GART hardware IOMMU:
+    <size>             Set the size of the remapping area in bytes.
+    allowed            Overwrite iommu off workarounds for specific chipsets.
+    fullflush          Flush IOMMU on each allocation (default).
+    nofullflush        Don't use IOMMU fullflush.
+    leak               Turn on simple iommu leak tracing (only when
+                       CONFIG_IOMMU_LEAK is on). Default number of leak pages
+                       is 20.
+    memaper[=<order>]  Allocate an own aperture over RAM with size 32MB<<order.
+                       (default: order=1, i.e. 64MB)
+    merge              Do scather-gather (SG) merging. Implies "force"
+                       (experimental).
+    nomerge            Don't do scather-gather (SG) merging.
+    noaperture         Ask the IOMMU not to touch the aperture for AGP.
+    forcesac           Force single-address cycle (SAC) mode for masks <40bits
+                       (experimental).
+    noagp              Don't initialize the AGP driver and use full aperture.
+    allowdac           Allow double-address cycle (DAC) mode, i.e. DMA >4GB.
+                       DAC is used with 32-bit PCI to push a 64-bit address in
+                       two cycles. When off all DMA over >4GB is forced through
+                       an IOMMU or software bounce buffering.
+    nodac              Forbid DAC mode, i.e. DMA >4GB.
+    panic              Always panic when IOMMU overflows.
+    calgary            Use the Calgary IOMMU if it is available
+
+  iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU
+  implementation:
+    swiotlb=<pages>[,force]
+    <pages>            Prereserve that many 128K pages for the software IO
+                       bounce buffering.
+    force              Force all IO through the software TLB.
+
+  Settings for the IBM Calgary hardware IOMMU currently found in IBM
+  pSeries and xSeries machines:
+
+    calgary=[64k,128k,256k,512k,1M,2M,4M,8M]
+    calgary=[translate_empty_slots]
+    calgary=[disable=<PCI bus number>]
+    panic              Always panic when IOMMU overflows
 
     64k,...,8M - Set the size of each PCI slot's translation table
     when using the Calgary IOMMU. This is the size of the translation
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 02dd39457bcf..a55382a1bb46 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -458,8 +458,8 @@ config IOMMU
 	  on systems with more than 3GB. This is usually needed for USB,
 	  sound, many IDE/SATA chipsets and some other devices.
 	  Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART
-	  based IOMMU and a software bounce buffer based IOMMU used on Intel
-	  systems and as fallback.
+	  based hardware IOMMU and a software bounce buffer based IOMMU used
+	  on Intel systems and as fallback.
 	  The code is only active when needed (enough memory and limited
 	  device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified
 	  too.
@@ -496,6 +496,12 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
 # need this always selected by IOMMU for the VIA workaround
 config SWIOTLB
 	bool
+	help
+	  Support for software bounce buffers used on x86-64 systems
+	  which don't have a hardware IOMMU (e.g. the current generation
+	  of Intel's x86-64 CPUs). Using this PCI devices which can only
+	  access 32-bits of memory can be used on systems with more than
+	  3 GB of memory. If unsure, say Y.
 
 config X86_MCE
 	bool "Machine check support" if EMBEDDED
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index 683b7a5c1ab3..651ccfb06697 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -223,30 +223,10 @@ int dma_set_mask(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_set_mask);
 
-/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
-         [,forcesac][,fullflush][,nomerge][,biomerge]
-   size  set size of iommu (in bytes)
-   noagp don't initialize the AGP driver and use full aperture.
-   off   don't use the IOMMU
-   leak  turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
-   memaper[=order] allocate an own aperture over RAM with size 32MB^order.
-   noforce don't force IOMMU usage. Default.
-   force  Force IOMMU.
-   merge  Do lazy merging. This may improve performance on some block devices.
-          Implies force (experimental)
-   biomerge Do merging at the BIO layer. This is more efficient than merge,
-            but should be only done with very big IOMMUs. Implies merge,force.
-   nomerge Don't do SG merging.
-   forcesac For SAC mode for masks <40bits  (experimental)
-   fullflush Flush IOMMU on each allocation (default)
-   nofullflush Don't use IOMMU fullflush
-   allowed  overwrite iommu off workarounds for specific chipsets.
-   soft	 Use software bounce buffering (default for Intel machines)
-   noaperture Don't touch the aperture for AGP.
-   allowdac Allow DMA >4GB
-   nodac    Forbid DMA >4GB
-   panic    Force panic when IOMMU overflows
-*/
+/*
+ * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
+ * documentation.
+ */
 __init int iommu_setup(char *p)
 {
 	iommu_merge = 1;
-- 
cgit v1.2.3


From 006e84ee3a54e393ec6bef2a9bc891dc5bde2843 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Tue, 13 Feb 2007 13:26:21 +0100
Subject: [PATCH] x86-64: do not always end the stack trace with ULONG_MAX

It makes more sense to end the stack trace with ULONG_MAX only if
nr_entries < max_entries.  Otherwise, we lose one entry in the long stack
traces and cannot know whether the trace was complete or not.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 arch/x86_64/kernel/stacktrace.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c
index 6026b31d037e..65ac2c6b34a6 100644
--- a/arch/x86_64/kernel/stacktrace.c
+++ b/arch/x86_64/kernel/stacktrace.c
@@ -32,7 +32,7 @@ static void save_stack_address(void *data, unsigned long addr)
 		trace->skip--;
 		return;
 	}
-	if (trace->nr_entries < trace->max_entries - 1)
+	if (trace->nr_entries < trace->max_entries)
 		trace->entries[trace->nr_entries++] = addr;
 }
 
@@ -49,7 +49,8 @@ static struct stacktrace_ops save_stack_ops = {
 void save_stack_trace(struct stack_trace *trace, struct task_struct *task)
 {
 	dump_trace(task, NULL, NULL, &save_stack_ops, trace);
-	trace->entries[trace->nr_entries++] = ULONG_MAX;
+	if (trace->nr_entries < trace->max_entries)
+		trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
 EXPORT_SYMBOL(save_stack_trace);
 
-- 
cgit v1.2.3


From 53fee04f318222a3179ca5933d8bda82c1eef17a Mon Sep 17 00:00:00 2001
From: Rohit Seth <rohitseth@google.com>
Date: Tue, 13 Feb 2007 13:26:22 +0100
Subject: [PATCH] x86-64: Fix fake numa for x86_64 machines with big IO hole

This patch resolves the issue of running with numa=fake=X on kernel command
line on x86_64 machines that have big IO hole.  While calculating the size
of each node now we look at the total hole size in that range.

Previously there were nodes that only had IO holes in them causing kernel
boot problems.  We now use the NODE_MIN_SIZE (64MB) as the minimum size of
memory that any node must have.  We reduce the number of allocated nodes if
the number of nodes specified on kernel command line results in any node
getting memory smaller than NODE_MIN_SIZE.

This change allows the extra memory to be incremented in NODE_MIN_SIZE
granule and uniformly distribute among as many nodes (called big nodes) as
possible.

[akpm@osdl.org: build fix]
Signed-off-by: David Rientjes <reintjes@google.com>
Signed-off-by: Paul Menage <menage@google.com>
Signed-off-by: Rohit Seth <rohitseth@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 arch/x86_64/kernel/e820.c   |  31 +++++++++++++
 arch/x86_64/mm/numa.c       | 110 ++++++++++++++++++++++++++++++++++++++------
 include/asm-x86_64/e820.h   |   1 +
 include/asm-x86_64/mmzone.h |   5 ++
 4 files changed, 133 insertions(+), 14 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 9d67955bbc31..4651fd22b213 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -190,6 +190,37 @@ unsigned long __init e820_end_of_ram(void)
 	return end_pfn;	
 }
 
+/*
+ * Find the hole size in the range.
+ */
+unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
+{
+	unsigned long ram = 0;
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		unsigned long last, addr;
+
+		if (ei->type != E820_RAM ||
+		    ei->addr+ei->size <= start ||
+		    ei->addr >= end)
+			continue;
+
+		addr = round_up(ei->addr, PAGE_SIZE);
+		if (addr < start)
+			addr = start;
+
+		last = round_down(ei->addr + ei->size, PAGE_SIZE);
+		if (last >= end)
+			last = end;
+
+		if (last > addr)
+			ram += last - addr;
+	}
+	return ((end - start) - ram);
+}
+
 /*
  * Mark e820 reserved areas as busy for the resource manager.
  */
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 1ec16ea97519..d3f747dd61d3 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -272,31 +272,113 @@ void __init numa_init_array(void)
 }
 
 #ifdef CONFIG_NUMA_EMU
+/* Numa emulation */
 int numa_fake __initdata = 0;
 
-/* Numa emulation */
+/*
+ * This function is used to find out if the start and end correspond to
+ * different zones.
+ */
+int zone_cross_over(unsigned long start, unsigned long end)
+{
+	if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) &&
+			(end >= (MAX_DMA32_PFN << PAGE_SHIFT)))
+		return 1;
+	return 0;
+}
+
 static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
 {
- 	int i;
+ 	int i, big;
  	struct bootnode nodes[MAX_NUMNODES];
- 	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
+ 	unsigned long sz, old_sz;
+	unsigned long hole_size;
+	unsigned long start, end;
+	unsigned long max_addr = (end_pfn << PAGE_SHIFT);
+
+	start = (start_pfn << PAGE_SHIFT);
+	hole_size = e820_hole_size(start, max_addr);
+	sz = (max_addr - start - hole_size) / numa_fake;
 
  	/* Kludge needed for the hash function */
- 	if (hweight64(sz) > 1) {
- 		unsigned long x = 1;
- 		while ((x << 1) < sz)
- 			x <<= 1;
- 		if (x < sz/2)
- 			printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
- 		sz = x;
- 	}
 
+	old_sz = sz;
+	/*
+	 * Round down to the nearest FAKE_NODE_MIN_SIZE.
+	 */
+	sz &= FAKE_NODE_MIN_HASH_MASK;
+
+	/*
+	 * We ensure that each node is at least 64MB big.  Smaller than this
+	 * size can cause VM hiccups.
+	 */
+	if (sz == 0) {
+		printk(KERN_INFO "Not enough memory for %d nodes.  Reducing "
+				"the number of nodes\n", numa_fake);
+		numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE;
+		printk(KERN_INFO "Number of fake nodes will be = %d\n",
+				numa_fake);
+		sz = FAKE_NODE_MIN_SIZE;
+	}
+	/*
+	 * Find out how many nodes can get an extra NODE_MIN_SIZE granule.
+	 * This logic ensures the extra memory gets distributed among as many
+	 * nodes as possible (as compared to one single node getting all that
+	 * extra memory.
+	 */
+	big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;
+	printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: "
+			"%d\n",
+			(sz >> 20), (hole_size >> 20), big);
  	memset(&nodes,0,sizeof(nodes));
+	end = start;
  	for (i = 0; i < numa_fake; i++) {
- 		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
+		/*
+		 * In case we are not able to allocate enough memory for all
+		 * the nodes, we reduce the number of fake nodes.
+		 */
+		if (end >= max_addr) {
+			numa_fake = i - 1;
+			break;
+		}
+ 		start = nodes[i].start = end;
+		/*
+		 * Final node can have all the remaining memory.
+		 */
  		if (i == numa_fake-1)
- 			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
- 		nodes[i].end = nodes[i].start + sz;
+ 			sz = max_addr - start;
+ 		end = nodes[i].start + sz;
+		/*
+		 * Fir "big" number of nodes get extra granule.
+		 */
+		if (i < big)
+			end += FAKE_NODE_MIN_SIZE;
+		/*
+		 * Iterate over the range to ensure that this node gets at
+		 * least sz amount of RAM (excluding holes)
+		 */
+		while ((end - start - e820_hole_size(start, end)) < sz) {
+			end += FAKE_NODE_MIN_SIZE;
+			if (end >= max_addr)
+				break;
+		}
+		/*
+		 * Look at the next node to make sure there is some real memory
+		 * to map.  Bad things happen when the only memory present
+		 * in a zone on a fake node is IO hole.
+		 */
+		while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) {
+			if (zone_cross_over(start, end + sz)) {
+				end = (MAX_DMA32_PFN << PAGE_SHIFT);
+				break;
+			}
+			if (end >= max_addr)
+				break;
+			end += FAKE_NODE_MIN_SIZE;
+		}
+		if (end > max_addr)
+			end = max_addr;
+		nodes[i].end = end;
  		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
  		       i,
  		       nodes[i].start, nodes[i].end,
diff --git a/include/asm-x86_64/e820.h b/include/asm-x86_64/e820.h
index 855fb4a454b6..6216fa3f2802 100644
--- a/include/asm-x86_64/e820.h
+++ b/include/asm-x86_64/e820.h
@@ -46,6 +46,7 @@ extern void e820_mark_nosave_regions(void);
 extern void e820_print_map(char *who);
 extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
 extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
+extern unsigned long e820_hole_size(unsigned long start, unsigned long end);
 
 extern void e820_setup_gap(void);
 extern void e820_register_active_regions(int nid,
diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
index 39ef106986eb..fb558fb1d211 100644
--- a/include/asm-x86_64/mmzone.h
+++ b/include/asm-x86_64/mmzone.h
@@ -47,5 +47,10 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
 extern int pfn_valid(unsigned long pfn);
 #endif
 
+#ifdef CONFIG_NUMA_EMU
+#define FAKE_NODE_MIN_SIZE	(64*1024*1024)
+#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1ul))
+#endif
+
 #endif
 #endif
-- 
cgit v1.2.3


From c49c5330c9592f29a69bb2ea8f6e7fd5d9c151e8 Mon Sep 17 00:00:00 2001
From: Glauber de Oliveira Costa <gcosta@redhat.com>
Date: Tue, 13 Feb 2007 13:26:22 +0100
Subject: [PATCH] x86-64: Remove fastcall references in x86_64 code

Unlike x86, x86_64 already passes arguments in registers.  The use of
regparm attribute makes no difference in produced code, and the use of
fastcall just bloats the code.

Signed-off-by: Glauber de Oliveira Costa <gcosta@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 arch/x86_64/kernel/acpi/sleep.c  | 2 +-
 arch/x86_64/kernel/x8664_ksyms.c | 4 ++--
 include/asm-x86_64/hw_irq.h      | 2 +-
 include/asm-x86_64/mutex.h       | 6 +++---
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c
index 5ebf62c7a3d2..23178ce6c783 100644
--- a/arch/x86_64/kernel/acpi/sleep.c
+++ b/arch/x86_64/kernel/acpi/sleep.c
@@ -58,7 +58,7 @@ unsigned long acpi_wakeup_address = 0;
 unsigned long acpi_video_flags;
 extern char wakeup_start, wakeup_end;
 
-extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
+extern unsigned long acpi_copy_wakeup_routine(unsigned long);
 
 static pgd_t low_ptr;
 
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index 23a7da312f30..0dffae69f4ad 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -35,8 +35,8 @@ EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);
 
 #ifdef CONFIG_SMP
-extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
-extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
+extern void  __write_lock_failed(rwlock_t *rw);
+extern void  __read_lock_failed(rwlock_t *rw);
 EXPORT_SYMBOL(__write_lock_failed);
 EXPORT_SYMBOL(__read_lock_failed);
 #endif
diff --git a/include/asm-x86_64/hw_irq.h b/include/asm-x86_64/hw_irq.h
index 179cce755aa7..552df5f10a6d 100644
--- a/include/asm-x86_64/hw_irq.h
+++ b/include/asm-x86_64/hw_irq.h
@@ -91,7 +91,7 @@ extern void enable_8259A_irq(unsigned int irq);
 extern int i8259A_irq_pending(unsigned int irq);
 extern void make_8259A_irq(unsigned int irq);
 extern void init_8259A(int aeoi);
-extern void FASTCALL(send_IPI_self(int vector));
+extern void send_IPI_self(int vector);
 extern void init_VISWS_APIC_irqs(void);
 extern void setup_IO_APIC(void);
 extern void disable_IO_APIC(void);
diff --git a/include/asm-x86_64/mutex.h b/include/asm-x86_64/mutex.h
index 16396b1de3e4..6c2949a3c677 100644
--- a/include/asm-x86_64/mutex.h
+++ b/include/asm-x86_64/mutex.h
@@ -21,7 +21,7 @@ do {									\
 	unsigned long dummy;						\
 									\
 	typecheck(atomic_t *, v);					\
-	typecheck_fn(fastcall void (*)(atomic_t *), fail_fn);		\
+	typecheck_fn(void (*)(atomic_t *), fail_fn);			\
 									\
 	__asm__ __volatile__(						\
 		LOCK_PREFIX "   decl (%%rdi)	\n"			\
@@ -47,7 +47,7 @@ do {									\
  */
 static inline int
 __mutex_fastpath_lock_retval(atomic_t *count,
-			     int fastcall (*fail_fn)(atomic_t *))
+			     int (*fail_fn)(atomic_t *))
 {
 	if (unlikely(atomic_dec_return(count) < 0))
 		return fail_fn(count);
@@ -67,7 +67,7 @@ do {									\
 	unsigned long dummy;						\
 									\
 	typecheck(atomic_t *, v);					\
-	typecheck_fn(fastcall void (*)(atomic_t *), fail_fn);		\
+	typecheck_fn(void (*)(atomic_t *), fail_fn);			\
 									\
 	__asm__ __volatile__(						\
 		LOCK_PREFIX "   incl (%%rdi)	\n"			\
-- 
cgit v1.2.3


From 4c3cbf75b262433afc90b5c35510d1e5744d3b94 Mon Sep 17 00:00:00 2001
From: Glauber de Oliveira Costa <gcosta@redhat.com>
Date: Tue, 13 Feb 2007 13:26:22 +0100
Subject: [PATCH] x86-64: Use constant instead of raw number in x86_64 ioperm.c

This is a tiny cleanup to increase readability

Signed-off-by: Glauber de Oliveira Costa <gcosta@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 arch/x86_64/kernel/ioport.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c
index fe063d3cfe42..745b1f0f494e 100644
--- a/arch/x86_64/kernel/ioport.c
+++ b/arch/x86_64/kernel/ioport.c
@@ -114,6 +114,6 @@ asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
 		if (!capable(CAP_SYS_RAWIO))
 			return -EPERM;
 	}
-	regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12);
+	regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12);
 	return 0;
 }
-- 
cgit v1.2.3


From 1676193937a538fdb92a2916a86a705093cfd613 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Date: Tue, 13 Feb 2007 13:26:22 +0100
Subject: [PATCH] x86-64: Handle 32 bit PerfMon Counter writes cleanly in
 x86_64 nmi_watchdog

P6 CPUs and Core/Core 2 CPUs which has 'architectural perf mon' feature,
only supports write of low 32 bits in Performance Monitoring Counters.
Bits 32..39 are sign extended based on bit 31 and bits 40..63 are reserved
and should be zero.

This patch:

Change x86_64 nmi handler to handle this case cleanly.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/nmi.c | 46 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 14 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 9cb42ecb7f89..e59cda134166 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -214,6 +214,23 @@ static __init void nmi_cpu_busy(void *data)
 }
 #endif
 
+static unsigned int adjust_for_32bit_ctr(unsigned int hz)
+{
+	unsigned int retval = hz;
+
+	/*
+	 * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
+	 * are writable, with higher bits sign extending from bit 31.
+	 * So, we can only program the counter with 31 bit values and
+	 * 32nd bit should be 1, for 33.. to be 1.
+	 * Find the appropriate nmi_hz
+	 */
+ 	if ((((u64)cpu_khz * 1000) / retval) > 0x7fffffffULL) {
+		retval = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1;
+	}
+	return retval;
+}
+
 int __init check_nmi_watchdog (void)
 {
 	int *counts;
@@ -268,17 +285,8 @@ int __init check_nmi_watchdog (void)
 		struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
 
 		nmi_hz = 1;
-		/*
-		 * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
-		 * are writable, with higher bits sign extending from bit 31.
-		 * So, we can only program the counter with 31 bit values and
-		 * 32nd bit should be 1, for 33.. to be 1.
-		 * Find the appropriate nmi_hz
-		 */
-	 	if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
-			((u64)cpu_khz * 1000) > 0x7fffffffULL) {
-			nmi_hz = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1;
-		}
+	 	if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0)
+			nmi_hz = adjust_for_32bit_ctr(nmi_hz);
 	}
 
 	kfree(counts);
@@ -634,7 +642,9 @@ static int setup_intel_arch_watchdog(void)
 
 	/* setup the timer */
 	wrmsr(evntsel_msr, evntsel, 0);
-	wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
+
+	nmi_hz = adjust_for_32bit_ctr(nmi_hz);
+	wrmsr(perfctr_msr, (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0);
 
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 	evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
@@ -855,15 +865,23 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 				dummy &= ~P4_CCCR_OVF;
 	 			wrmsrl(wd->cccr_msr, dummy);
 	 			apic_write(APIC_LVTPC, APIC_DM_NMI);
+				/* start the cycle over again */
+				wrmsrl(wd->perfctr_msr,
+				       -((u64)cpu_khz * 1000 / nmi_hz));
 	 		} else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
 				/*
 				 * ArchPerfom/Core Duo needs to re-unmask
 				 * the apic vector
 				 */
 				apic_write(APIC_LVTPC, APIC_DM_NMI);
+				/* ARCH_PERFMON has 32 bit counter writes */
+				wrmsr(wd->perfctr_msr,
+				     (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0);
+			} else {
+				/* start the cycle over again */
+				wrmsrl(wd->perfctr_msr,
+				       -((u64)cpu_khz * 1000 / nmi_hz));
 			}
-			/* start the cycle over again */
-			wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
 			rc = 1;
 		} else 	if (nmi_watchdog == NMI_IO_APIC) {
 			/* don't know how to accurately check for this.
-- 
cgit v1.2.3


From 24ce0e96f2dea558762c994d054ea2f3c01fa95a Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Tue, 13 Feb 2007 13:26:23 +0100
Subject: [PATCH] x86-64: Tighten mce_amd driver MSR reads

while debugging an unrelated problem in Xen, I noticed odd reads from
non-existent MSRs. Having now found time to look why these happen, I
came up with below patch, which
- prevents accessing MCi_MISCj with j > 0 when the block pointer in
MCi_MISC0 is zero
- accesses only contiguous MCi_MISCj until a non-implemented one is
found
- doesn't touch unimplemented blocks in mce_threshold_interrupt at all
- gives names to two bits previously derived from MASK_VALID_HI (it
took me some time to understand the code without this)

The first three items, besides being apparently closer to the spec, should
namely help cutting down on the time mce_threshold_interrupt() takes.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/mce_amd.c | 40 +++++++++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 15 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index 93c707257637..cd8dbe57b33a 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -37,6 +37,8 @@
 #define THRESHOLD_MAX     0xFFF
 #define INT_TYPE_APIC     0x00020000
 #define MASK_VALID_HI     0x80000000
+#define MASK_CNTP_HI      0x40000000
+#define MASK_LOCKED_HI    0x20000000
 #define MASK_LVTOFF_HI    0x00F00000
 #define MASK_COUNT_EN_HI  0x00080000
 #define MASK_INT_TYPE_HI  0x00060000
@@ -122,14 +124,17 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
 		for (block = 0; block < NR_BLOCKS; ++block) {
 			if (block == 0)
 				address = MSR_IA32_MC0_MISC + bank * 4;
-			else if (block == 1)
-				address = MCG_XBLK_ADDR
-					+ ((low & MASK_BLKPTR_LO) >> 21);
+			else if (block == 1) {
+				address = (low & MASK_BLKPTR_LO) >> 21;
+				if (!address)
+					break;
+				address += MCG_XBLK_ADDR;
+			}
 			else
 				++address;
 
 			if (rdmsr_safe(address, &low, &high))
-				continue;
+				break;
 
 			if (!(high & MASK_VALID_HI)) {
 				if (block)
@@ -138,8 +143,8 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
 					break;
 			}
 
-			if (!(high & MASK_VALID_HI >> 1)  ||
-			     (high & MASK_VALID_HI >> 2))
+			if (!(high & MASK_CNTP_HI)  ||
+			     (high & MASK_LOCKED_HI))
 				continue;
 
 			if (!block)
@@ -187,17 +192,22 @@ asmlinkage void mce_threshold_interrupt(void)
 
 	/* assume first bank caused it */
 	for (bank = 0; bank < NR_BANKS; ++bank) {
+		if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
+			continue;
 		for (block = 0; block < NR_BLOCKS; ++block) {
 			if (block == 0)
 				address = MSR_IA32_MC0_MISC + bank * 4;
-			else if (block == 1)
-				address = MCG_XBLK_ADDR
-					+ ((low & MASK_BLKPTR_LO) >> 21);
+			else if (block == 1) {
+				address = (low & MASK_BLKPTR_LO) >> 21;
+				if (!address)
+					break;
+				address += MCG_XBLK_ADDR;
+			}
 			else
 				++address;
 
 			if (rdmsr_safe(address, &low, &high))
-				continue;
+				break;
 
 			if (!(high & MASK_VALID_HI)) {
 				if (block)
@@ -206,8 +216,8 @@ asmlinkage void mce_threshold_interrupt(void)
 					break;
 			}
 
-			if (!(high & MASK_VALID_HI >> 1)  ||
-			     (high & MASK_VALID_HI >> 2))
+			if (!(high & MASK_CNTP_HI)  ||
+			     (high & MASK_LOCKED_HI))
 				continue;
 
 			if (high & MASK_OVERFLOW_HI) {
@@ -385,7 +395,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
 		return 0;
 
 	if (rdmsr_safe(address, &low, &high))
-		goto recurse;
+		return 0;
 
 	if (!(high & MASK_VALID_HI)) {
 		if (block)
@@ -394,8 +404,8 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
 			return 0;
 	}
 
-	if (!(high & MASK_VALID_HI >> 1)  ||
-	     (high & MASK_VALID_HI >> 2))
+	if (!(high & MASK_CNTP_HI)  ||
+	     (high & MASK_LOCKED_HI))
 		goto recurse;
 
 	b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
-- 
cgit v1.2.3


From a98f0dd34d94ea0b5f3816196bea5dba467827bb Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 13 Feb 2007 13:26:23 +0100
Subject: [PATCH] x86-64: Allow to run a program when a machine check event is
 detected

When a machine check event is detected (including a AMD RevF threshold
overflow event) allow to run a "trigger" program. This allows user space
to react to such events sooner.

The trigger is configured using a new trigger entry in the
machinecheck sysfs interface. It is currently shared between
all CPUs.

I also fixed the AMD threshold handler to run the machine
check polling code immediately to actually log any events
that might have caused the threshold interrupt.

Also added some documentation for the mce sysfs interface.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 Documentation/x86_64/machinecheck | 70 +++++++++++++++++++++++++++++++++++++++
 arch/x86_64/kernel/mce.c          | 66 +++++++++++++++++++++++++++++-------
 arch/x86_64/kernel/mce_amd.c      |  4 +++
 include/asm-x86_64/mce.h          |  2 ++
 kernel/kmod.c                     | 44 ++++++++++++++++--------
 5 files changed, 160 insertions(+), 26 deletions(-)
 create mode 100644 Documentation/x86_64/machinecheck

(limited to 'arch/x86_64/kernel')

diff --git a/Documentation/x86_64/machinecheck b/Documentation/x86_64/machinecheck
new file mode 100644
index 000000000000..068a6d9904b9
--- /dev/null
+++ b/Documentation/x86_64/machinecheck
@@ -0,0 +1,70 @@
+
+Configurable sysfs parameters for the x86-64 machine check code.
+
+Machine checks report internal hardware error conditions detected
+by the CPU. Uncorrected errors typically cause a machine check
+(often with panic), corrected ones cause a machine check log entry.
+
+Machine checks are organized in banks (normally associated with
+a hardware subsystem) and subevents in a bank. The exact meaning
+of the banks and subevent is CPU specific.
+
+mcelog knows how to decode them.
+
+When you see the "Machine check errors logged" message in the system
+log then mcelog should run to collect and decode machine check entries
+from /dev/mcelog. Normally mcelog should be run regularly from a cronjob.
+
+Each CPU has a directory in /sys/devices/system/machinecheck/machinecheckN
+(N = CPU number)
+
+The directory contains some configurable entries:
+
+Entries:
+
+bankNctl
+(N bank number)
+	64bit Hex bitmask enabling/disabling specific subevents for bank N
+	When a bit in the bitmask is zero then the respective
+	subevent will not be reported.
+	By default all events are enabled.
+	Note that BIOS maintain another mask to disable specific events
+	per bank.  This is not visible here
+
+The following entries appear for each CPU, but they are truly shared
+between all CPUs.
+
+check_interval
+	How often to poll for corrected machine check errors, in seconds
+	(Note output is hexademical). Default 5 minutes.
+
+tolerant
+	Tolerance level. When a machine check exception occurs for a non
+	corrected machine check the kernel can take different actions.
+	Since machine check exceptions can happen any time it is sometimes
+	risky for the kernel to kill a process because it defies
+	normal kernel locking rules. The tolerance level configures
+	how hard the kernel tries to recover even at some risk of deadlock.
+
+	0: always panic,
+	1: panic if deadlock possible,
+	2: try to avoid panic,
+   	3: never panic or exit (for testing only)
+
+	Default: 1
+
+	Note this only makes a difference if the CPU allows recovery
+	from a machine check exception. Current x86 CPUs generally do not.
+
+trigger
+	Program to run when a machine check event is detected.
+	This is an alternative to running mcelog regularly from cron
+	and allows to detect events faster.
+
+TBD document entries for AMD threshold interrupt configuration
+
+For more details about the x86 machine check architecture
+see the Intel and AMD architecture manuals from their developer websites.
+
+For more details about the architecture see
+see http://one.firstfloor.org/~andi/mce.pdf
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index bdb54a2c9f18..8011a8e1c7d4 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -19,6 +19,7 @@
 #include <linux/cpu.h>
 #include <linux/percpu.h>
 #include <linux/ctype.h>
+#include <linux/kmod.h>
 #include <asm/processor.h> 
 #include <asm/msr.h>
 #include <asm/mce.h>
@@ -42,6 +43,10 @@ static unsigned long console_logged;
 static int notify_user;
 static int rip_msr;
 static int mce_bootlog = 1;
+static atomic_t mce_events;
+
+static char trigger[128];
+static char *trigger_argv[2] = { trigger, NULL };
 
 /*
  * Lockless MCE logging infrastructure.
@@ -57,6 +62,7 @@ struct mce_log mcelog = {
 void mce_log(struct mce *mce)
 {
 	unsigned next, entry;
+	atomic_inc(&mce_events);
 	mce->finished = 0;
 	wmb();
 	for (;;) {
@@ -161,6 +167,17 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 	}
 }
 
+static void do_mce_trigger(void)
+{
+	static atomic_t mce_logged;
+	int events = atomic_read(&mce_events);
+	if (events != atomic_read(&mce_logged) && trigger[0]) {
+		/* Small race window, but should be harmless.  */
+		atomic_set(&mce_logged, events);
+		call_usermodehelper(trigger, trigger_argv, NULL, -1);
+	}
+}
+
 /* 
  * The actual machine check handler
  */
@@ -234,8 +251,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	}
 
 	/* Never do anything final in the polling timer */
-	if (!regs)
+	if (!regs) {
+		/* Normal interrupt context here. Call trigger for any new
+		   events. */
+		do_mce_trigger();
 		goto out;
+	}
 
 	/* If we didn't find an uncorrectable error, pick
 	   the last one (shouldn't happen, just being safe). */
@@ -606,17 +627,42 @@ DEFINE_PER_CPU(struct sys_device, device_mce);
 	}									   \
 	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 
+/* TBD should generate these dynamically based on number of available banks */
 ACCESSOR(bank0ctl,bank[0],mce_restart())
 ACCESSOR(bank1ctl,bank[1],mce_restart())
 ACCESSOR(bank2ctl,bank[2],mce_restart())
 ACCESSOR(bank3ctl,bank[3],mce_restart())
 ACCESSOR(bank4ctl,bank[4],mce_restart())
 ACCESSOR(bank5ctl,bank[5],mce_restart())
-static struct sysdev_attribute * bank_attributes[NR_BANKS] = {
-	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
-	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl};
+
+static ssize_t show_trigger(struct sys_device *s, char *buf)
+{
+	strcpy(buf, trigger);
+	strcat(buf, "\n");
+	return strlen(trigger) + 1;
+}
+
+static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
+{
+	char *p;
+	int len;
+	strncpy(trigger, buf, sizeof(trigger));
+	trigger[sizeof(trigger)-1] = 0;
+	len = strlen(trigger);
+	p = strchr(trigger, '\n');
+	if (*p) *p = 0;
+	return len;
+}
+
+static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 ACCESSOR(tolerant,tolerant,)
 ACCESSOR(check_interval,check_interval,mce_restart())
+static struct sysdev_attribute *mce_attributes[] = {
+	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
+	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
+	&attr_tolerant, &attr_check_interval, &attr_trigger,
+	NULL
+};
 
 /* Per cpu sysdev init.  All of the cpus still share the same ctl bank */
 static __cpuinit int mce_create_device(unsigned int cpu)
@@ -632,11 +678,9 @@ static __cpuinit int mce_create_device(unsigned int cpu)
 	err = sysdev_register(&per_cpu(device_mce,cpu));
 
 	if (!err) {
-		for (i = 0; i < banks; i++)
+		for (i = 0; mce_attributes[i]; i++)
 			sysdev_create_file(&per_cpu(device_mce,cpu),
-				bank_attributes[i]);
-		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
-		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
+				mce_attributes[i]);
 	}
 	return err;
 }
@@ -645,11 +689,9 @@ static void mce_remove_device(unsigned int cpu)
 {
 	int i;
 
-	for (i = 0; i < banks; i++)
+	for (i = 0; mce_attributes[i]; i++)
 		sysdev_remove_file(&per_cpu(device_mce,cpu),
-			bank_attributes[i]);
-	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
-	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
+			mce_attributes[i]);
 	sysdev_unregister(&per_cpu(device_mce,cpu));
 	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
 }
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index cd8dbe57b33a..d0bd5d66e103 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -220,6 +220,10 @@ asmlinkage void mce_threshold_interrupt(void)
 			     (high & MASK_LOCKED_HI))
 				continue;
 
+			/* Log the machine check that caused the threshold
+			   event. */
+			do_machine_check(NULL, 0);
+
 			if (high & MASK_OVERFLOW_HI) {
 				rdmsrl(address, m.misc);
 				rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
diff --git a/include/asm-x86_64/mce.h b/include/asm-x86_64/mce.h
index 5a11146d6d9c..177e92b4019b 100644
--- a/include/asm-x86_64/mce.h
+++ b/include/asm-x86_64/mce.h
@@ -103,6 +103,8 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
 
 extern atomic_t mce_entry;
 
+extern void do_machine_check(struct pt_regs *, long);
+
 #endif
 
 #endif
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 3a7379aa31ca..796276141e51 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -217,7 +217,10 @@ static int wait_for_helper(void *data)
 			sub_info->retval = ret;
 	}
 
-	complete(sub_info->complete);
+	if (sub_info->wait < 0)
+		kfree(sub_info);
+	else
+		complete(sub_info->complete);
 	return 0;
 }
 
@@ -239,6 +242,9 @@ static void __call_usermodehelper(struct work_struct *work)
 		pid = kernel_thread(____call_usermodehelper, sub_info,
 				    CLONE_VFORK | SIGCHLD);
 
+	if (wait < 0)
+		return;
+
 	if (pid < 0) {
 		sub_info->retval = pid;
 		complete(sub_info->complete);
@@ -253,6 +259,9 @@ static void __call_usermodehelper(struct work_struct *work)
  * @envp: null-terminated environment list
  * @session_keyring: session keyring for process (NULL for an empty keyring)
  * @wait: wait for the application to finish and return status.
+ *        when -1 don't wait at all, but you get no useful error back when
+ *        the program couldn't be exec'ed. This makes it safe to call
+ *        from interrupt context.
  *
  * Runs a user-space application.  The application is started
  * asynchronously if wait is not set, and runs as a child of keventd.
@@ -265,17 +274,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
 			     struct key *session_keyring, int wait)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
-	struct subprocess_info sub_info = {
-		.work		= __WORK_INITIALIZER(sub_info.work,
-						     __call_usermodehelper),
-		.complete	= &done,
-		.path		= path,
-		.argv		= argv,
-		.envp		= envp,
-		.ring		= session_keyring,
-		.wait		= wait,
-		.retval		= 0,
-	};
+	struct subprocess_info *sub_info;
+	int retval;
 
 	if (!khelper_wq)
 		return -EBUSY;
@@ -283,9 +283,25 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
 	if (path[0] == '\0')
 		return 0;
 
-	queue_work(khelper_wq, &sub_info.work);
+	sub_info = kzalloc(sizeof(struct subprocess_info),  GFP_ATOMIC);
+	if (!sub_info)
+		return -ENOMEM;
+
+	INIT_WORK(&sub_info->work, __call_usermodehelper);
+	sub_info->complete = &done;
+	sub_info->path = path;
+	sub_info->argv = argv;
+	sub_info->envp = envp;
+	sub_info->ring = session_keyring;
+	sub_info->wait = wait;
+
+	queue_work(khelper_wq, &sub_info->work);
+	if (wait < 0) /* task has freed sub_info */
+		return 0;
 	wait_for_completion(&done);
-	return sub_info.retval;
+	retval = sub_info->retval;
+	kfree(sub_info);
+	return retval;
 }
 EXPORT_SYMBOL(call_usermodehelper_keys);
 
-- 
cgit v1.2.3


From 2f7a2a79c3ebb44f8b1b7d9b4fd3a650eb69e544 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Tue, 13 Feb 2007 13:26:24 +0100
Subject: [PATCH] x86-64: - Ignore long SMI interrupts in clock calibration
 code - update 1

Add failsafe mechanism to HPET/TSC clock calibration.

	Signed-off-by: Jack Steiner <steiner@sgi.com>

Updated to include failsafe mechanism & additional community feedback.
Patch built on latest 2.6.20-rc4-mm1 tree.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/time.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 335cc91c49b7..9c7fba30dac1 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -657,6 +657,7 @@ core_initcall(cpufreq_tsc);
 
 #define TICK_COUNT 100000000
 #define TICK_MIN   5000
+#define MAX_READ_RETRIES 5
 
 /*
  * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none
@@ -664,13 +665,17 @@ core_initcall(cpufreq_tsc);
  */
 static void __init read_hpet_tsc(int *hpet, int *tsc)
 {
-	int tsc1, tsc2, hpet1;
+	int tsc1, tsc2, hpet1, retries = 0;
+	static int msg;
 
 	do {
 		tsc1 = get_cycles_sync();
 		hpet1 = hpet_readl(HPET_COUNTER);
 		tsc2 = get_cycles_sync();
-	} while (tsc2 - tsc1 > TICK_MIN);
+	} while (tsc2 - tsc1 > TICK_MIN && retries++ < MAX_READ_RETRIES);
+	if (retries >= MAX_READ_RETRIES && !msg++)
+		printk(KERN_WARNING
+		       "hpet.c: exceeded max retries to read HPET & TSC\n");
 	*hpet = hpet1;
 	*tsc = tsc2;
 }
-- 
cgit v1.2.3


From f49481bc50fce428521497977861b8115666dbe7 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 13 Feb 2007 13:26:24 +0100
Subject: [PATCH] x86-64: Check return value of putreg in PTRACE_SETREGS

This means if an illegal value is set for the segment registers there
ptrace will error out now with an errno instead of silently ignoring
it.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/ptrace.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index addc14af0c56..4326a690a509 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -536,8 +536,12 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		}
 		ret = 0;
 		for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
-			ret |= __get_user(tmp, (unsigned long __user *) data);
-			putreg(child, ui, tmp);
+			ret = __get_user(tmp, (unsigned long __user *) data);
+			if (ret)
+				break;
+			ret = putreg(child, ui, tmp);
+			if (ret)
+				break;
 			data += sizeof(long);
 		}
 		break;
-- 
cgit v1.2.3


From 9a11ff68273f440b1d33fcc4d550ffc881e6a0b4 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 13 Feb 2007 13:26:24 +0100
Subject: [PATCH] x86-64: Unexport __supported_pte_mask

The symbol is needed to manipulate page tables, and modules shouldn't
do that.

Leftover from 2.4, but no in tree module should need it now.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/setup64.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 8c4b80fe71a1..6a70b55f719d 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -37,7 +37,6 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
 char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
 
 unsigned long __supported_pte_mask __read_mostly = ~0UL;
-EXPORT_SYMBOL(__supported_pte_mask);
 static int do_not_nx __cpuinitdata = 0;
 
 /* noexec=on|off
-- 
cgit v1.2.3


From ffb6017563aa15f9a8cff9a30b861d42c2695894 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Tue, 13 Feb 2007 13:26:24 +0100
Subject: [PATCH] x86-64: x86_64 - Fix FS/GS registers for VT execution

Initialize FS and GS to __KERNEL_DS as well.  The actual value of them is not
important, but it is important to reload them in protected mode.  At this time,
they still retain the real mode values from initial boot.  VT disallows
execution of code under such conditions, which means hardware virtualization
can not be used to boot the kernel on Intel platforms, making the boot time
painfully slow.

This requires moving the GS load before the load of GS_BASE, so just move
all the segments loads there to keep them together in the code.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/head.S | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 1e6f80870679..598a4d0351fc 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -163,6 +163,20 @@ startup_64:
 	 */
 	lgdt	cpu_gdt_descr
 
+	/* set up data segments. actually 0 would do too */
+	movl $__KERNEL_DS,%eax
+	movl %eax,%ds
+	movl %eax,%ss
+	movl %eax,%es
+
+	/*
+	 * We don't really need to load %fs or %gs, but load them anyway
+	 * to kill any stale realmode selectors.  This allows execution
+	 * under VT hardware.
+	 */
+	movl %eax,%fs
+	movl %eax,%gs
+
 	/* 
 	 * Setup up a dummy PDA. this is just for some early bootup code
 	 * that does in_interrupt() 
@@ -173,12 +187,6 @@ startup_64:
 	shrq	$32,%rdx
 	wrmsr	
 
-	/* set up data segments. actually 0 would do too */
-	movl $__KERNEL_DS,%eax
-	movl %eax,%ds	
-	movl %eax,%ss
-	movl %eax,%es
-			
 	/* esi is pointer to real mode structure with interesting info.
 	   pass it to C */
 	movl	%esi, %edi
-- 
cgit v1.2.3


From 00edefae050c2c2d1e26fa9984f8f529fbc45989 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 13 Feb 2007 13:26:24 +0100
Subject: [PATCH] x86-64: Fix off by one error in IOMMU boundary checking

Should be harmless because there is normally no memory there, but
technically it was incorrect.

Pointed out by Leo Duran

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/pci-gart.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index fc1960f1f243..030eb3753358 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -185,7 +185,7 @@ static void iommu_full(struct device *dev, size_t size, int dir)
 static inline int need_iommu(struct device *dev, unsigned long addr, size_t size)
 { 
 	u64 mask = *dev->dma_mask;
-	int high = addr + size >= mask;
+	int high = addr + size > mask;
 	int mmu = high;
 	if (force_iommu) 
 		mmu = 1; 
@@ -195,7 +195,7 @@ static inline int need_iommu(struct device *dev, unsigned long addr, size_t size
 static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
 { 
 	u64 mask = *dev->dma_mask;
-	int high = addr + size >= mask;
+	int high = addr + size > mask;
 	int mmu = high;
 	return mmu; 
 }
-- 
cgit v1.2.3


From fc986db4fc1e773e240a19bc8b407ead88982cea Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 13 Feb 2007 13:26:24 +0100
Subject: [PATCH] x86-64: Don't reserve ROMs

We trust the e820 table, so explicitely reserving ROMs shouldn't
be needed.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/setup.c | 130 +--------------------------------------------
 1 file changed, 2 insertions(+), 128 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index f330f8285499..e82c1a155af5 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -138,128 +138,6 @@ struct resource code_resource = {
 	.flags = IORESOURCE_RAM,
 };
 
-#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)
-
-static struct resource system_rom_resource = {
-	.name = "System ROM",
-	.start = 0xf0000,
-	.end = 0xfffff,
-	.flags = IORESOURCE_ROM,
-};
-
-static struct resource extension_rom_resource = {
-	.name = "Extension ROM",
-	.start = 0xe0000,
-	.end = 0xeffff,
-	.flags = IORESOURCE_ROM,
-};
-
-static struct resource adapter_rom_resources[] = {
-	{ .name = "Adapter ROM", .start = 0xc8000, .end = 0,
-		.flags = IORESOURCE_ROM },
-	{ .name = "Adapter ROM", .start = 0, .end = 0,
-		.flags = IORESOURCE_ROM },
-	{ .name = "Adapter ROM", .start = 0, .end = 0,
-		.flags = IORESOURCE_ROM },
-	{ .name = "Adapter ROM", .start = 0, .end = 0,
-		.flags = IORESOURCE_ROM },
-	{ .name = "Adapter ROM", .start = 0, .end = 0,
-		.flags = IORESOURCE_ROM },
-	{ .name = "Adapter ROM", .start = 0, .end = 0,
-		.flags = IORESOURCE_ROM }
-};
-
-static struct resource video_rom_resource = {
-	.name = "Video ROM",
-	.start = 0xc0000,
-	.end = 0xc7fff,
-	.flags = IORESOURCE_ROM,
-};
-
-static struct resource video_ram_resource = {
-	.name = "Video RAM area",
-	.start = 0xa0000,
-	.end = 0xbffff,
-	.flags = IORESOURCE_RAM,
-};
-
-#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
-
-static int __init romchecksum(unsigned char *rom, unsigned long length)
-{
-	unsigned char *p, sum = 0;
-
-	for (p = rom; p < rom + length; p++)
-		sum += *p;
-	return sum == 0;
-}
-
-static void __init probe_roms(void)
-{
-	unsigned long start, length, upper;
-	unsigned char *rom;
-	int	      i;
-
-	/* video rom */
-	upper = adapter_rom_resources[0].start;
-	for (start = video_rom_resource.start; start < upper; start += 2048) {
-		rom = isa_bus_to_virt(start);
-		if (!romsignature(rom))
-			continue;
-
-		video_rom_resource.start = start;
-
-		/* 0 < length <= 0x7f * 512, historically */
-		length = rom[2] * 512;
-
-		/* if checksum okay, trust length byte */
-		if (length && romchecksum(rom, length))
-			video_rom_resource.end = start + length - 1;
-
-		request_resource(&iomem_resource, &video_rom_resource);
-		break;
-			}
-
-	start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
-	if (start < upper)
-		start = upper;
-
-	/* system rom */
-	request_resource(&iomem_resource, &system_rom_resource);
-	upper = system_rom_resource.start;
-
-	/* check for extension rom (ignore length byte!) */
-	rom = isa_bus_to_virt(extension_rom_resource.start);
-	if (romsignature(rom)) {
-		length = extension_rom_resource.end - extension_rom_resource.start + 1;
-		if (romchecksum(rom, length)) {
-			request_resource(&iomem_resource, &extension_rom_resource);
-			upper = extension_rom_resource.start;
-		}
-	}
-
-	/* check for adapter roms on 2k boundaries */
-	for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper;
-	     start += 2048) {
-		rom = isa_bus_to_virt(start);
-		if (!romsignature(rom))
-			continue;
-
-		/* 0 < length <= 0x7f * 512, historically */
-		length = rom[2] * 512;
-
-		/* but accept any length that fits if checksum okay */
-		if (!length || start + length > upper || !romchecksum(rom, length))
-			continue;
-
-		adapter_rom_resources[i].start = start;
-		adapter_rom_resources[i].end = start + length - 1;
-		request_resource(&iomem_resource, &adapter_rom_resources[i]);
-
-		start = adapter_rom_resources[i++].end & ~2047UL;
-	}
-}
-
 #ifdef CONFIG_PROC_VMCORE
 /* elfcorehdr= specifies the location of elf core header
  * stored by the crashed kernel. This option will be passed
@@ -524,15 +402,11 @@ void __init setup_arch(char **cmdline_p)
 	init_apic_mappings();
 
 	/*
-	 * Request address space for all standard RAM and ROM resources
-	 * and also for regions reported as reserved by the e820.
-	 */
-	probe_roms();
+	 * We trust e820 completely. No explicit ROM probing in memory.
+ 	 */
 	e820_reserve_resources(); 
 	e820_mark_nosave_regions();
 
-	request_resource(&iomem_resource, &video_ram_resource);
-
 	{
 	unsigned i;
 	/* request I/O space for devices used on all i[345]86 PCs */
-- 
cgit v1.2.3


From 310adfdd9153f6ae818981a38a48dd2330990d8d Mon Sep 17 00:00:00 2001
From: Muli Ben-Yehuda <muli@il.ibm.com>
Date: Tue, 13 Feb 2007 13:26:24 +0100
Subject: [PATCH] x86-64: robustify bad_dma_address handling

- set bad_dma_address explicitly to 0x0
- reserve 32 pages from bad_dma_address and up
- WARN_ON() a driver feeding us bad_dma_address

Thanks to Leo Duran <leo.duran@amd.com> for the suggestion.

Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Leo Duran <leo.duran@amd.com>
Cc: Job Mason <jdmason@kudzu.us>
---
 arch/x86_64/kernel/pci-calgary.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 3d65b1d4c2b3..04480c3b68f5 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -138,6 +138,8 @@ static const unsigned long phb_debug_offsets[] = {
 
 #define PHB_DEBUG_STUFF_OFFSET	0x0020
 
+#define EMERGENCY_PAGES 32 /* = 128KB */
+
 unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
 static int translate_empty_slots __read_mostly = 0;
 static int calgary_detected __read_mostly = 0;
@@ -296,6 +298,16 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 {
 	unsigned long entry;
 	unsigned long badbit;
+	unsigned long badend;
+
+	/* were we called with bad_dma_address? */
+	badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
+	if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
+		printk(KERN_ERR "Calgary: driver tried unmapping bad DMA "
+		       "address 0x%Lx\n", dma_addr);
+		WARN_ON(1);
+		return;
+	}
 
 	entry = dma_addr >> PAGE_SHIFT;
 
@@ -656,8 +668,8 @@ static void __init calgary_reserve_regions(struct pci_dev *dev)
 	u64 start;
 	struct iommu_table *tbl = dev->sysdata;
 
-	/* reserve bad_dma_address in case it's a legal address */
-	iommu_range_reserve(tbl, bad_dma_address, 1);
+	/* reserve EMERGENCY_PAGES from bad_dma_address and up */
+	iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES);
 
 	/* avoid the BIOS/VGA first 640KB-1MB region */
 	start = (640 * 1024);
@@ -1176,6 +1188,7 @@ int __init calgary_iommu_init(void)
 	}
 
 	force_iommu = 1;
+	bad_dma_address = 0x0;
 	dma_ops = &calgary_dma_ops;
 
 	return 0;
-- 
cgit v1.2.3


From 5d0e600d903caa09e790824cc5812f0d97113b23 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 13 Feb 2007 13:26:24 +0100
Subject: [PATCH] x86: fix laptop bootup hang in init_acpi()

During kernel bootup, a new T60 laptop (CoreDuo, 32-bit) hangs about
10%-20% of the time in acpi_init():

 Calling initcall 0xc055ce1a: topology_init+0x0/0x2f()
 Calling initcall 0xc055d75e: mtrr_init_finialize+0x0/0x2c()
 Calling initcall 0xc05664f3: param_sysfs_init+0x0/0x175()
 Calling initcall 0xc014cb65: pm_sysrq_init+0x0/0x17()
 Calling initcall 0xc0569f99: init_bio+0x0/0xf4()
 Calling initcall 0xc056b865: genhd_device_init+0x0/0x50()
 Calling initcall 0xc056c4bd: fbmem_init+0x0/0x87()
 Calling initcall 0xc056dd74: acpi_init+0x0/0x1ee()

It's a hard hang that not even an NMI could punch through!  Frustratingly,
adding printks or function tracing to the ACPI code made the hangs go away
...

After some time an additional detail emerged: disabling the NMI watchdog
made these occasional hangs go away.

So i spent the better part of today trying to debug this and trying out
various theories when i finally found the likely reason for the hang: if
acpi_ns_initialize_devices() executes an _INI AML method and an NMI
happens to hit that AML execution in the wrong moment, the machine would
hang.  (my theory is that this must be some sort of chipset setup method
doing stores to chipset mmio registers?)

Unfortunately given the characteristics of the hang it was sheer
impossible to figure out which of the numerous AML methods is impacted
by this problem.

As a workaround i wrote an interface to disable chipset-based NMIs while
executing _INI sections - and indeed this fixed the hang.  I did a
boot-loop of 100 separate reboots and none hung - while without the patch
it would hang every 5-10 attempts.  Out of caution i did not touch the
nmi_watchdog=2 case (it's not related to the chipset anyway and didnt
hang).

I implemented this for both x86_64 and i686, tested the i686 laptop both
with nmi_watchdog=1 [which triggered the hangs] and nmi_watchdog=2, and
tested an Athlon64 box with the 64-bit kernel as well. Everything builds
and works with the patch applied.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Len Brown <lenb@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/i386/kernel/nmi.c          | 28 ++++++++++++++++++++++++++++
 arch/x86_64/kernel/nmi.c        | 27 +++++++++++++++++++++++++++
 drivers/acpi/namespace/nsinit.c |  9 +++++++++
 include/linux/nmi.h             |  9 ++++++++-
 4 files changed, 72 insertions(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 7d3f4e22d6fb..b11abacc5cfd 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -383,6 +383,34 @@ void enable_timer_nmi_watchdog(void)
 	}
 }
 
+static void __acpi_nmi_disable(void *__unused)
+{
+	apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+}
+
+/*
+ * Disable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_disable(void)
+{
+	if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+		on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
+}
+
+static void __acpi_nmi_enable(void *__unused)
+{
+	apic_write_around(APIC_LVT0, APIC_DM_NMI);
+}
+
+/*
+ * Enable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_enable(void)
+{
+	if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+		on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
+}
+
 #ifdef CONFIG_PM
 
 static int nmi_pm_active; /* nmi_active before suspend */
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index e59cda134166..269fe585e71e 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -368,6 +368,33 @@ void enable_timer_nmi_watchdog(void)
 	}
 }
 
+static void __acpi_nmi_disable(void *__unused)
+{
+	apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+}
+
+/*
+ * Disable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_disable(void)
+{
+	if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+		on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
+}
+
+static void __acpi_nmi_enable(void *__unused)
+{
+	apic_write(APIC_LVT0, APIC_DM_NMI);
+}
+
+/*
+ * Enable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_enable(void)
+{
+	if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+		on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
+}
 #ifdef CONFIG_PM
 
 static int nmi_pm_active; /* nmi_active before suspend */
diff --git a/drivers/acpi/namespace/nsinit.c b/drivers/acpi/namespace/nsinit.c
index 326af8fc0ce7..33db2241044e 100644
--- a/drivers/acpi/namespace/nsinit.c
+++ b/drivers/acpi/namespace/nsinit.c
@@ -45,6 +45,7 @@
 #include <acpi/acnamesp.h>
 #include <acpi/acdispat.h>
 #include <acpi/acinterp.h>
+#include <linux/nmi.h>
 
 #define _COMPONENT          ACPI_NAMESPACE
 ACPI_MODULE_NAME("nsinit")
@@ -534,7 +535,15 @@ acpi_ns_init_one_device(acpi_handle obj_handle,
 	info->parameter_type = ACPI_PARAM_ARGS;
 	info->flags = ACPI_IGNORE_RETURN_VALUE;
 
+	/*
+	 * Some hardware relies on this being executed as atomically
+	 * as possible (without an NMI being received in the middle of
+	 * this) - so disable NMIs and initialize the device:
+	 */
+	acpi_nmi_disable();
 	status = acpi_ns_evaluate(info);
+	acpi_nmi_enable();
+
 	if (ACPI_SUCCESS(status)) {
 		walk_info->num_INI++;
 
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index acb4ed130247..29af2d5df097 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -17,8 +17,15 @@
 #ifdef ARCH_HAS_NMI_WATCHDOG
 #include <asm/nmi.h>
 extern void touch_nmi_watchdog(void);
+extern void acpi_nmi_disable(void);
+extern void acpi_nmi_enable(void);
 #else
-# define touch_nmi_watchdog() touch_softlockup_watchdog()
+static inline void touch_nmi_watchdog(void)
+{
+	touch_softlockup_watchdog();
+}
+static inline void acpi_nmi_disable(void) { }
+static inline void acpi_nmi_enable(void) { }
 #endif
 
 #ifndef trigger_all_cpu_backtrace
-- 
cgit v1.2.3


From ee4eff6ff6cbfc8ce38131058a18802bf6206879 Mon Sep 17 00:00:00 2001
From: Benjamin Romer <benjamin.romer@unisys.com>
Date: Tue, 13 Feb 2007 13:26:25 +0100
Subject: [PATCH] x86-64: update IO-APIC dest field to 8-bit for xAPIC

On the Unisys ES7000/ONE system, we encountered a problem where performing
a kexec reboot or dump on any cell other than cell 0 causes the system
timer to stop working, resulting in a hang during timer calibration in the
new kernel.

We traced the problem to one line of code in disable_IO_APIC(), which needs
to restore the timer's IO-APIC configuration before rebooting.  The code is
currently using the 4-bit physical destination field, rather than using the
8-bit logical destination field, and it cuts off the upper 4 bits of the
timer's APIC ID.  If we change this to use the logical destination field,
the timer works and we can kexec on the upper cells.  This was tested on
two different cells (0 and 2) in an ES7000/ONE system.

For reference, the relevant Intel xAPIC spec is kept at
ftp://download.intel.com/design/chipsets/e8501/datashts/30962001.pdf,
specifically on page 334.

Signed-off-by: Benjamin M Romer <benjamin.romer@unisys.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86_64/kernel/io_apic.c | 24 +++++++++++-------------
 include/asm-x86_64/io_apic.h | 14 ++------------
 2 files changed, 13 insertions(+), 25 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 6be6730acb5c..566e64d966c4 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -831,7 +831,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
 	entry.delivery_mode = INT_DELIVERY_MODE;
 	entry.dest_mode = INT_DEST_MODE;
 	entry.mask = 0;				/* enable IRQ */
-	entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+	entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
 
 	entry.trigger = irq_trigger(idx);
 	entry.polarity = irq_polarity(idx);
@@ -839,7 +839,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
 	if (irq_trigger(idx)) {
 		entry.trigger = 1;
 		entry.mask = 1;
-		entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+		entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
 	}
 
 	if (!apic && !IO_APIC_IRQ(irq))
@@ -851,7 +851,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
 		if (vector < 0)
 			return;
 
-		entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
+		entry.dest = cpu_mask_to_apicid(mask);
 		entry.vector = vector;
 
 		ioapic_register_intr(irq, vector, IOAPIC_AUTO);
@@ -920,7 +920,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
 	 */
 	entry.dest_mode = INT_DEST_MODE;
 	entry.mask = 0;					/* unmask IRQ now */
-	entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+	entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
 	entry.delivery_mode = INT_DELIVERY_MODE;
 	entry.polarity = 0;
 	entry.trigger = 0;
@@ -1020,18 +1020,17 @@ void __apicdebuginit print_IO_APIC(void)
 
 	printk(KERN_DEBUG ".... IRQ redirection table:\n");
 
-	printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
-			  " Stat Dest Deli Vect:   \n");
+	printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
+			  " Stat Dmod Deli Vect:   \n");
 
 	for (i = 0; i <= reg_01.bits.entries; i++) {
 		struct IO_APIC_route_entry entry;
 
 		entry = ioapic_read_entry(apic, i);
 
-		printk(KERN_DEBUG " %02x %03X %02X  ",
+		printk(KERN_DEBUG " %02x %03X ",
 			i,
-			entry.dest.logical.logical_dest,
-			entry.dest.physical.physical_dest
+			entry.dest
 		);
 
 		printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
@@ -1293,8 +1292,7 @@ void disable_IO_APIC(void)
 		entry.dest_mode       = 0; /* Physical */
 		entry.delivery_mode   = dest_ExtINT; /* ExtInt */
 		entry.vector          = 0;
-		entry.dest.physical.physical_dest =
-					GET_APIC_ID(apic_read(APIC_ID));
+		entry.dest          = GET_APIC_ID(apic_read(APIC_ID));
 
 		/*
 		 * Add it to the IO-APIC irq-routing table:
@@ -1556,7 +1554,7 @@ static inline void unlock_ExtINT_logic(void)
 
 	entry1.dest_mode = 0;			/* physical delivery */
 	entry1.mask = 0;			/* unmask IRQ now */
-	entry1.dest.physical.physical_dest = hard_smp_processor_id();
+	entry1.dest = hard_smp_processor_id();
 	entry1.delivery_mode = dest_ExtINT;
 	entry1.polarity = entry0.polarity;
 	entry1.trigger = 0;
@@ -2131,7 +2129,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p
 
 	entry.delivery_mode = INT_DELIVERY_MODE;
 	entry.dest_mode = INT_DEST_MODE;
-	entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
+	entry.dest = cpu_mask_to_apicid(mask);
 	entry.trigger = triggering;
 	entry.polarity = polarity;
 	entry.mask = 1;					 /* Disabled (masked) */
diff --git a/include/asm-x86_64/io_apic.h b/include/asm-x86_64/io_apic.h
index 561ecbfd4cb5..f4fb238c89f1 100644
--- a/include/asm-x86_64/io_apic.h
+++ b/include/asm-x86_64/io_apic.h
@@ -85,18 +85,8 @@ struct IO_APIC_route_entry {
 		mask		:  1,	/* 0: enabled, 1: disabled */
 		__reserved_2	: 15;
 
-	union {		struct { __u32
-					__reserved_1	: 24,
-					physical_dest	:  4,
-					__reserved_2	:  4;
-			} physical;
-
-			struct { __u32
-					__reserved_1	: 24,
-					logical_dest	:  8;
-			} logical;
-	} dest;
-
+	__u32	__reserved_3	: 24,
+		dest		:  8;
 } __attribute__ ((packed));
 
 /*
-- 
cgit v1.2.3


From 3e94fb8f54c5305ed472e0867cd67d53e05bfb64 Mon Sep 17 00:00:00 2001
From: Roland Dreier <rdreier@cisco.com>
Date: Tue, 13 Feb 2007 13:26:25 +0100
Subject: [PATCH] x86-64: avoid warning message livelock

I've seen my box paralyzed by an endless spew of

    rtc: lost some interrupts at 1024Hz.

messages on the serial console.  What seems to be happening is that
something real causes an interrupt to be lost and triggers the
message.  But then printing the message to the serial console (from
the hpet interrupt handler) takes more than 1/1024th of a second, and
then some more interrupts are lost, so the message triggers again....

Fix this by adding a printk_ratelimit() before printing the warning.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/time.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 9c7fba30dac1..3cc6886f1fb7 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -1226,8 +1226,9 @@ static void hpet_rtc_timer_reinit(void)
 		if (PIE_on)
 			PIE_count += lost_ints;
 
-		printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
-		       hpet_rtc_int_freq);
+		if (printk_ratelimit())
+			printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
+			       hpet_rtc_int_freq);
 	}
 }
 
-- 
cgit v1.2.3


From 2fb12a9bca5ad9aa6dcd2c639b4a7656a8843ef8 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 13 Feb 2007 13:26:25 +0100
Subject: [PATCH] x86-64: survive having no irq mapping for a vector

Occasionally the kernel has bugs that result in no irq being found for a
given cpu vector.  If we acknowledge the irq the system has a good chance
of continuing even though we dropped an irq message.  If we continue to
simply print a message and not acknowledge the irq the system is likely to
become non-responsive shortly there after.

AK: Fixed compilation for UP kernels

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: "Luigi Genoni" <luigi.genoni@pirelli.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86_64/kernel/irq.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 0c06af6c13bc..3bc30d2c13d3 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -18,6 +18,7 @@
 #include <asm/uaccess.h>
 #include <asm/io_apic.h>
 #include <asm/idle.h>
+#include <asm/smp.h>
 
 atomic_t irq_err_count;
 
@@ -120,9 +121,14 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
 
 	if (likely(irq < NR_IRQS))
 		generic_handle_irq(irq);
-	else if (printk_ratelimit())
-		printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
-			__func__, smp_processor_id(), vector);
+	else {
+		if (!disable_apic)
+			ack_APIC_irq();
+
+		if (printk_ratelimit())
+			printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
+				__func__, smp_processor_id(), vector);
+	}
 
 	irq_exit();
 
-- 
cgit v1.2.3


From f790cd30d002949a12623b2a8cec4d4e5a8887ef Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 13 Feb 2007 13:26:25 +0100
Subject: [PATCH] x86: Add new CPUID bits for AMD Family 10 CPUs in
 /proc/cpuinfo

Just various new acronyms. The new popcnt bit is in the middle
of Intel space. This looks a little weird, but I've been assured
it's ok.

Also I fixed RDTSCP for i386 which was at the wrong place.

For i386 and x86-64.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/cpu/proc.c | 14 +++++++++-----
 arch/x86_64/kernel/setup.c  | 14 ++++++++++----
 2 files changed, 19 insertions(+), 9 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index 6624d8583c42..47e3ebbfb28d 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -29,7 +29,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
-		NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow",
+		NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", "3dnowext", "3dnow",
 
 		/* Transmeta-defined */
 		"recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
@@ -47,7 +47,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		/* Intel-defined (#2) */
 		"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
 		"tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
-		NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
+		NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt",
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 
 		/* VIA/Cyrix/Centaur-defined */
@@ -57,8 +57,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 
 		/* AMD-defined (#2) */
-		"lahf_lm", "cmp_legacy", "svm", NULL, "cr8legacy", NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+		"lahf_lm", "cmp_legacy", "svm", "extapic", "cr8legacy", "abm",
+		"sse4a", "misalignsse",
+		"3dnowprefetch", "osvw", "ibs", NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 	};
@@ -69,8 +70,11 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		"ttp",  /* thermal trip */
 		"tm",
 		"stc",
+		"100mhzsteps",
+		"hwpstate",
 		NULL,
-		/* nothing */	/* constant_tsc - moved to flags */
+		NULL,	/* constant_tsc - moved to flags */
+		/* nothing */
 	};
 	struct cpuinfo_x86 *c = v;
 	int i, n = c - cpu_data;
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index e82c1a155af5..13daaf359d21 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -942,7 +942,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
-		NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow",
+		NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
+		"3dnowext", "3dnow",
 
 		/* Transmeta-defined */
 		"recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
@@ -960,7 +961,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		/* Intel-defined (#2) */
 		"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
 		"tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
-		NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
+		NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt",
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 
 		/* VIA/Cyrix/Centaur-defined */
@@ -970,8 +971,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 
 		/* AMD-defined (#2) */
-		"lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+		"lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy",
+		"altmovcr8", "abm", "sse4a",
+		"misalignsse", "3dnowprefetch",
+		"osvw", "ibs", NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 	};
@@ -982,6 +985,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		"ttp",  /* thermal trip */
 		"tm",
 		"stc",
+		"100mhzsteps",
+		"hwpstate",
+		NULL,	/* tsc invariant mapped to constant_tsc */
 		NULL,
 		/* nothing */	/* constant_tsc - moved to flags */
 	};
-- 
cgit v1.2.3


From 0a4599c894d880763eec6cb93f6c246dac6c3269 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 13 Feb 2007 13:26:25 +0100
Subject: [PATCH] x86: Enable NMI watchdog for AMD Family 0x10 CPUs

For i386/x86-64.

Straight forward -- just reuse the Family 0xf code.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/nmi.c   | 6 ++++--
 arch/x86_64/kernel/nmi.c | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index b11abacc5cfd..5d8a07c20281 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -185,7 +185,8 @@ static __cpuinit inline int nmi_known_cpu(void)
 {
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_AMD:
-		return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
+		return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)
+			|| (boot_cpu_data.x86 == 16));
 	case X86_VENDOR_INTEL:
 		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 			return 1;
@@ -817,7 +818,8 @@ void setup_apic_nmi_watchdog (void *unused)
 	if (nmi_watchdog == NMI_LOCAL_APIC) {
 		switch (boot_cpu_data.x86_vendor) {
 		case X86_VENDOR_AMD:
-			if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
+			if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
+				boot_cpu_data.x86 != 16)
 				return;
 			if (!setup_k7_watchdog())
 				return;
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 269fe585e71e..486f4c61a948 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -172,7 +172,7 @@ static __cpuinit inline int nmi_known_cpu(void)
 {
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_AMD:
-		return boot_cpu_data.x86 == 15;
+		return boot_cpu_data.x86 == 15 || boot_cpu_data.x86 == 16;
 	case X86_VENDOR_INTEL:
 		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 			return 1;
-- 
cgit v1.2.3


From 62cc49396e593dd71c6595302bb10b085aefbfa5 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 13 Feb 2007 13:26:26 +0100
Subject: [PATCH] x86: Unify pcspeaker platform device code between i386/x86-64

Trivial cleanup.

Only change is that it is always compiled in now on x86-64 like on i386.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/Makefile    |  1 +
 arch/i386/kernel/pcspeaker.c | 20 ++++++++++++++++++++
 arch/i386/kernel/setup.c     | 26 --------------------------
 arch/x86_64/kernel/Makefile  |  2 ++
 arch/x86_64/kernel/setup.c   | 20 --------------------
 5 files changed, 23 insertions(+), 46 deletions(-)
 create mode 100644 arch/i386/kernel/pcspeaker.c

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 223969377de4..cbe4e601885c 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_K8_NB)		+= k8.o
 
 obj-$(CONFIG_VMI)		+= vmi.o vmitime.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
+obj-y				+= pcspeaker.o
 
 EXTRA_AFLAGS   := -traditional
 
diff --git a/arch/i386/kernel/pcspeaker.c b/arch/i386/kernel/pcspeaker.c
new file mode 100644
index 000000000000..bc1f2d3ea277
--- /dev/null
+++ b/arch/i386/kernel/pcspeaker.c
@@ -0,0 +1,20 @@
+#include <linux/platform_device.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+
+static __init int add_pcspkr(void)
+{
+	struct platform_device *pd;
+	int ret;
+
+	pd = platform_device_alloc("pcspkr", -1);
+	if (!pd)
+		return -ENOMEM;
+
+	ret = platform_device_add(pd);
+	if (ret)
+		platform_device_put(pd);
+
+	return ret;
+}
+device_initcall(add_pcspkr);
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index bd8c218d94af..122623dcc6e1 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -33,7 +33,6 @@
 #include <linux/initrd.h>
 #include <linux/bootmem.h>
 #include <linux/seq_file.h>
-#include <linux/platform_device.h>
 #include <linux/console.h>
 #include <linux/mca.h>
 #include <linux/root_dev.h>
@@ -660,28 +659,3 @@ void __init setup_arch(char **cmdline_p)
 #endif
 	tsc_init();
 }
-
-static __init int add_pcspkr(void)
-{
-	struct platform_device *pd;
-	int ret;
-
-	pd = platform_device_alloc("pcspkr", -1);
-	if (!pd)
-		return -ENOMEM;
-
-	ret = platform_device_add(pd);
-	if (ret)
-		platform_device_put(pd);
-
-	return ret;
-}
-device_initcall(add_pcspkr);
-
-/*
- * Local Variables:
- * mode:c
- * c-file-style:"k&r"
- * c-basic-offset:8
- * End:
- */
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 3c7cbff04d3d..ae399458024b 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_PCI)		+= early-quirks.o
 
 obj-y				+= topology.o
 obj-y				+= intel_cacheinfo.o
+obj-y				+= pcspeaker.o
 
 CFLAGS_vsyscall.o		:= $(PROFILING) -g0
 
@@ -56,3 +57,4 @@ quirks-y			+= ../../i386/kernel/quirks.o
 i8237-y				+= ../../i386/kernel/i8237.o
 msr-$(subst m,y,$(CONFIG_X86_MSR))  += ../../i386/kernel/msr.o
 alternative-y			+= ../../i386/kernel/alternative.o
+pcspeaker-y			+= ../../i386/kernel/pcspeaker.o
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 13daaf359d21..3d98b696881d 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -1104,23 +1104,3 @@ struct seq_operations cpuinfo_op = {
 	.stop =	c_stop,
 	.show =	show_cpuinfo,
 };
-
-#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE)
-#include <linux/platform_device.h>
-static __init int add_pcspkr(void)
-{
-	struct platform_device *pd;
-	int ret;
-
-	pd = platform_device_alloc("pcspkr", -1);
-	if (!pd)
-		return -ENOMEM;
-
-	ret = platform_device_add(pd);
-	if (ret)
-		platform_device_put(pd);
-
-	return ret;
-}
-device_initcall(add_pcspkr);
-#endif
-- 
cgit v1.2.3